Add llama3 and distributed checkpoint support in NeVA #9101

Merged 67 commits on May 22, 2024
Changes shown below are from 59 of the 67 commits.
f2f267d
temp save
yaoyu-33 Mar 16, 2024
b532d1b
Merge branch 'main' into yuya/neva_seq_pack
yaoyu-33 Mar 18, 2024
b0ace6b
temp save 2
yaoyu-33 Mar 18, 2024
0020fe3
update code
yaoyu-33 Mar 19, 2024
76fb748
Merge branch 'main' into yuya/neva_seq_pack
yaoyu-33 Mar 19, 2024
a8f2248
enable seq packing
yaoyu-33 Mar 19, 2024
9fab5a5
fix neva and clip
yaoyu-33 Mar 18, 2024
d8474fb
Enable parallel seq packing algo and few other fixes
yaoyu-33 Mar 21, 2024
c56ec9b
Merge branch 'main' into yuya/neva_seq_pack
yaoyu-33 Mar 22, 2024
e8a9a6d
Pipeline parallel support
yaoyu-33 Mar 25, 2024
c5ffa83
Update data preprocess
yaoyu-33 Mar 25, 2024
e11e260
Merge branch 'main' into yuya/neva_pp_support
yaoyu-33 Apr 2, 2024
2bc5d66
fix few pp issues
yaoyu-33 Apr 2, 2024
4843e54
Merge branch 'yuya/neva_seq_pack' into yuya/neva_pp_support
yaoyu-33 Apr 4, 2024
78034ce
enable sequence packing w/ PP
yaoyu-33 Apr 4, 2024
8561e60
Fix cu_seqlens in inputs
yaoyu-33 Apr 5, 2024
2ac6b27
add assert
yaoyu-33 Apr 8, 2024
d138b1e
Merge branch 'main' into yuya/neva_pp_support
yaoyu-33 Apr 16, 2024
5e6994d
Depend on PP to decide whether do padding
yaoyu-33 Apr 17, 2024
0544758
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 17, 2024
6af32af
Add docstring
yaoyu-33 Apr 17, 2024
cd513b3
Merge remote-tracking branch 'origin/yuya/neva_pp_support' into yuya/…
yaoyu-33 Apr 17, 2024
3655f7d
Fix few evaluation issues
yaoyu-33 Apr 23, 2024
f54e565
Merge branch 'main' into yuya/neva_pp_support
yaoyu-33 Apr 24, 2024
4bb0313
Fix few PP evaluation issues
yaoyu-33 Apr 24, 2024
6efa4fa
Address comments
yaoyu-33 Apr 24, 2024
9c44e30
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 24, 2024
37953d4
add llama3 template
yaoyu-33 Apr 25, 2024
56700e7
address comments
yaoyu-33 Apr 25, 2024
b5a4c27
Fix license
yaoyu-33 Apr 26, 2024
f2588a0
Merge branch 'main' into yuya/neva_pp_support
yaoyu-33 Apr 26, 2024
cd5f5f6
Merge branch 'yuya/neva_pp_support' into yuya/neva_llama3
yaoyu-33 Apr 26, 2024
8c73b6f
Fix llama3
yaoyu-33 Apr 26, 2024
4855365
Few fixes
yaoyu-33 Apr 26, 2024
45a6c61
Merge branch 'main' into yuya/neva_pp_support
yaoyu-33 Apr 29, 2024
2d58b72
Merge branch 'yuya/neva_pp_support' into yuya/neva_llama3
yaoyu-33 Apr 29, 2024
f43df1c
Few neva bugs
yaoyu-33 Apr 30, 2024
8d9f8c1
Few neva bugs
yaoyu-33 Apr 30, 2024
449a42f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 30, 2024
0380c3b
Few neva bugs
yaoyu-33 Apr 30, 2024
9cb154d
Merge branch 'yuya/neva_pp_support' into yuya/neva_llama3
yaoyu-33 Apr 30, 2024
3e8c0eb
llama3 inference fix
yaoyu-33 May 1, 2024
9d2160d
Force vision encoder to run in fp32
yaoyu-33 May 2, 2024
17173a3
Revert "Force vision encoder to run in fp32"
yaoyu-33 May 2, 2024
24f2421
Merge branch 'main' into yuya/neva_llama3
yaoyu-33 May 2, 2024
07b0721
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 2, 2024
48d298a
Try adding distributed format of checkpoint
yaoyu-33 May 3, 2024
f46a5f5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 3, 2024
9f6d98e
Allow dist checkpoint to be non-strict
yaoyu-33 May 3, 2024
3c4b5a1
Fix
yaoyu-33 May 8, 2024
3dde1eb
Some fixes for PP + dist ckpt in Neva
yaoyu-33 May 8, 2024
bbf0832
fix peft
yaoyu-33 May 10, 2024
9f5d615
few fixes for lora
yaoyu-33 May 10, 2024
cb3ae62
Merge branch 'main' into yuya/neva_llama3
yaoyu-33 May 10, 2024
4588b6a
checkpoint updates
yaoyu-33 May 10, 2024
725e353
Apply isort and black reformatting
yaoyu-33 May 10, 2024
7c38044
Merge branch 'main' into yuya/neva_llama3
yaoyu-33 May 13, 2024
9bd4929
Merge remote-tracking branch 'origin/yuya/neva_llama3' into yuya/neva…
yaoyu-33 May 13, 2024
34bff23
bug fix
yaoyu-33 May 13, 2024
bcf6385
Add neva dist checkpoint converter
yaoyu-33 May 13, 2024
24785dd
Apply isort and black reformatting
yaoyu-33 May 13, 2024
08dbfa5
resolve comments
yaoyu-33 May 14, 2024
968a75c
update neva dist ckpt apis
yaoyu-33 May 16, 2024
0b93023
Apply isort and black reformatting
yaoyu-33 May 16, 2024
5e5ad9e
fix return
yaoyu-33 May 16, 2024
8dc1142
Merge branch 'main' into yuya/neva_llama3
yaoyu-33 May 17, 2024
5450967
Merge branch 'main' into yuya/neva_llama3
yaoyu-33 May 21, 2024
@@ -20,6 +20,7 @@
from omegaconf import OmegaConf

from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam

CFG_STRING = """
trainer:
@@ -169,7 +169,7 @@ def eval_model(args):
parser.add_argument("--image-folder", type=str, default="")
parser.add_argument("--question-file", type=str, default="tables/question.json")
parser.add_argument("--answers-file", type=str, default="answer.jsonl")
-parser.add_argument("--conv-mode", type=str, default="llava_v0")
+parser.add_argument("--conv-mode", type=str, default="llava_v0")  # this flag has no use!
Collaborator comment: then should we get rid of it..?

parser.add_argument("--tp", type=int, default=1)
parser.add_argument("--pp", type=int, default=1)
parser.add_argument("--num-chunks", type=int, default=1)
61 changes: 56 additions & 5 deletions nemo/collections/multimodal/data/neva/conversation.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import dataclasses
from collections import defaultdict
from enum import Enum, auto
from typing import List

@@ -24,9 +25,14 @@
DEFAULT_SYSTEM_TOKEN = "<extra_id_0>"
DEFAULT_SEPARATOR_TOKEN = "<extra_id_1>"
DEFAULT_LABELS_TOKEN = "<extra_id_2>"
-DEFAULT_IMAGE_PATCH_TOKEN = "<extra_id_3>"
-DEFAULT_IM_START_TOKEN = "<extra_id_4>"
-DEFAULT_IM_END_TOKEN = "<extra_id_5>"
+DEFAULT_IMAGE_PATCH_TOKEN = defaultdict(lambda: "<extra_id_3>")
+DEFAULT_IM_START_TOKEN = defaultdict(lambda: "<extra_id_4>")
+DEFAULT_IM_END_TOKEN = defaultdict(lambda: "<extra_id_5>")

# Update llama3 default
DEFAULT_IMAGE_PATCH_TOKEN["llama_3"] = "<|reserved_special_token_3|>"
DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>"
DEFAULT_IM_END_TOKEN["llama_3"] = "<|reserved_special_token_5|>"
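Switching the media-token constants from plain strings to `defaultdict` lets callers index them by model type and still receive the generic `<extra_id_*>` token for any model without reserved special tokens. A minimal standalone sketch of that lookup behavior, mirroring the assignments above:

```python
from collections import defaultdict

# Fallback token for any model type not explicitly registered.
DEFAULT_IM_START_TOKEN = defaultdict(lambda: "<extra_id_4>")
# llama3 gets one of its reserved special tokens instead.
DEFAULT_IM_START_TOKEN["llama_3"] = "<|reserved_special_token_4|>"

# A registered key returns the llama3-specific token...
print(DEFAULT_IM_START_TOKEN["llama_3"])  # <|reserved_special_token_4|>
# ...while any other key silently falls back to the generic token.
print(DEFAULT_IM_START_TOKEN["llama_2"])  # <extra_id_4>
```

Because `defaultdict` inserts the fallback on first access, unknown model types never raise `KeyError`, which keeps existing call sites working unchanged.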


class SeparatorStyle(Enum):
@@ -36,6 +42,7 @@ class SeparatorStyle(Enum):
    TWO = auto()
    PLAIN = auto()
    LLAMA_2 = auto()
    LLAMA_3 = auto()
    NVGPT = auto()


@@ -109,6 +116,34 @@ def get_prompt(self):
            else:
                ret += ""
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.LLAMA_3:
            """
            <|begin_of_text|><|start_header_id|>system<|end_header_id|>

            {{ system_prompt }}<|eot_id|><|start_header_id|>user<|end_header_id|>

            {{ user_message_1 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

            {{ model_answer_1 }}<|eot_id|><|start_header_id|>user<|end_header_id|>

            {{ user_message_2 }}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
            """
            wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}"
            wrap_user = lambda msg: f"<|start_header_id|>user<|end_header_id|>\n\n{msg}"
            wrap_assistant = lambda msg: f"<|start_header_id|>assistant<|end_header_id|>\n\n{msg}"

            ret = "<|begin_of_text|>" + wrap_sys(self.system) + self.sep
            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if type(message) is tuple:
                    message, _, _ = message
                elif i % 2 == 0:
                    ret += wrap_user(message) + self.sep
                else:
                    ret += wrap_assistant(message) + (self.sep if message else "")

        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
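The new `LLAMA_3` branch can be sketched as a standalone helper. This is a minimal sketch, not the NeMo API: `build_llama3_prompt` is a hypothetical name, and the image-tuple handling and first-turn assertions from the diff are omitted for brevity.

```python
def build_llama3_prompt(system, messages, sep="<|eot_id|>"):
    """Assemble a llama3-style chat prompt from (role, message) pairs."""
    wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}"
    wrap_user = lambda msg: f"<|start_header_id|>user<|end_header_id|>\n\n{msg}"
    wrap_assistant = lambda msg: f"<|start_header_id|>assistant<|end_header_id|>\n\n{msg}"

    # Every prompt starts with the begin-of-text token and the system turn.
    ret = "<|begin_of_text|>" + wrap_sys(system) + sep
    for i, (role, message) in enumerate(messages):
        if i % 2 == 0:  # even turns: user
            ret += wrap_user(message) + sep
        else:  # odd turns: assistant; trailing sep only when a reply exists
            ret += wrap_assistant(message) + (sep if message else "")
    return ret


prompt = build_llama3_prompt("You are helpful.", [("user", "Hi"), ("assistant", "Hello!")])
print(prompt)
```

Passing an empty string as the final assistant message yields a prompt that ends with the bare assistant header, which is the usual way to hand the model a generation prompt — this is what the `(self.sep if message else "")` guard in the diff enables.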
@@ -346,8 +381,25 @@ def dict(self):
    sep2=DEFAULT_EOS_TOKEN,
)
conv_llava_llama_3 = Conversation(
    system="You are a helpful language and vision assistant. "
    "You are able to understand the visual content that the user provides, "
    "and assist the user with a variety of tasks using natural language.",
    roles=("user", "assistant"),
    version="llama_v3",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_3,
    sep="<|eot_id|>",
)

conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

conv_llava_v0 = Conversation(
@@ -416,6 +468,5 @@ def dict(self):
    "nv_dpo": conv_nv_dpo,
}


if __name__ == "__main__":
print(default_conversation.get_prompt())