[Feature] Multimodal agents demo #320

Open · wants to merge 8 commits into base `master`
2 changes: 2 additions & 0 deletions camel/prompts/__init__.py
@@ -21,10 +21,12 @@
from .role_description_prompt_template import RoleDescriptionPromptTemplateDict
from .task_prompt_template import TaskPromptTemplateDict
from .prompt_templates import PromptTemplateGenerator
from .multimodal import MultiModalPrompt

__all__ = [
'TextPrompt',
'CodePrompt',
'MultiModalPrompt',
'TextPromptDict',
'AISocietyPromptTemplateDict',
'CodePromptTemplateDict',
90 changes: 90 additions & 0 deletions camel/prompts/multimodal.py
@@ -0,0 +1,90 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any, Callable, Dict, List, Union

from camel.prompts import TextPrompt

MODALITIES = ["CAMEL_IMAGE"]


def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
    r"""Default conversion: returns the text prompt and the multimodal
    information together in a dict. A model-specific conversion can be
    passed to :meth:`MultiModalPrompt.to_model_format` instead.

    Returns:
        Dict: The input format that the multimodal model can understand.
    """
    return {"text": text_prompt, "multimodal_information": modalities_dict}


class MultiModalPrompt:
    r"""A prompt that pairs a text prompt with multimodal information,
    enabling information transfer between multimodal agents.
    """

    def __init__(self, text_prompt: TextPrompt,
                 modalities: Union[List, Dict]):
        r"""Initializes the multimodal prompt.

        Args:
            text_prompt (TextPrompt): The text prompt.
            modalities (Union[List, Dict]): Either a list of supported
                modality names or a dict mapping modality names to their
                data.
        """
        # Check that every requested modality is supported.
        for modality in modalities:
            assert modality in MODALITIES, \
                f"modality {modality} not supported."

        self.text_prompt = text_prompt
        self.modalities = modalities

    def format(self, *args: Any, **kwargs: Any) -> 'MultiModalPrompt':
        r"""Formats the text prompt and the multimodal information at the
        same time. Keyword arguments matching one of this prompt's
        modalities are collected as multimodal information; all remaining
        arguments are applied to the text prompt.

        Args:
            *args (Any): Variable length argument list.
            **kwargs (Any): Arbitrary keyword arguments. Every modality
                declared at construction time must be supplied.

        Returns:
            MultiModalPrompt: The formatted multimodal prompt.
        """
        # Pop the kwargs that correspond to declared modalities.
        multimodal_info = {}
        for modality in self.modalities:
            multimodal_info[modality] = kwargs.pop(modality)

        text_prompt = self.text_prompt.format(*args, **kwargs)
        return MultiModalPrompt(text_prompt, multimodal_info)

    def to_model_format(
            self,
            method: Callable = default_to_model_format) -> Any:
        r"""Converts the prompt to an input format that a multimodal model
        can understand. Different multimodal models expect different input
        formats; pass a custom ``method`` to target a specific model. The
        default returns the text and multimodal information in a dict.

        Returns:
            Any: The input format that the multimodal model can understand.
        """
        return method(self.text_prompt, self.modalities)


# TODO: MultiModalPromptDict
159 changes: 159 additions & 0 deletions docs/get_started/multimodal_prompt.md
@@ -0,0 +1,159 @@
# Introduction to `MultiModalPrompt` Class

## Overview

The `MultiModalPrompt` class streamlines the process of creating integrated prompts for multimodal agents. By bringing together text and other modalities, it establishes a unified structure for communication.

## Supported Modalities

As of now, the class recognizes the following modality:
- `CAMEL_IMAGE`
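
Attempting to construct a prompt with an unsupported modality fails fast at initialization. A minimal sketch of this behavior (`CAMEL_AUDIO` is a hypothetical, unsupported modality name used only to demonstrate the validation):

```python
from camel.prompts import MultiModalPrompt, TextPrompt

try:
    MultiModalPrompt(
        text_prompt=TextPrompt("Transcribe the audio clip."),
        modalities=["CAMEL_AUDIO"],  # hypothetical, not in MODALITIES
    )
except AssertionError as error:
    print(error)  # modality CAMEL_AUDIO not supported.
```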

## Initialization

To initialize a `MultiModalPrompt` instance:

```python
from camel.prompts import MultiModalPrompt, TextPrompt

vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please answer the following question related to the provided image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)
```

**Arguments**:
- `text_prompt` (TextPrompt): The text-based template. It dictates the format of the text segment of the prompt.
- `modalities` (Union[List, Dict]): Either a list of modality names or a dictionary pairing modality names with their respective data. If the input is a dictionary, it should follow the pattern `{Modality Name: Modality Data}`, as in the sketch below.
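
For instance, the dictionary form attaches the modality data at construction time, so no later `format` call is needed for the image itself. A minimal sketch (the caption prompt and image path are illustrative):

```python
from camel.prompts import MultiModalPrompt, TextPrompt

caption_prompt = MultiModalPrompt(
    text_prompt=TextPrompt("Describe the image in one sentence."),
    modalities={"CAMEL_IMAGE": "examples/multimodal/camel.jpg"},
)
print(caption_prompt.to_model_format())
# {'text': 'Describe the image in one sentence.', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
```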

## Methods

### `format(*args, **kwargs) -> 'MultiModalPrompt'`

This method concurrently formats both the text and multimodal components. Once formatted, the output is a new `MultiModalPrompt` instance.

```python
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

question = "What animal is in the picture?"
image_path = "examples/multimodal/camel.jpg"

vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
# vqa_prompt is now a new MultiModalPrompt instance carrying both the formatted text and the multimodal information; calling to_model_format on it produces model input
```

### `to_model_format(method=default_to_model_format) -> Any`

Transforms the prompt to a format understood by the multimodal model.

By default, this method returns the prompt as a dictionary. However, by passing a different function as the `method` argument, the output can be adapted to various multimodal model requirements.

```python
def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
    r"""Default conversion: returns the text prompt and the multimodal
    information together in a dict.

    Returns:
        Dict: The input format that the multimodal model can understand.
    """
    return {"text": text_prompt, "multimodal_information": modalities_dict}

vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

question = "What animal is in the picture?"
image_path = "examples/multimodal/camel.jpg"

vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)


print(vqa_prompt.to_model_format(default_to_model_format))
# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
```

## Usage Examples

We provide an example in `examples/multimodal/formating_example.py` which you can run directly to see the output.

### 1. Single Image VQA (Visual Question Answering):

This example illustrates how to generate prompts for a Visual Question Answering task associated with a single image.

```python
from camel.prompts import MultiModalPrompt, TextPrompt

# Create a VQA prompt template
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

# Define questions and their respective image paths
question1 = "What animal is in the picture?"
question2 = "What is the color of the animal?"
image1_path = "examples/multimodal/camel.jpg"
image2_path = "examples/multimodal/llama.jpg"

# Format and display the prompts
vqa_prompt1 = vqa_prompt.format(Question=question1, CAMEL_IMAGE=image1_path)
vqa_prompt2 = vqa_prompt.format(Question=question2, CAMEL_IMAGE=image2_path)

print("vqa_prompt1:", vqa_prompt1.to_model_format())
# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
print("vqa_prompt2:", vqa_prompt2.to_model_format())
# {'text': 'Please answer the following question about the given image:\nQuestion: What is the color of the animal?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/llama.jpg'}}
```

### 2. Multi-Image Question with a Custom Model Input:

This showcases the creation of a prompt involving multiple images for a single question. Furthermore, it illustrates how to adapt the prompt to a model-specific format.

```python
def multi_image_input_format(text_prompt, modalities_dict):
    """
    Prepend a numbered label for each image to the text prompt.
    """
    if not isinstance(modalities_dict["CAMEL_IMAGE"], list):
        modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]

    # Build the labels in index order, then prepend them as one block.
    labels = "".join(
        f"Image {i} is <Image{i}> [Image{i}]\n"
        for i in range(len(modalities_dict["CAMEL_IMAGE"])))
    text_prompt = labels + text_prompt

    return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}

# Define the multi-image question and format the prompt
question3 = "Are the animals from the two images the same?"
multi_image_prompt = vqa_prompt.format(Question=question3, CAMEL_IMAGE=[image1_path, image2_path])

# Display the multi-image prompt and its corresponding images
model_input = multi_image_prompt.to_model_format(multi_image_input_format)

print("Prompt:", model_input["prompt"])
'''
Image 0 is <Image0> [Image0]
Image 1 is <Image1> [Image1]
Please answer the following question about the given image:
Question: Are the animals from the two images the same?
'''

print("Images:", model_input["image"])
'''
['examples/multimodal/camel.jpg', 'examples/multimodal/llama.jpg']
'''

```

## Application with different multimodal models

### LLaVA-1.5

- TODO: add examples of how to use the multimodal prompt with different multimodal models on simple VL tasks.
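
Until those examples land, here is a hedged sketch of a custom conversion targeting a LLaVA-1.5-style single-image chat template. The `USER: <image>\n... ASSISTANT:` layout and the `llava_input_format` name are illustrative assumptions, not part of this PR; verify the exact template against the checkpoint you serve.

```python
def llava_input_format(text_prompt, modalities_dict):
    """
    Sketch: wrap the text in a LLaVA-1.5-style single-image chat template.
    The "USER: <image>\n... ASSISTANT:" layout is assumed for illustration.
    """
    prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"
    return {"prompt": prompt, "image": modalities_dict["CAMEL_IMAGE"]}

# Reusing vqa_prompt1 from the usage examples above:
print(vqa_prompt1.to_model_format(llava_input_format))
# {'prompt': 'USER: <image>\nPlease answer the following question about the given image:\nQuestion: What animal is in the picture? ASSISTANT:', 'image': 'examples/multimodal/camel.jpg'}
```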
1 change: 1 addition & 0 deletions docs/index.rst
@@ -19,6 +19,7 @@ Welcome to CAMEL's documentation!
get_started/text_prompt.md
get_started/code_prompt.md
get_started/messages.md
get_started/multimodal_prompt.md

.. toctree::
:maxdepth: 1
Binary file added examples/multimodal/camel.jpg
72 changes: 72 additions & 0 deletions examples/multimodal/formating_example.py
@@ -0,0 +1,72 @@
from camel.prompts import MultiModalPrompt, TextPrompt

if __name__ == "__main__":
# example prompt for simple one image vqa, using default model input format
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"])

print("Example prompt for simple one image vqa, using default model input format:")

question1 = "What animal is in the picture?"
question2 = "What is the color of the animal?"

image1_path = "examples/multimodal/camel.jpg"
image2_path = "examples/multimodal/llama.jpg"

vqa_prompt1 = vqa_prompt.format(
Question=question1,
CAMEL_IMAGE=image1_path)
vqa_prompt2 = vqa_prompt.format(
Question=question2,
CAMEL_IMAGE=image2_path)

print("vqa_prompt1:")
print(vqa_prompt1.to_model_format())

print("vqa_prompt2:")
print(vqa_prompt2.to_model_format())

print("-" * 100)

    # Example prompt for a multiple-image question, with a custom model
    # input format.
    def multi_image_input_format(text_prompt, modalities_dict):
        r"""
        Prepend a numbered label for each image to the text prompt.
        The multi-image indexing format is taken from MMICL: Empowering
        Vision-language Model with Multi-Modal In-Context Learning;
        [Image{i}] in the prompt would be replaced by the visual prompt
        for the i-th image.

        Returns:
            dict: The input format that the multimodal model can understand.
        """
        if not isinstance(modalities_dict["CAMEL_IMAGE"], list):
            modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]

        # Build the labels in index order, then prepend them as one block.
        labels = "".join(
            f"Image {i} is <Image{i}> [Image{i}]\n"
            for i in range(len(modalities_dict["CAMEL_IMAGE"])))
        text_prompt = labels + text_prompt

        return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}

    question3 = "Are the animals from the two images the same?"
    # A custom input format can easily be applied for different VLM agents.
    multi_image_prompt = vqa_prompt.format(
        Question=question3,
        CAMEL_IMAGE=[image1_path, image2_path])

    print(
        r"Example prompt for a multiple-image question, with custom model "
        r"input format (<Image{i}> is the special token, [Image{i}] is the "
        r"image visual prompt):")
print("multi_image_prompt: \n")

model_input = multi_image_prompt.to_model_format(multi_image_input_format)

prompt = model_input["prompt"]
print(prompt)

images = model_input["image"]
print("images:")
print(images)
Binary file added examples/multimodal/llama.jpg