[Feature] Multimodal agents demo #320

Open · wants to merge 8 commits into base `master`
2 changes: 2 additions & 0 deletions camel/prompts/__init__.py
@@ -21,10 +21,12 @@
from .role_description_prompt_template import RoleDescriptionPromptTemplateDict
from .task_prompt_template import TaskPromptTemplateDict
from .prompt_templates import PromptTemplateGenerator
from .multimodal import MultiModalPrompt

__all__ = [
'TextPrompt',
'CodePrompt',
'MultiModalPrompt',
'TextPromptDict',
'AISocietyPromptTemplateDict',
'CodePromptTemplateDict',
90 changes: 90 additions & 0 deletions camel/prompts/multimodal.py
@@ -0,0 +1,90 @@
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
from typing import Any, Callable, Dict, List, Union

from camel.prompts import TextPrompt

MODALITIES = ["CAMEL_IMAGE"]


def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
    r"""Default conversion: returns the text prompt and the multimodal
    information together in a dict. A model-specific conversion can be
    passed to :meth:`MultiModalPrompt.to_model_format` instead.

    Returns:
        Dict: The input format that the multimodal model can understand.
    """
    return {"text": text_prompt, "multimodal_information": modalities_dict}


class MultiModalPrompt:
    r"""A prompt that pairs a text prompt with multimodal information,
    enabling information transfer between multimodal agents.
    """

    def __init__(self, text_prompt: TextPrompt,
                 modalities: Union[List, Dict]):
        r"""Initializes the multimodal prompt.

        Args:
            text_prompt (TextPrompt): The text prompt.
            modalities (Union[List, Dict]): Either a list of supported
                modality names or a dict mapping modality names to their
                data.
        """
        # Check that every requested modality is supported.
        for modality in modalities:
            assert modality in MODALITIES, \
                f"modality {modality} not supported."

        self.text_prompt = text_prompt
        self.modalities = modalities

    def format(self, *args: Any, **kwargs: Any) -> 'MultiModalPrompt':
        r"""Formats the text prompt and the multimodal information at the
        same time. Keyword arguments matching one of this prompt's
        modalities are collected as multimodal information; all remaining
        arguments are applied to the text prompt.

        Args:
            *args (Any): Variable length argument list.
            **kwargs (Any): Arbitrary keyword arguments. Every modality
                declared at construction time must be supplied.

        Returns:
            MultiModalPrompt: The formatted multimodal prompt.
        """
        # Pop the kwargs that correspond to declared modalities.
        multimodal_info = {}
        for modality in self.modalities:
            multimodal_info[modality] = kwargs.pop(modality)

        text_prompt = self.text_prompt.format(*args, **kwargs)
        return MultiModalPrompt(text_prompt, multimodal_info)

    def to_model_format(
            self,
            method: Callable = default_to_model_format) -> Any:
        r"""Converts the prompt to an input format that a multimodal model
        can understand. Different multimodal models expect different input
        formats; pass a custom ``method`` to target a specific model. The
        default returns the text and multimodal information in a dict.

        Returns:
            Any: The input format that the multimodal model can understand.
        """
        return method(self.text_prompt, self.modalities)


# TODO: MultiModalPromptDict
159 changes: 159 additions & 0 deletions docs/get_started/multimodal_prompt.md
@@ -0,0 +1,159 @@
# Introduction to `MultiModalPrompt` Class

## Overview

The `MultiModalPrompt` class streamlines the process of creating integrated prompts for multimodal agents. By bringing together text and other modalities, it establishes a unified structure for communication.

## Supported Modalities

As of now, the class recognizes the following modality:
- `CAMEL_IMAGE`
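
Attempting to construct a prompt with an unsupported modality fails fast at initialization. A minimal sketch of this behavior (`CAMEL_AUDIO` is a hypothetical, unsupported modality name used only to demonstrate the validation):

```python
from camel.prompts import MultiModalPrompt, TextPrompt

try:
    MultiModalPrompt(
        text_prompt=TextPrompt("Transcribe the audio clip."),
        modalities=["CAMEL_AUDIO"],  # hypothetical, not in MODALITIES
    )
except AssertionError as error:
    print(error)  # modality CAMEL_AUDIO not supported.
```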

## Initialization

To initialize a `MultiModalPrompt` instance:

```python
from camel.prompts import MultiModalPrompt, TextPrompt

vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please answer the following question related to the provided image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)
```

**Arguments**:
- `text_prompt` (TextPrompt): The text-based template. It dictates the format of the text segment of the prompt.
- `modalities` (Union[List, Dict]): Either a list of modality names or a dictionary pairing modality names with their respective data. If the input is a dictionary, it should follow the pattern `{Modality Name: Modality Data}`, as in the sketch below.
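
For instance, the dictionary form attaches the modality data at construction time, so no later `format` call is needed for the image itself. A minimal sketch (the caption prompt and image path are illustrative):

```python
from camel.prompts import MultiModalPrompt, TextPrompt

caption_prompt = MultiModalPrompt(
    text_prompt=TextPrompt("Describe the image in one sentence."),
    modalities={"CAMEL_IMAGE": "examples/multimodal/camel.jpg"},
)
print(caption_prompt.to_model_format())
# {'text': 'Describe the image in one sentence.', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
```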

## Methods

### `format(*args, **kwargs) -> 'MultiModalPrompt'`

This method concurrently formats both the text and multimodal components. Once formatted, the output is a new `MultiModalPrompt` instance.

```python
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

question = "What animal is in the picture?"
image_path = "examples/multimodal/camel.jpg"

vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
# vqa_prompt is now a new MultiModalPrompt instance carrying both the formatted text and the multimodal information; calling to_model_format on it produces model input
```

### `to_model_format(method=default_to_model_format) -> Any`

Transforms the prompt to a format understood by the multimodal model.

By default, this method returns the prompt as a dictionary. However, by passing a different function as the `method` argument, the output can be adapted to various multimodal model requirements.

```python
def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
    r"""Default conversion: returns the text prompt and the multimodal
    information together in a dict.

    Returns:
        Dict: The input format that the multimodal model can understand.
    """
    return {"text": text_prompt, "multimodal_information": modalities_dict}

vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

question = "What animal is in the picture?"
image_path = "examples/multimodal/camel.jpg"

vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)


print(vqa_prompt.to_model_format(default_to_model_format))
# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
```

## Usage Examples

We provide an example in `examples/multimodal/formating_example.py` which you can run directly to see the output.

### 1. Single Image VQA (Visual Question Answering):

This example illustrates how to generate prompts for a Visual Question Answering task associated with a single image.

```python
from camel.prompts import MultiModalPrompt, TextPrompt

# Create a VQA prompt template
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"]
)

# Define questions and their respective image paths
question1 = "What animal is in the picture?"
question2 = "What is the color of the animal?"
image1_path = "examples/multimodal/camel.jpg"
image2_path = "examples/multimodal/llama.jpg"

# Format and display the prompts
vqa_prompt1 = vqa_prompt.format(Question=question1, CAMEL_IMAGE=image1_path)
vqa_prompt2 = vqa_prompt.format(Question=question2, CAMEL_IMAGE=image2_path)

print("vqa_prompt1:", vqa_prompt1.to_model_format())
# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
print("vqa_prompt2:", vqa_prompt2.to_model_format())
# {'text': 'Please answer the following question about the given image:\nQuestion: What is the color of the animal?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/llama.jpg'}}
```

### 2. Multi-Image Question with a Custom Model Input:

This showcases the creation of a prompt involving multiple images for a single question. Furthermore, it illustrates how to adapt the prompt to a model-specific format.

```python
def multi_image_input_format(text_prompt, modalities_dict):
    """
    Prepend a numbered label for each image to the text prompt.
    """
    if not isinstance(modalities_dict["CAMEL_IMAGE"], list):
        modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]

    # Build the labels in index order, then prepend them as one block.
    labels = "".join(
        f"Image {i} is <Image{i}> [Image{i}]\n"
        for i in range(len(modalities_dict["CAMEL_IMAGE"])))
    text_prompt = labels + text_prompt

    return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}

# Define the multi-image question and format the prompt
question3 = "Are the animals from the two images the same?"
multi_image_prompt = vqa_prompt.format(Question=question3, CAMEL_IMAGE=[image1_path, image2_path])

# Display the multi-image prompt and its corresponding images
model_input = multi_image_prompt.to_model_format(multi_image_input_format)

print("Prompt:", model_input["prompt"])
'''
Image 0 is <Image0> [Image0]
Image 1 is <Image1> [Image1]
Please answer the following question about the given image:
Question: Are the animals from the two images the same?
'''

print("Images:", model_input["image"])
'''
['examples/multimodal/camel.jpg', 'examples/multimodal/llama.jpg']
'''

```

## Application with different multimodal models

### LLaVA-1.5

- TODO: add examples of how to use the multimodal prompt with different multimodal models on simple VL tasks.
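
Until those examples land, here is a hedged sketch of a custom conversion targeting a LLaVA-1.5-style single-image chat template. The `USER: <image>\n... ASSISTANT:` layout and the `llava_input_format` name are illustrative assumptions, not part of this PR; verify the exact template against the checkpoint you serve.

```python
def llava_input_format(text_prompt, modalities_dict):
    """
    Sketch: wrap the text in a LLaVA-1.5-style single-image chat template.
    The "USER: <image>\n... ASSISTANT:" layout is assumed for illustration.
    """
    prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"
    return {"prompt": prompt, "image": modalities_dict["CAMEL_IMAGE"]}

# Reusing vqa_prompt1 from the usage examples above:
print(vqa_prompt1.to_model_format(llava_input_format))
# {'prompt': 'USER: <image>\nPlease answer the following question about the given image:\nQuestion: What animal is in the picture? ASSISTANT:', 'image': 'examples/multimodal/camel.jpg'}
```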
1 change: 1 addition & 0 deletions docs/index.rst
@@ -19,6 +19,7 @@ Welcome to CAMEL's documentation!
get_started/text_prompt.md
get_started/code_prompt.md
get_started/messages.md
get_started/multimodal_prompt.md

.. toctree::
:maxdepth: 1
Binary file added examples/multimodal/camel.jpg
72 changes: 72 additions & 0 deletions examples/multimodal/formating_example.py
@@ -0,0 +1,72 @@
from camel.prompts import MultiModalPrompt, TextPrompt

if __name__ == "__main__":
# example prompt for simple one image vqa, using default model input format
vqa_prompt = MultiModalPrompt(
text_prompt=TextPrompt("Please anwser the following question about the given image:\nQuestion: {Question}"),
modalities=["CAMEL_IMAGE"])

print("Example prompt for simple one image vqa, using default model input format:")

question1 = "What animal is in the picture?"
question2 = "What is the color of the animal?"

image1_path = "examples/multimodal/camel.jpg"
image2_path = "examples/multimodal/llama.jpg"

vqa_prompt1 = vqa_prompt.format(
Question=question1,
CAMEL_IMAGE=image1_path)
vqa_prompt2 = vqa_prompt.format(
Question=question2,
CAMEL_IMAGE=image2_path)

print("vqa_prompt1:")
print(vqa_prompt1.to_model_format())

print("vqa_prompt2:")
print(vqa_prompt2.to_model_format())

print("-" * 100)

    # Example prompt for a multiple-image question, with a custom model
    # input format.
    def multi_image_input_format(text_prompt, modalities_dict):
        r"""
        Prepend a numbered label for each image to the text prompt.
        The multi-image indexing format is taken from MMICL: Empowering
        Vision-language Model with Multi-Modal In-Context Learning;
        [Image{i}] in the prompt would be replaced by the visual prompt
        for the i-th image.

        Returns:
            dict: The input format that the multimodal model can understand.
        """
        if not isinstance(modalities_dict["CAMEL_IMAGE"], list):
            modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]

        # Build the labels in index order, then prepend them as one block.
        labels = "".join(
            f"Image {i} is <Image{i}> [Image{i}]\n"
            for i in range(len(modalities_dict["CAMEL_IMAGE"])))
        text_prompt = labels + text_prompt

        return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}

    question3 = "Are the animals from the two images the same?"
    # A custom input format can easily be applied for different VLM agents.
    multi_image_prompt = vqa_prompt.format(
        Question=question3,
        CAMEL_IMAGE=[image1_path, image2_path])

    print(
        r"Example prompt for a multiple-image question, with custom model "
        r"input format (<Image{i}> is the special token, [Image{i}] is the "
        r"image visual prompt):")
print("multi_image_prompt: \n")

model_input = multi_image_prompt.to_model_format(multi_image_input_format)

prompt = model_input["prompt"]
print(prompt)

images = model_input["image"]
print("images:")
print(images)
Binary file added examples/multimodal/llama.jpg