Offline agent eval #2607

Merged: 28 commits, merged May 13, 2024

Changes shown are from 21 commits.

Commits:
23adac1
first pass at offline agent eval integration
jluey1 Mar 26, 2024
ece7a24
Integrating AgentEval for offline scenarios
jluey1 Apr 10, 2024
3d58536
removing old changes
jluey1 Apr 10, 2024
d604f46
fixing notebook, updating docs
jluey1 Apr 17, 2024
e3cee1f
fixing subcriteria bug
jluey1 Apr 17, 2024
04f0938
updating class comment
jluey1 Apr 17, 2024
c98054e
cleaning up agent constructors
jluey1 Apr 17, 2024
2c1dd84
moving AgentEval agents to separate folder and adding a brief README
jluey1 Apr 17, 2024
a2a5d0a
fixing build breaks
jluey1 Apr 18, 2024
2d6658a
fixing formatting break
jluey1 Apr 18, 2024
5a3969a
Merge branch 'main' into offlineAgentEval
BeibinLi Apr 20, 2024
de2ae18
fixing comments
jluey1 Apr 29, 2024
489ccb2
consolidating files in the agenteval folder under contrib and cleanin…
jluey1 Apr 29, 2024
9a6ecfa
:Merge branch 'offlineAgentEval' of github.com:jluey1/autogen into of…
jluey1 Apr 29, 2024
be35fbc
fixing import ordering
jluey1 Apr 29, 2024
e8d1f59
adding basic agenteval tests and fixing criteria parsing bug
jluey1 Apr 30, 2024
7e64a96
Merge branch 'main' into offlineAgentEval
sonichi May 2, 2024
2300ea6
first try at adding openai agenteval tests to build process
jluey1 May 3, 2024
0e26a9e
merging upstream and first try at adding openai agenteval tests to bu…
jluey1 May 3, 2024
50cf0c7
adding non-openai agenteval tests to build process
jluey1 May 3, 2024
72bd361
updating test settings
jluey1 May 6, 2024
e8e9eb6
updating openai test
jluey1 May 6, 2024
289d9ed
Update test/agentchat/contrib/agent_eval/test_agent_eval.py
jluey1 May 7, 2024
6b4de8c
Update .github/workflows/contrib-openai.yml
jluey1 May 7, 2024
efe9351
Merge branch 'main' of github.com:jluey1/autogen
jluey1 May 7, 2024
71284e4
Merge branch 'main' into offlineAgentEval
jluey1 May 7, 2024
fdc1811
updating typing and converting to pydantic objects
jluey1 May 9, 2024
1aa8cee
fixing test file
jluey1 May 9, 2024
39 changes: 38 additions & 1 deletion .github/workflows/contrib-openai.yml
@@ -74,7 +74,44 @@ jobs:
with:
file: ./coverage.xml
flags: unittests

AgentEvalTest:
strategy:
matrix:
os: [ubuntu-latest]
python-version: ["3.10"]
runs-on: ${{ matrix.os }}
environment: openai1
steps:
# checkout to pr branch
- name: Checkout
uses: actions/checkout@v4
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install packages and dependencies
run: |
docker --version
python -m pip install --upgrade pip wheel
pip install -e .
python -c "import autogen"
pip install pytest-cov>=5 pytest-asyncio
- name: Coverage
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }}
run: |
coverage run -a -m pytest test/agentchat/contrib/agent_eval/test_agent_eval.py::test_generate_criteria test/agentchat/contrib/agent_eval/test_agent_eval.py::test_quantify_criteria
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: unittests
CompressionTest:
strategy:
matrix:
29 changes: 29 additions & 0 deletions .github/workflows/contrib-tests.yml
@@ -125,6 +125,35 @@ jobs:
file: ./coverage.xml
flags: unittests

AgentEvalTest:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.10"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install packages and dependencies for all tests
run: |
python -m pip install --upgrade pip wheel
pip install pytest-cov>=5
- name: Install packages and dependencies for AgentEval
run: |
pip install -e .
- name: Coverage
run: |
pytest test/agentchat/contrib/agent_eval/ --skip-openai
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: unittests

CompressionTest:
runs-on: ${{ matrix.os }}
strategy:
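To reproduce the non-OpenAI CI step above locally, one option is to invoke pytest from Python; a minimal sketch, assuming the package and pytest-cov are installed as in the workflow and that the --skip-openai flag is provided by autogen's test configuration:

import sys

import pytest

# Run the AgentEval tests without calling OpenAI, mirroring the "Coverage" step in contrib-tests.yml.
sys.exit(pytest.main(["test/agentchat/contrib/agent_eval/", "--skip-openai"]))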
7 changes: 7 additions & 0 deletions autogen/agentchat/contrib/agent_eval/README.md
@@ -0,0 +1,7 @@
Agents for running the AgentEval pipeline.

AgentEval is a process for evaluating an LLM-based system's performance on a given task.

When given a task to evaluate and a few example runs, the critic and subcritic agents create evaluation criteria for evaluating a system's solution. Once the criteria have been created, the quantifier agent can evaluate subsequent task solutions based on the generated criteria.

For more information see: [AgentEval Integration Roadmap](https://github.com/microsoft/autogen/issues/2162)
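For orientation, a minimal sketch of how the two entry points added in this PR (generate_criteria and quantify_criteria) could be wired together; the Task field names below are illustrative assumptions only, so check task.py in this PR for the actual schema:

import autogen
from autogen.agentchat.contrib.agent_eval.agent_eval import generate_criteria, quantify_criteria
from autogen.agentchat.contrib.agent_eval.task import Task

llm_config = {"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")}

# Hypothetical task definition; the field names here are assumptions for illustration only.
task = Task(
    name="Math problem solving",
    description="Given a math problem, solve it and report the final answer.",
    successful_response="<example transcript of a correct solution>",
    failed_response="<example transcript of an incorrect solution>",
)

# Step 1: the critic (and, optionally, the subcritic) proposes evaluation criteria.
criteria = generate_criteria(llm_config=llm_config, task=task, use_subcritic=False)

# Step 2: the quantifier scores a new test case against the generated criteria.
results = quantify_criteria(
    llm_config=llm_config,
    criteria=criteria,
    task=task,
    test_case={"problem": "What is 2 + 2?", "response": "4"},
    ground_truth="true",
)
print(results["estimated_performance"])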
101 changes: 101 additions & 0 deletions autogen/agentchat/contrib/agent_eval/agent_eval.py
@@ -0,0 +1,101 @@
from typing import Dict, List, Optional, Union

import autogen
from autogen.agentchat.contrib.agent_eval.criterion import Criterion
from autogen.agentchat.contrib.agent_eval.critic_agent import CriticAgent
from autogen.agentchat.contrib.agent_eval.quantifier_agent import QuantifierAgent
from autogen.agentchat.contrib.agent_eval.subcritic_agent import SubCriticAgent
from autogen.agentchat.contrib.agent_eval.task import Task


def generate_criteria(
llm_config: Optional[Union[Dict, bool]] = None,
task: Task = None,
additional_instructions: str = "",
max_round=2,
use_subcritic: bool = False,
):
"""
Creates a list of criteria for evaluating the utility of a given task.
Args:
llm_config (dict or bool): llm inference configuration.
task (Task): The task to evaluate.
additional_instructions (str): Additional instructions for the criteria agent.
max_round (int): The maximum number of rounds to run the conversation.
use_subcritic (bool): Whether to use the subcritic agent to generate subcriteria.
Returns:
list: A list of Criterion objects for evaluating the utility of the given task.
"""
critic = CriticAgent(
system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\n" + additional_instructions,
llm_config=llm_config,
)

critic_user = autogen.UserProxyAgent(
name="critic_user",
max_consecutive_auto_reply=0, # terminate without auto-reply
human_input_mode="NEVER",
code_execution_config={"use_docker": False},
)

agents = [critic_user, critic]

if use_subcritic:
subcritic = SubCriticAgent(
llm_config=llm_config,
)
agents.append(subcritic)

groupchat = autogen.GroupChat(
agents=agents, messages=[], max_round=max_round, speaker_selection_method="round_robin"
)
critic_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

critic_user.initiate_chat(critic_manager, message=task.sys_msg)
criteria = critic_user.last_message()
content = criteria["content"]
# need to strip out any extra code around the returned json
content = content[content.find("{") : content.rfind("}") + 1]
criteria = Criterion.parse_json_str(content)
return criteria


def quantify_criteria(
llm_config: Optional[Union[Dict, bool]] = None,
criteria: List[Criterion] = None,
task: Task = None,
test_case: Dict = None,
ground_truth: str = "",
):
"""
Quantifies the performance of a system using the provided criteria.
Args:
llm_config (dict or bool): llm inference configuration.
criteria ([Criterion]): A list of criteria for evaluating the utility of a given task.
task (Task): The task to evaluate.
test_case (dict): The test case to evaluate.
ground_truth (str): The ground truth for the test case.
Returns:
dict: A dictionary where the keys are the criteria and the values are the assessed performance based on the accepted values for each criterion.
"""
quantifier = QuantifierAgent(
llm_config=llm_config,
)

quantifier_user = autogen.UserProxyAgent(
name="quantifier_user",
max_consecutive_auto_reply=0, # terminate without auto-reply
human_input_mode="NEVER",
code_execution_config={"use_docker": False},
)

quantifier_user.initiate_chat( # noqa: F841
quantifier,
message=task.sys_msg
+ "Evaluation dictionary: "
+ Criterion.write_json(criteria)
+ "actual test case to evaluate: "
+ str(test_case),
)
quantified_results = quantifier_user.last_message()
return {"actual_success": ground_truth, "estimated_performance": quantified_results["content"]}
69 changes: 69 additions & 0 deletions autogen/agentchat/contrib/agent_eval/criterion.py
@@ -0,0 +1,69 @@
import json
from typing import List


class Criterion:
"""
A class that represents a criterion for agent evaluation.
"""

def __init__(self, name: str, description: str, accepted_values: List[str], sub_criteria=[]):
"""
Args:
name (str): The name of the criterion.
description (str): The description of the criterion.
accepted_values ([str]): The list of accepted values for the criterion.
sub_criteria ([Criterion]): The list of sub-criteria for the criterion.
"""
self.name = name
self.description = description
self.accepted_values = accepted_values
self.sub_criteria = sub_criteria

def to_json(self):
"""
Create a json object from the criterion.
"""
return {
self.name: {
"description": self.description,
"accepted_values": self.accepted_values,
"sub_criteria": [x.to_json() for x in self.sub_criteria],
}
}

@staticmethod
def parse_json_str(criteria: str):
"""
Create a list of Criterion objects from a json string.
Args:
criteria (str): Json string that represents the criteria
Returns:
[Criterion]: A list of Criterion objects that represents the json criteria information.
"""
criteria_list = []
parsed_json = json.loads(criteria)
for criterion_name, criterion_data in parsed_json.items():
sub_criteria = []
accepted_values = ""
if criterion_data.get("sub_criteria") is not None and len(criterion_data.get("sub_criteria")) > 0:
sub_criteria = Criterion.parse_json_str(json.dumps(criterion_data.get("sub_criteria")))
else:
accepted_values = criterion_data.get("accepted_values")
criterion = Criterion(criterion_name, criterion_data["description"], accepted_values, sub_criteria)
criteria_list.append(criterion)
return criteria_list

@staticmethod
def write_json(criteria):
"""
Create a json string from a list of Criterion objects.
Args:
criteria ([Criterion]): A list of Criterion objects.
Returns:
str: A json string that represents the list of Criterion objects.
"""
criteria_json = {}
for criterion in criteria:
criteria_json.update(criterion.to_json())
return json.dumps(criteria_json, indent=2)
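As a quick illustration of the helpers above (a minimal sketch; the criteria JSON is a made-up example of the shape the critic is prompted to produce):

from autogen.agentchat.contrib.agent_eval.criterion import Criterion

# Made-up criteria JSON following the schema described in the critic's system message.
criteria_json = '{"accuracy": {"description": "How factually correct the answer is", "accepted_values": ["poor", "fair", "good", "excellent"]}}'

criteria = Criterion.parse_json_str(criteria_json)
print(criteria[0].name, criteria[0].accepted_values)  # accuracy ['poor', 'fair', 'good', 'excellent']

# Serialize the list back to a JSON string; sub_criteria is emitted as an empty list here.
print(Criterion.write_json(criteria))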
40 changes: 40 additions & 0 deletions autogen/agentchat/contrib/agent_eval/critic_agent.py
@@ -0,0 +1,40 @@
from typing import Optional

from autogen.agentchat.conversable_agent import ConversableAgent


class CriticAgent(ConversableAgent):
"""
An agent for creating a list of criteria for evaluating the utility of a given task.
"""

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant. You suggest criteria for evaluating different tasks. They should be distinguishable, quantifiable and not redundant.
Convert the evaluation criteria into a dictionary where the keys are the criteria and the value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
Make sure "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels and "description" includes the criterion description.
Output just the criteria string you have created, no code.
"""

DEFAULT_DESCRIPTION = "An AI agent for creating list criteria for evaluating the utility of a given task."

def __init__(
self,
name="critic",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../../conversable_agent#__init__).
"""
super().__init__(
name=name,
system_message=system_message,
description=description,
**kwargs,
)
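The agent classes in this folder are thin ConversableAgent subclasses and can also be constructed directly; a minimal sketch mirroring what generate_criteria does internally (the extra instruction text is illustrative):

import autogen
from autogen.agentchat.contrib.agent_eval.critic_agent import CriticAgent

llm_config = {"config_list": autogen.config_list_from_json("OAI_CONFIG_LIST")}

# Additional instructions are appended to the default system message, as in generate_criteria.
critic = CriticAgent(
    system_message=CriticAgent.DEFAULT_SYSTEM_MESSAGE + "\nPropose at most five criteria.",
    llm_config=llm_config,
)

QuantifierAgent and SubCriticAgent below follow the same constructor pattern.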
36 changes: 36 additions & 0 deletions autogen/agentchat/contrib/agent_eval/quantifier_agent.py
@@ -0,0 +1,36 @@
from typing import Optional

from autogen.agentchat.conversable_agent import ConversableAgent


class QuantifierAgent(ConversableAgent):
"""
An agent for quantifying the performance of a system using the provided criteria.
"""

DEFAULT_SYSTEM_MESSAGE = """"You are a helpful assistant. You quantify the output of different tasks based on the given criteria.
The criterion is given in a dictionary format where each key is a dintinct criteria.
The value of each key is a dictionary as follows {"description": criteria description , "accepted_values": possible accepted inputs for this key}
You are going to quantify each of the criteria for a given task based on the task description.
Return a dictionary where the keys are the criteria and the values are the assessed performance based on accepted values for each criteria.
Return only the dictionary."""

DEFAULT_DESCRIPTION = "An AI agent for quantifing the performance of a system using the provided criteria."

def __init__(
self,
name="quantifier",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../../conversable_agent#__init__).
"""
super().__init__(name=name, system_message=system_message, description=description, **kwargs)
42 changes: 42 additions & 0 deletions autogen/agentchat/contrib/agent_eval/subcritic_agent.py
@@ -0,0 +1,42 @@
from typing import Optional

from autogen.agentchat.conversable_agent import ConversableAgent


class SubCriticAgent(ConversableAgent):
"""
An agent for creating subcriteria from a given list of criteria for evaluating the utility of a given task.
"""

DEFAULT_SYSTEM_MESSAGE = """You are a helpful assistant to the critic agent. You suggest sub criteria for evaluating different tasks based on the criteria provided by the critic agent (if you feel it is needed).
They should be distinguishable, quantifiable, and related to the overall theme of the critic's provided criteria.
You operate by taking in the description of the criteria. You then create a new key called sub criteria where you provide the sub criteria for the given criteria.
The value of the sub_criteria is a dictionary where the keys are the subcriteria and each value is as follows {"description": sub criteria description , "accepted_values": possible accepted inputs for this key}
Do this for each criteria provided by the critic (removing the criteria's accepted values). "accepted_values" include the acceptable inputs for each key that are fine-grained and preferably multi-graded levels. "description" includes the criterion description.
Once you have created the sub criteria for the given criteria, you return the json (make sure to include the contents of the critic's dictionary in the final dictionary as well).
Make sure to return a valid json and no code"""

DEFAULT_DESCRIPTION = "An AI agent for creating subcriteria from a given list of criteria."

def __init__(
self,
name="subcritic",
system_message: Optional[str] = DEFAULT_SYSTEM_MESSAGE,
description: Optional[str] = DEFAULT_DESCRIPTION,
**kwargs,
):
"""
Args:
name (str): agent name.
system_message (str): system message for the ChatCompletion inference.
Please override this attribute if you want to reprogram the agent.
description (str): The description of the agent.
**kwargs (dict): Please refer to other kwargs in
[ConversableAgent](../../conversable_agent#__init__).
"""
super().__init__(
name=name,
system_message=system_message,
description=description,
**kwargs,
)
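To make the expected output shape concrete, a hypothetical example of the nested JSON the subcritic is prompted to return (the parent criterion's accepted values are replaced by sub_criteria; all names and values here are illustrative):

# Hypothetical criteria dictionary after the subcritic pass; names and values are illustrative only.
example_with_sub_criteria = {
    "accuracy": {
        "description": "How factually correct the answer is",
        "sub_criteria": {
            "numerical_accuracy": {
                "description": "Correctness of any numeric results",
                "accepted_values": ["poor", "fair", "good", "excellent"],
            },
            "reasoning_accuracy": {
                "description": "Soundness of the intermediate reasoning steps",
                "accepted_values": ["poor", "fair", "good", "excellent"],
            },
        },
    }
}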