Skip to content

Commit

Permalink
Make ChatEvaluator output consistent with latest chat flow (#2598)
Browse files Browse the repository at this point in the history
# Description

Please add an informative description that covers the changes made by
the pull request, and link all relevant issues.

# All Promptflow Contribution checklist:
- [ ] **The pull request does not introduce [breaking changes].**
- [ ] **CHANGELOG is updated for new features, bug fixes or other
significant changes.**
- [ ] **I have read the [contribution guidelines](../CONTRIBUTING.md).**
- [ ] **Create an issue and link to the pull request to get dedicated
review from promptflow team. Learn more: [suggested
workflow](../CONTRIBUTING.md#suggested-workflow).**

## General Guidelines and Best Practices
- [ ] Title of the pull request is clear and informative.
- [ ] There are a small number of commits, each of which have an
informative message. This means that previously merged commits do not
appear in the history of the PR. For more information on cleaning up the
commits in your PR, [see this
page](https://github.com/Azure/azure-powershell/blob/master/documentation/development-docs/cleaning-up-commits.md).

### Testing Guidelines
- [ ] Pull request includes test coverage for the included changes.

---------

Co-authored-by: Ankit Singhal <[email protected]>
Co-authored-by: Ankit Singhal <[email protected]>
Co-authored-by: Clement Wang <[email protected]>
  • Loading branch information
4 people committed Apr 3, 2024
1 parent 526bd08 commit 6b9c5c6
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 16 deletions.
56 changes: 43 additions & 13 deletions src/promptflow-evals/promptflow/evals/evaluators/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,28 +128,27 @@ def __call__(self, *, conversation: List[Dict], **kwargs):
}

for future in as_completed(future_to_evaluator):
score = future.result()
current_turn_result.update(score)
result = future.result()
current_turn_result.update(result)
else:
# Sequential execution
for evaluator in selected_evaluators:
score = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(score)
result = self._evaluate_turn(turn_num, questions, answers, contexts, evaluator)
current_turn_result.update(result)

per_turn_results.append(current_turn_result)

# Aggregate results
# Final aggregated results for a conversation will look like:
# {
# "gpt_groundedness": 0.9,
# "gpt_groundedness_per_turn": [0.9, 0.8, 0.9, ...],
# ...
# "gpt_groundedness": 2.0, # Mean of all groundedness scores
# "evaluation_per_turn": {
# "gpt_groundedness": {
# "score": [1.0, ...],
# "reason": ["reason1", ...],
# },
# },
# }
aggregated = {}
for key in per_turn_results[0].keys():
values = [d[key] for d in per_turn_results]
aggregated[key] = np.nanmean(values)
aggregated[key + "_per_turn"] = values
aggregated = self._aggregate_results(per_turn_results)

return aggregated

Expand All @@ -170,6 +169,37 @@ def _evaluate_turn(self, turn_num, questions, answers, contexts, evaluator):
f"Evaluator {evaluator.__class__.__name__} failed for turn {turn_num + 1} with exception: {e}")
return {}

def _aggregate_results(self, per_turn_results: List[Dict]):
scores = {}
reasons = {}

for turn in per_turn_results:
for metric, value in turn.items():
if 'reason' in metric:
if metric not in reasons:
reasons[metric] = []
reasons[metric].append(value)
else:
if metric not in scores:
scores[metric] = []
scores[metric].append(value)

aggregated = {}
evaluation_per_turn = {}

for metric, values in scores.items():
aggregated[metric] = np.nanmean(values)

# Prepare per-turn evaluations
evaluation_per_turn[metric] = {"score": values}
reason_key = f"{metric}_reason"
if reason_key in reasons:
evaluation_per_turn[metric]["reason"] = reasons[reason_key]

aggregated["evaluation_per_turn"] = evaluation_per_turn

return aggregated

def _validate_conversation(self, conversation: List[Dict]):
if conversation is None or not isinstance(conversation, list):
raise ValueError("'conversation' must be a list of dictionaries.")
Expand Down
6 changes: 3 additions & 3 deletions src/promptflow-evals/samples/built_in_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ def run_chat_evaluator():
]
score = chat_eval(conversation=conversation)
print(score)
# {'gpt_coherence': 5.0, 'gpt_coherence_per_turn': [5.0, 5.0], 'gpt_fluency': 5.0, 'gpt_fluency_per_turn': [5.0,
# 5.0], 'gpt_groundedness': 5.0, 'gpt_groundedness_per_turn': [5.0, 5.0], 'gpt_relevance': 5.0,
# 'gpt_relevance_per_turn': [5.0, 5.0]}
# {'gpt_fluency': 5.0, 'gpt_groundedness': 5.0, 'gpt_coherence': 5.0, 'gpt_relevance': 5.0,
# 'evaluation_per_turn': {'gpt_fluency': {'score': [5.0, 5.0]}, 'gpt_groundedness': {'score': [5.0, 5.0]},
# 'gpt_coherence': {'score': [5.0, 5.0]}, 'gpt_relevance': {'score': [5.0, 5.0]}}}


if __name__ == "__main__":
Expand Down
31 changes: 31 additions & 0 deletions src/promptflow-evals/tests/unittests/test_chat_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,34 @@ def test_conversation_validation_invalid_citations(self):
with pytest.raises(ValueError) as e:
chat_eval(conversation=conversation)
assert str(e.value) == "'citations' in context must be a list. Turn number: 2"

def test_per_turn_results_aggregation(self):
    """_aggregate_results averages scores per metric and collects the raw
    per-turn scores and reasons under "evaluation_per_turn"."""
    model_config = AzureOpenAIConnection(
        api_base="mocked_endpoint",
        api_key="mocked_key",
        api_type="azure",
    )
    chat_eval = ChatEvaluator(model_config=model_config, deployment_name="gpt-4")

    first_turn = {
        "gpt_groundedness": 1.0,
        "gpt_groundedness_reason": "reason1",
        "gpt_fluency": 2.0,
    }
    second_turn = {
        "gpt_groundedness": 3.0,
        "gpt_groundedness_reason": "reason2",
        "gpt_fluency": 4.0,
    }

    aggregated = chat_eval._aggregate_results([first_turn, second_turn])

    expected = {
        "gpt_groundedness": 2.0,
        "gpt_fluency": 3.0,
        "evaluation_per_turn": {
            "gpt_groundedness": {"score": [1.0, 3.0], "reason": ["reason1", "reason2"]},
            "gpt_fluency": {"score": [2.0, 4.0]},
        },
    }
    assert aggregated == expected

0 comments on commit 6b9c5c6

Please sign in to comment.