Allow target function to output columns already present in the data set #3214

Merged · 5 commits · May 13, 2024
174 changes: 118 additions & 56 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -2,9 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import inspect
import os
import re
import tempfile
from typing import Any, Callable, Dict, Optional, Set, Tuple

import pandas as pd
@@ -139,19 +137,29 @@ def _apply_target_to_data(
# Remove input and output prefix
prefix = "outputs."
rename_dict = {col: col[len(prefix):] for col in target_output.columns if col.startswith(prefix)}
# Sometimes user data may contain a column with the same name as one generated by the target.
# In this case we will not rename the column.
generated_columns = set(rename_dict.values())
tgt_out_and_input = set()
for col in initial_data.columns:
if col in generated_columns:
tgt_out = f'{prefix}{col}'
del rename_dict[tgt_out]
tgt_out_and_input.add(tgt_out)
tgt_out_and_input.add(col)
# Sort output by line numbers
target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
target_output.sort_index(inplace=True)
target_output.reset_index(inplace=True, drop=False)
# target_output contains only the input columns taken by the function,
# so we need to concatenate it to the input data frame.
drop_columns = set(target_output.columns) - set(rename_dict.keys())
drop_columns = set(target_output.columns) - set(rename_dict.keys()) - tgt_out_and_input
target_output.drop(drop_columns, inplace=True, axis=1)
# Remove outputs. prefix
target_output.rename(columns=rename_dict, inplace=True)
# Concatenate output to input
target_output = pd.concat([target_output, initial_data], axis=1)
return target_output, set(rename_dict.values()), run
return target_output, generated_columns, run
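
To illustrate the collision handling above, here is a minimal, self-contained sketch (the column names and sample values are hypothetical, not taken from the PR): a target-produced column keeps its `outputs.` prefix when the input data already contains a column of the same name, while non-colliding columns have the prefix stripped.

```python
import pandas as pd

# Hypothetical input data that already contains a "question" column.
initial_data = pd.DataFrame({"question": ["Q1"], "ground_truth": ["GT1"]})
# Hypothetical target output: the target rewrote "question" and produced "answer".
target_output = pd.DataFrame({"outputs.question": ["rewritten Q1"], "outputs.answer": ["A1"]})

prefix = "outputs."
rename_dict = {col: col[len(prefix):] for col in target_output.columns if col.startswith(prefix)}
generated_columns = set(rename_dict.values())

# Columns that collide with the input keep their "outputs." prefix.
for col in initial_data.columns:
    if col in generated_columns:
        del rename_dict[f"{prefix}{col}"]

merged = pd.concat([target_output.rename(columns=rename_dict), initial_data], axis=1)
print(list(merged.columns))  # ['outputs.question', 'answer', 'question', 'ground_truth']
```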


def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
@@ -163,17 +171,34 @@ def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace

if mapping_config:
column_mapping = {}
columns_to_drop = set()
pattern_prefix = "data."
run_outputs_prefix = "run.outputs."

for map_to_key, map_value in mapping_config.items():
match = re.search(r"^\${([^{}]+)}$", map_value)
if match is not None:
pattern = match.group(1)
if pattern.startswith(pattern_prefix):
map_from_key = pattern.split(pattern_prefix)[1]
column_mapping[map_from_key] = map_to_key

result_df = source_df.rename(columns=column_mapping, inplace=inplace)
column_mapping[pattern[len(pattern_prefix):]] = map_to_key
elif pattern.startswith(run_outputs_prefix):
map_from_key = pattern[len(run_outputs_prefix):]
col_outputs = f'outputs.{map_from_key}'
# If the data set already had a target-generated column before the target was
# applied, that column will have the "outputs." prefix. We will use the
# target-generated column for validation.
if col_outputs in source_df.columns:
# If column needs to be mapped to already existing column.
if map_to_key in source_df.columns:
columns_to_drop.add(map_to_key)
column_mapping[col_outputs] = map_to_key
else:
column_mapping[map_from_key] = map_to_key
# If a column that would otherwise be dropped can actually be renamed,
# we do not drop it.
columns_to_drop = columns_to_drop - set(column_mapping.keys())
result_df = source_df.drop(columns=columns_to_drop, inplace=inplace)
result_df.rename(columns=column_mapping, inplace=True)

return result_df
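
As a rough illustration of the pattern matching above (the evaluator parameter and column names are made up), a mapping value such as `${run.outputs.question}` resolves to the prefixed column `outputs.question` when that column exists in the data frame, and falls back to the bare column name otherwise:

```python
import re

def resolve_mapping(mapping_config, existing_columns):
    """Sketch of how a ${data.*} / ${run.outputs.*} value selects a source column."""
    column_mapping = {}
    for map_to_key, map_value in mapping_config.items():
        match = re.search(r"^\${([^{}]+)}$", map_value)
        if match is None:
            continue
        pattern = match.group(1)
        if pattern.startswith("data."):
            column_mapping[pattern[len("data."):]] = map_to_key
        elif pattern.startswith("run.outputs."):
            source = pattern[len("run.outputs."):]
            prefixed = f"outputs.{source}"
            # Prefer the target-generated, prefixed column when it is present.
            column_mapping[prefixed if prefixed in existing_columns else source] = map_to_key
    return column_mapping

print(resolve_mapping({"question": "${run.outputs.question}"}, {"outputs.question", "answer"}))
# {'outputs.question': 'question'}
```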

@@ -199,12 +224,39 @@ def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]):
"Ensure only ${target.} and ${data.} are used."
)

# Replace ${target.} with ${data.}
processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${data.")
# Replace ${target.} with ${run.outputs.}
processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${run.outputs.")

return processed_config
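
For example (the evaluator name and column are hypothetical), a user-facing mapping of `${target.question}` is rewritten so that the evaluator run reads the column directly from the target run's outputs:

```python
# Hypothetical user-supplied configuration.
evaluator_config = {"question_ev": {"question": "${target.question}"}}

processed = {
    evaluator: {k: v.replace("${target.", "${run.outputs.") for k, v in mapping.items()}
    for evaluator, mapping in evaluator_config.items()
}
print(processed)  # {'question_ev': {'question': '${run.outputs.question}'}}
```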


def _rename_columns_maybe(df: pd.DataFrame, target_generated: Set[str]):
"""
Change the column names of the data frame. The change happens in place.

Columns that already have the "outputs." prefix are left unchanged. The "outputs."
prefix is added to columns in the target_generated set; all other columns get the
"inputs." prefix.
:param df: The data frame to apply changes to.
:param target_generated: The columns generated by the target.
:return: The changed data frame.
"""
rename_dict = {}
for col in df.columns:
outputs_col = f'outputs.{col}'
# Do not rename columns that already carry the "outputs." prefix for a target-generated column.
if 'outputs.' in col and col[len('outputs.'):] in target_generated:
continue
# If the data frame already contains outputs.{col}, the un-prefixed column is
# actually the original input; otherwise add the "outputs." prefix.
if col in target_generated and outputs_col not in df.columns:
rename_dict[col] = outputs_col
else:
rename_dict[col] = f'inputs.{col}'
df.rename(columns=rename_dict, inplace=True)
return df
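
A small usage sketch of the renaming rules described in the docstring (column names are hypothetical): a column already prefixed with `outputs.` for a target-generated name is kept, an un-prefixed column that collides with a target output becomes an input, and a non-colliding target-generated column gains the `outputs.` prefix.

```python
import pandas as pd

# Hypothetical frame after the target has run: "question" collided with the input,
# so the target-generated copy kept its "outputs." prefix; "answer" did not collide.
df = pd.DataFrame({
    "question": ["Q1"],
    "outputs.question": ["rewritten Q1"],
    "answer": ["A1"],
    "ground_truth": ["GT1"],
})
target_generated = {"question", "answer"}

rename_dict = {}
for col in df.columns:
    if col.startswith("outputs.") and col[len("outputs."):] in target_generated:
        continue  # already marked as a target output
    if col in target_generated and f"outputs.{col}" not in df.columns:
        rename_dict[col] = f"outputs.{col}"
    else:
        rename_dict[col] = f"inputs.{col}"

print(list(df.rename(columns=rename_dict).columns))
# ['inputs.question', 'outputs.question', 'outputs.answer', 'inputs.ground_truth']
```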


def evaluate(
*,
evaluation_name: Optional[str] = None,
@@ -256,60 +308,70 @@ def evaluate(
if data is not None and target is not None:
input_data_df, target_generated_columns, target_run = _apply_target_to_data(target, data, pf_client,
input_data_df, evaluation_name)

# Make sure the default is always present in the configuration.
if not evaluator_config:
evaluator_config = {}
if 'default' not in evaluator_config:
evaluator_config['default'] = {}

for evaluator_name, mapping in evaluator_config.items():
mapped_to_values = set(mapping.values())
for col in target_generated_columns:
# If the user defined the mapping differently, do not change it.
# If it was mapped to the target, we have already changed it
# in _process_evaluator_config.
run_output = f'${{run.outputs.{col}}}'
# We will add our mapping only if the
# customer did not map the target output themselves.
if col not in mapping and run_output not in mapped_to_values:
evaluator_config[evaluator_name][col] = run_output

Comment on lines +318 to +329
Member:
Not sure why we would need the code here.

Contributor Author:

In this case we are making sure we map all the outputs of the target function to the appropriate columns in the evaluators' input; however, we should not do that if the customer has mapped this output to something else.
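
A minimal sketch of what that block does (the evaluator and column names here are illustrative): for every target-generated column, a `${run.outputs.<column>}` mapping is added unless the user already mapped that column or already points another parameter at the same target output.

```python
# Hypothetical state after _process_evaluator_config has run.
evaluator_config = {"default": {}, "question_ev": {"another_question": "${run.outputs.question}"}}
target_generated_columns = {"question", "answer"}

for evaluator_name, mapping in evaluator_config.items():
    mapped_to_values = set(mapping.values())
    for col in target_generated_columns:
        run_output = f"${{run.outputs.{col}}}"
        # Only add our mapping if the customer did not map the target output themselves.
        if col not in mapping and run_output not in mapped_to_values:
            evaluator_config[evaluator_name][col] = run_output

print(evaluator_config["question_ev"])
# {'another_question': '${run.outputs.question}', 'answer': '${run.outputs.answer}'}
```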

# After all columns have been generated, we can check that we have
# everything we need for the evaluators.
_validate_columns(input_data_df, evaluators, target=None, evaluator_config=evaluator_config)

evaluator_info = {}

with tempfile.TemporaryDirectory() as d:
data_file = data
if target_generated_columns:
data_file = os.path.join(d, "input.jsonl")
input_data_df.to_json(data_file, orient="records", lines=True)

for evaluator_name, evaluator in evaluators.items():
evaluator_info[evaluator_name] = {}
evaluator_info[evaluator_name]["run"] = pf_client.run(
flow=evaluator,
run=target_run,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data_file,
stream=True,
)

evaluators_result_df = None
for evaluator_name, evaluator_info in evaluator_info.items():
evaluator_result_df = pf_client.get_details(evaluator_info["run"], all_results=True)

# drop input columns
evaluator_result_df = evaluator_result_df.drop(
columns=[col for col in evaluator_result_df.columns if col.startswith("inputs.")]
)

# Rename output columns.
# Assuming that after removing the input columns, all remaining columns are output columns.
evaluator_result_df.rename(
columns={
col: "outputs." f"{evaluator_name}.{col.replace('outputs.', '')}"
for col in evaluator_result_df.columns
},
inplace=True,
)

evaluators_result_df = (
pd.concat([evaluators_result_df, evaluator_result_df], axis=1, verify_integrity=True)
if evaluators_result_df is not None
else evaluator_result_df
)
for evaluator_name, evaluator in evaluators.items():
evaluator_info[evaluator_name] = {}
evaluator_info[evaluator_name]["run"] = pf_client.run(
flow=evaluator,
run=target_run,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
stream=True,
)

evaluators_result_df = None
for evaluator_name, evaluator_info in evaluator_info.items():
evaluator_result_df = pf_client.get_details(evaluator_info["run"], all_results=True)

# drop input columns
evaluator_result_df = evaluator_result_df.drop(
columns=[col for col in evaluator_result_df.columns if col.startswith("inputs.")]
)

# Rename output columns.
# Assuming that after removing the input columns, all remaining columns are output columns.
evaluator_result_df.rename(
columns={
col: "outputs." f"{evaluator_name}.{col.replace('outputs.', '')}"
for col in evaluator_result_df.columns
},
inplace=True,
)

evaluators_result_df = (
pd.concat([evaluators_result_df, evaluator_result_df], axis=1, verify_integrity=True)
if evaluators_result_df is not None
else evaluator_result_df
)

# Rename columns, generated by template function to outputs instead of inputs.
input_data_df.rename(
columns={
col: f"{'outputs' if col in target_generated_columns else 'inputs'}.{col}" for col in input_data_df.columns
},
inplace=True,
)
# If the target generates columns already present in the input data, those columns
# will be marked as outputs already, so we do not need to rename them.
input_data_df = _rename_columns_maybe(input_data_df, target_generated_columns)

result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
metrics = _calculate_mean(evaluators_result_df)
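
Putting the pieces together, a rough sketch (illustrative column and evaluator names) of the shape of the final merged rows: input columns carry an `inputs.` prefix, target-generated columns an `outputs.` prefix, and evaluator results an `outputs.<evaluator>.` prefix.

```python
import pandas as pd

# Illustrative frames standing in for input_data_df and evaluators_result_df.
input_data_df = pd.DataFrame({"inputs.question": ["Q1"], "outputs.answer": ["A1"]})
evaluators_result_df = pd.DataFrame({"outputs.f1.f1_score": [0.5]})

result_df = pd.concat([input_data_df, evaluators_result_df], axis=1, verify_integrity=True)
print(list(result_df.columns))
# ['inputs.question', 'outputs.answer', 'outputs.f1.f1_score']
```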
6 changes: 6 additions & 0 deletions src/promptflow-evals/tests/evals/e2etests/target_fn.py
@@ -11,3 +11,9 @@ def target_fn(question: str) -> str:
def target_fn2(question: str) -> str:
answer = target_fn(question)["answer"]
return {"response": answer}


def target_fn3(question: str) -> str:
response = target_fn(question)
response['question'] = f'The question is as follows: {question}'
return response
49 changes: 48 additions & 1 deletion src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -27,6 +27,10 @@ def answer_evaluator(answer):
return {"length": len(answer)}


def question_evaluator(question):
return {"length": len(question)}


def _get_run_from_run_history(flow_run_id, runs_operation):
"""Get run info from run history"""
token = "Bearer " + AzureCliCredential().get_token("https://management.azure.com/.default").token
@@ -132,7 +136,10 @@ def test_evaluate_with_target(self, questions_file):
result = evaluate(
data=questions_file,
target=target_fn,
evaluators={"answer": answer_evaluator, "f1": f1_score_eval},
evaluators={
"answer": answer_evaluator,
"f1": f1_score_eval
},
)
row_result_df = pd.DataFrame(result["rows"])
assert "outputs.answer" in row_result_df.columns
@@ -141,6 +148,46 @@
assert "outputs.f1.f1_score" in row_result_df.columns
assert not any(np.isnan(f1) for f1 in row_result_df["outputs.f1.f1_score"])

@pytest.mark.parametrize(
'evaluation_config',
[
None,
{"default": {}},
{"default": {}, 'question_ev': {}},
{"default": {'question': '${target.question}'}},
{"default": {'question': '${data.question}'}},
{"default": {}, 'question_ev': {'question': '${data.question}'}},
{"default": {}, 'question_ev': {'question': '${target.question}'}},
{"default": {}, 'question_ev': {'another_question': '${target.question}'}},
{"default": {'another_question': '${target.question}'}},
])
def test_evaluate_another_questions(self, questions_file, evaluation_config):
"""Test evaluation with target function."""
from .target_fn import target_fn3
# run the evaluation with targets
result = evaluate(
target=target_fn3,
data=questions_file,
evaluators={
"question_ev": question_evaluator,
},
evaluator_config=evaluation_config
)
row_result_df = pd.DataFrame(result["rows"])
assert "outputs.answer" in row_result_df.columns
assert "inputs.question" in row_result_df.columns
assert "outputs.question" in row_result_df.columns
assert "outputs.question_ev.length" in row_result_df.columns
question = "outputs.question"

mapping = None
if evaluation_config:
mapping = evaluation_config.get('question_ev', evaluation_config.get("default", None))
if mapping and ('another_question' in mapping or mapping['question'] == '${data.question}'):
question = "inputs.question"
expected = list(row_result_df[question].str.len())
assert expected == list(row_result_df['outputs.question_ev.length'])

@pytest.mark.parametrize(
"evaluate_config",
[
@@ -0,0 +1,3 @@
{"question":"How long is flight from Earth to LV-426?","answer":"There is nothing good there.", "ground_truth": "Far away.", "outputs.question": "The question is as follows: How long is flight from Earth to LV-426?"}
{"question":"Why there is no central heating on the street?","answer":"There is no central heating on the streets today, but it will be, I promise.", "ground_truth": "It is expensive.", "outputs.question": "The question is as follows: Why there is no central heating on the street?"}
{"question":"Why these questions are so strange?","answer":"The life is strange...", "ground_truth": "The life is strange...", "outputs.question": "The question is as follows: Why these questions are so strange?"}
@@ -0,0 +1,3 @@
{"question":"How long is flight from Earth to LV-426?","ground_truth":"Far away."}
{"question":"Why there is no central heating on the street?","ground_truth":"It is expensive."}
{"question":"Why these questions are so strange?","ground_truth":"The life is strange..."}