From b453a089eb7abb349eb50cd0985290b4037a281b Mon Sep 17 00:00:00 2001 From: vaibhav-patel Date: Mon, 22 Jun 2026 14:59:37 +0530 Subject: [PATCH 1/2] feat(evaluation): add option to save eval results to CSV Add an optional `output_file` parameter to `AgentEvaluator.evaluate` and `AgentEvaluator.evaluate_eval_set`. When set, per-invocation evaluation results for every metric (both passing and failing) are flattened and written to the given path as a CSV file, making it easy to persist and inspect results from pytest-based eval runs. The option is disabled by default, so existing behavior is unchanged. The parent directory is created if needed, and rows are appended so results from a directory of test files accumulate in a single file. CSV writing reuses the existing text/tool-call formatting helpers and relies on pandas, which is already part of the `eval` optional dependencies. Fixes #2652. --- src/google/adk/evaluation/agent_evaluator.py | 107 +++++++++- .../evaluation/test_agent_evaluator.py | 198 ++++++++++++++++++ 2 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index f52a367950f..e943e0f6047 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -113,6 +113,7 @@ async def evaluate_eval_set( num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, print_detailed_results: bool = True, + output_file: Optional[str] = None, ): """Evaluates an agent using the given EvalSet. @@ -130,6 +131,10 @@ async def evaluate_eval_set( than root agent. If left empty or none, then root agent is evaluated. print_detailed_results: Whether to print detailed results for each metric evaluation. + output_file: If provided, per-invocation evaluation results (for both + passing and failing metrics) are written to this path as a CSV file. + Disabled by default. The parent directory is created if it does not + already exist. """ if criteria: logger.warning( @@ -169,7 +174,11 @@ async def evaluate_eval_set( # test failures. We track them and then report them towards the end. failures: list[str] = [] - for _, eval_results_per_eval_id in eval_results_by_eval_id.items(): + # Optionally, we collect per-invocation results across all eval cases and + # metrics so that they can be written out to a CSV file at the end. + csv_rows: list[dict[str, Any]] = [] + + for eval_id, eval_results_per_eval_id in eval_results_by_eval_id.items(): eval_metric_results = ( AgentEvaluator._get_eval_metric_results_with_invocation( eval_results_per_eval_id @@ -183,6 +192,20 @@ async def evaluate_eval_set( failures.extend(failures_per_eval_case) + if output_file: + csv_rows.extend( + AgentEvaluator._get_results_as_rows( + eval_set_id=eval_set.eval_set_id, + eval_id=eval_id, + eval_metric_results=eval_metric_results, + ) + ) + + if output_file: + AgentEvaluator._write_results_to_csv( + rows=csv_rows, output_file=output_file + ) + failure_message = "Following are all the test failures." if not print_detailed_results: failure_message += ( @@ -200,6 +223,7 @@ async def evaluate( agent_name: Optional[str] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, + output_file: Optional[str] = None, ): """Evaluates an Agent given eval data. @@ -218,6 +242,10 @@ async def evaluate( needed by all the evals in the eval dataset. print_detailed_results: Whether to print detailed results for each metric evaluation. + output_file: If provided, per-invocation evaluation results are written to + this path as a CSV file. Disabled by default. When the eval data spans + multiple test files, results from all of them are appended to the same + file. """ test_files = [] if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir( @@ -245,6 +273,7 @@ async def evaluate( num_runs=num_runs, agent_name=agent_name, print_detailed_results=print_detailed_results, + output_file=output_file, ) @staticmethod @@ -698,3 +727,79 @@ def _process_metrics_and_get_failures( ) return failures + + @staticmethod + def _get_results_as_rows( + eval_set_id: str, + eval_id: str, + eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]], + ) -> list[dict[str, Any]]: + """Flattens eval results into one row per metric per invocation. + + The columns mirror the ones used in `_print_details`, with additional + identifier columns so that rows from different eval cases and metrics can be + distinguished within a single CSV file. + """ + rows: list[dict[str, Any]] = [] + for metric_name, results_with_invocations in eval_metric_results.items(): + for result_with_invocation in results_with_invocations: + eval_metric_result = result_with_invocation.eval_metric_result + expected_invocation = result_with_invocation.expected_invocation + actual_invocation = result_with_invocation.actual_invocation + rows.append({ + "eval_set_id": eval_set_id, + "eval_id": eval_id, + "metric_name": metric_name, + "threshold": eval_metric_result.threshold, + "score": eval_metric_result.score, + "eval_status": eval_metric_result.eval_status.name, + "prompt": AgentEvaluator._convert_content_to_text( + expected_invocation.user_content + if expected_invocation + else actual_invocation.user_content + ), + "expected_response": AgentEvaluator._convert_content_to_text( + expected_invocation.final_response + if expected_invocation + else None + ), + "actual_response": AgentEvaluator._convert_content_to_text( + actual_invocation.final_response + ), + "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text( + expected_invocation.intermediate_data + if expected_invocation + else None + ), + "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text( + actual_invocation.intermediate_data + ), + }) + + return rows + + @staticmethod + def _write_results_to_csv( + rows: list[dict[str, Any]], + output_file: str, + ) -> None: + """Appends the collected eval result rows to a CSV file. + + Rows are appended so that results from multiple eval sets (for example, when + evaluating a directory of test files) can be accumulated in a single file. + The header is only written when the file does not already exist. + """ + try: + import pandas as pd + except ModuleNotFoundError as e: + raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e + + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + + file_exists = os.path.isfile(output_file) + pd.DataFrame(rows).to_csv( + output_file, mode="a", header=not file_exists, index=False + ) + logger.info("Saved eval results to %s", output_file) diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py new file mode 100644 index 00000000000..8a3582edfae --- /dev/null +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -0,0 +1,198 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import os + +from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.eval_case import Invocation +from google.adk.evaluation.eval_metrics import EvalMetricResult +from google.adk.evaluation.evaluator import EvalStatus +from google.genai import types as genai_types +import pandas as pd +import pytest + + +def _content(text: str) -> genai_types.Content: + return genai_types.Content(parts=[genai_types.Part(text=text)]) + + +def _make_result_with_invocation( + metric_name: str, + score: float, + threshold: float, + eval_status: EvalStatus, + prompt: str, + expected_response: str, + actual_response: str, +) -> _EvalMetricResultWithInvocation: + return _EvalMetricResultWithInvocation( + actual_invocation=Invocation( + user_content=_content(prompt), + final_response=_content(actual_response), + ), + expected_invocation=Invocation( + user_content=_content(prompt), + final_response=_content(expected_response), + ), + eval_metric_result=EvalMetricResult( + metric_name=metric_name, + threshold=threshold, + score=score, + eval_status=eval_status, + ), + ) + + +def test_get_results_as_rows_flattens_metrics_and_invocations(): + eval_metric_results = { + "response_match_score": [ + _make_result_with_invocation( + metric_name="response_match_score", + score=1.0, + threshold=0.8, + eval_status=EvalStatus.PASSED, + prompt="What is 2 + 2?", + expected_response="4", + actual_response="4", + ), + _make_result_with_invocation( + metric_name="response_match_score", + score=0.0, + threshold=0.8, + eval_status=EvalStatus.FAILED, + prompt="Capital of France?", + expected_response="Paris", + actual_response="London", + ), + ], + } + + rows = AgentEvaluator._get_results_as_rows( + eval_set_id="my_eval_set", + eval_id="my_eval_case", + eval_metric_results=eval_metric_results, + ) + + assert len(rows) == 2 + first = rows[0] + assert first["eval_set_id"] == "my_eval_set" + assert first["eval_id"] == "my_eval_case" + assert first["metric_name"] == "response_match_score" + assert first["threshold"] == 0.8 + assert first["score"] == 1.0 + assert first["eval_status"] == "PASSED" + assert first["prompt"] == "What is 2 + 2?" + assert first["expected_response"] == "4" + assert first["actual_response"] == "4" + + # Failing invocation should still be captured. + assert rows[1]["eval_status"] == "FAILED" + assert rows[1]["actual_response"] == "London" + + +def test_get_results_as_rows_handles_missing_expected_invocation(): + result = _EvalMetricResultWithInvocation( + actual_invocation=Invocation( + user_content=_content("hi"), + final_response=_content("hello"), + ), + expected_invocation=None, + eval_metric_result=EvalMetricResult( + metric_name="safety_v1", + threshold=0.5, + score=1.0, + eval_status=EvalStatus.PASSED, + ), + ) + + rows = AgentEvaluator._get_results_as_rows( + eval_set_id="s", + eval_id="c", + eval_metric_results={"safety_v1": [result]}, + ) + + assert len(rows) == 1 + assert rows[0]["prompt"] == "hi" + assert rows[0]["expected_response"] == "" + assert rows[0]["actual_response"] == "hello" + + +def test_write_results_to_csv_writes_expected_file(tmp_path): + rows = [ + { + "eval_set_id": "s", + "eval_id": "c", + "metric_name": "response_match_score", + "threshold": 0.8, + "score": 1.0, + "eval_status": "PASSED", + "prompt": "What is 2 + 2?", + "expected_response": "4", + "actual_response": "4", + "expected_tool_calls": "", + "actual_tool_calls": "", + }, + ] + output_file = os.path.join(str(tmp_path), "nested", "eval_results.csv") + + AgentEvaluator._write_results_to_csv(rows=rows, output_file=output_file) + + # The nested directory should have been created. + assert os.path.isfile(output_file) + + df = pd.read_csv(output_file) + assert list(df.columns) == list(rows[0].keys()) + assert len(df) == 1 + assert df.iloc[0]["metric_name"] == "response_match_score" + assert df.iloc[0]["eval_status"] == "PASSED" + assert df.iloc[0]["score"] == 1.0 + + +def test_write_results_to_csv_appends_without_duplicate_header(tmp_path): + output_file = os.path.join(str(tmp_path), "eval_results.csv") + + def _row(eval_id: str, score: float, status: str) -> dict: + return { + "eval_set_id": "s", + "eval_id": eval_id, + "metric_name": "response_match_score", + "threshold": 0.8, + "score": score, + "eval_status": status, + "prompt": "p", + "expected_response": "e", + "actual_response": "a", + "expected_tool_calls": "", + "actual_tool_calls": "", + } + + AgentEvaluator._write_results_to_csv( + rows=[_row("case_1", 1.0, "PASSED")], output_file=output_file + ) + AgentEvaluator._write_results_to_csv( + rows=[_row("case_2", 0.0, "FAILED")], output_file=output_file + ) + + df = pd.read_csv(output_file) + # Two appends should accumulate two rows, with the header written only once. + assert len(df) == 2 + assert sorted(df["eval_id"].tolist()) == ["case_1", "case_2"] + assert "eval_id" not in df["eval_id"].tolist() + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, "-v"])) From aa74e0dbdd3671b29909c970bc9a459b7859446c Mon Sep 17 00:00:00 2001 From: vaibhav-patel Date: Mon, 22 Jun 2026 15:33:54 +0530 Subject: [PATCH 2/2] chore: re-trigger CI checks