Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 106 additions & 1 deletion src/google/adk/evaluation/agent_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ async def evaluate_eval_set(
num_runs: int = NUM_RUNS,
agent_name: Optional[str] = None,
print_detailed_results: bool = True,
output_file: Optional[str] = None,
):
"""Evaluates an agent using the given EvalSet.

Expand All @@ -130,6 +131,10 @@ async def evaluate_eval_set(
than root agent. If left empty or none, then root agent is evaluated.
print_detailed_results: Whether to print detailed results for each metric
evaluation.
output_file: If provided, per-invocation evaluation results (for both
passing and failing metrics) are written to this path as a CSV file.
Disabled by default. The parent directory is created if it does not
already exist.
"""
if criteria:
logger.warning(
Expand Down Expand Up @@ -169,7 +174,11 @@ async def evaluate_eval_set(
# test failures. We track them and then report them towards the end.
failures: list[str] = []

for _, eval_results_per_eval_id in eval_results_by_eval_id.items():
# Optionally, we collect per-invocation results across all eval cases and
# metrics so that they can be written out to a CSV file at the end.
csv_rows: list[dict[str, Any]] = []

for eval_id, eval_results_per_eval_id in eval_results_by_eval_id.items():
eval_metric_results = (
AgentEvaluator._get_eval_metric_results_with_invocation(
eval_results_per_eval_id
Expand All @@ -183,6 +192,20 @@ async def evaluate_eval_set(

failures.extend(failures_per_eval_case)

if output_file:
csv_rows.extend(
AgentEvaluator._get_results_as_rows(
eval_set_id=eval_set.eval_set_id,
eval_id=eval_id,
eval_metric_results=eval_metric_results,
)
)

if output_file:
AgentEvaluator._write_results_to_csv(
rows=csv_rows, output_file=output_file
)

failure_message = "Following are all the test failures."
if not print_detailed_results:
failure_message += (
Expand All @@ -200,6 +223,7 @@ async def evaluate(
agent_name: Optional[str] = None,
initial_session_file: Optional[str] = None,
print_detailed_results: bool = True,
output_file: Optional[str] = None,
):
"""Evaluates an Agent given eval data.

Expand All @@ -218,6 +242,10 @@ async def evaluate(
needed by all the evals in the eval dataset.
print_detailed_results: Whether to print detailed results for each metric
evaluation.
output_file: If provided, per-invocation evaluation results are written to
this path as a CSV file. Disabled by default. When the eval data spans
multiple test files, results from all of them are appended to the same
file.
"""
test_files = []
if isinstance(eval_dataset_file_path_or_dir, str) and os.path.isdir(
Expand Down Expand Up @@ -245,6 +273,7 @@ async def evaluate(
num_runs=num_runs,
agent_name=agent_name,
print_detailed_results=print_detailed_results,
output_file=output_file,
)

@staticmethod
Expand Down Expand Up @@ -698,3 +727,79 @@ def _process_metrics_and_get_failures(
)

return failures

@staticmethod
def _get_results_as_rows(
    eval_set_id: str,
    eval_id: str,
    eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]],
) -> list[dict[str, Any]]:
  """Builds one flat row for every (metric, invocation) pair.

  Each row carries the same columns that `_print_details` uses, prefixed with
  the eval set id, eval case id and metric name so that rows coming from
  different eval cases and metrics remain distinguishable once they are
  written to a single CSV file.

  Args:
    eval_set_id: Identifier of the eval set the results belong to.
    eval_id: Identifier of the eval case the results belong to.
    eval_metric_results: Per-invocation results, keyed by metric name.

  Returns:
    A list of dicts, one per metric per invocation, in iteration order.
  """
  flattened: list[dict[str, Any]] = []

  for metric_name, entries in eval_metric_results.items():
    for entry in entries:
      metric_result = entry.eval_metric_result
      expected = entry.expected_invocation
      actual = entry.actual_invocation

      # When there is no expected invocation, the prompt is taken from the
      # actual invocation and the expected_* columns are derived from None.
      if expected:
        prompt_content = expected.user_content
        expected_final_response = expected.final_response
        expected_intermediate_data = expected.intermediate_data
      else:
        prompt_content = actual.user_content
        expected_final_response = None
        expected_intermediate_data = None

      row: dict[str, Any] = {
          "eval_set_id": eval_set_id,
          "eval_id": eval_id,
          "metric_name": metric_name,
          "threshold": metric_result.threshold,
          "score": metric_result.score,
          "eval_status": metric_result.eval_status.name,
          "prompt": AgentEvaluator._convert_content_to_text(prompt_content),
          "expected_response": AgentEvaluator._convert_content_to_text(
              expected_final_response
          ),
          "actual_response": AgentEvaluator._convert_content_to_text(
              actual.final_response
          ),
          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
              expected_intermediate_data
          ),
          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
              actual.intermediate_data
          ),
      }
      flattened.append(row)

  return flattened

@staticmethod
def _write_results_to_csv(
    rows: list[dict[str, Any]],
    output_file: str,
) -> None:
  """Appends the collected eval result rows to a CSV file.

  Rows are appended so that results from multiple eval sets (for example, when
  evaluating a directory of test files) can be accumulated in a single file.
  The header is written only when the file does not exist yet or is empty.
  Nothing is written when `rows` is empty.

  Args:
    rows: Flat result rows; the dict keys become the CSV columns.
    output_file: Path of the CSV file to append to. Its parent directory is
      created if it does not already exist.

  Raises:
    ModuleNotFoundError: If pandas is not installed.
  """
  try:
    import pandas as pd
  except ModuleNotFoundError as e:
    raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e

  if not rows:
    # A DataFrame built from no rows has no columns. Writing it would create
    # the file without the real column header, and every later append would
    # then skip the header because the file already exists.
    logger.info("No eval results to save to %s", output_file)
    return

  output_dir = os.path.dirname(output_file)
  if output_dir:
    os.makedirs(output_dir, exist_ok=True)

  # A pre-created but empty file (e.g. a temp file) still needs the header.
  needs_header = (
      not os.path.isfile(output_file) or os.path.getsize(output_file) == 0
  )
  pd.DataFrame(rows).to_csv(
      output_file, mode="a", header=needs_header, index=False
  )
  logger.info("Saved eval results to %s", output_file)
198 changes: 198 additions & 0 deletions tests/unittests/evaluation/test_agent_evaluator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import os

from google.adk.evaluation.agent_evaluator import _EvalMetricResultWithInvocation
from google.adk.evaluation.agent_evaluator import AgentEvaluator
from google.adk.evaluation.eval_case import Invocation
from google.adk.evaluation.eval_metrics import EvalMetricResult
from google.adk.evaluation.evaluator import EvalStatus
from google.genai import types as genai_types
import pandas as pd
import pytest


def _content(text: str) -> genai_types.Content:
  """Wraps `text` in a `Content` holding a single text `Part`."""
  text_part = genai_types.Part(text=text)
  return genai_types.Content(parts=[text_part])


def _make_result_with_invocation(
    metric_name: str,
    score: float,
    threshold: float,
    eval_status: EvalStatus,
    prompt: str,
    expected_response: str,
    actual_response: str,
) -> _EvalMetricResultWithInvocation:
  """Builds a metric result paired with its actual and expected invocations.

  Both invocations share the same `prompt` as user content and differ only in
  their final response.
  """
  actual = Invocation(
      user_content=_content(prompt),
      final_response=_content(actual_response),
  )
  expected = Invocation(
      user_content=_content(prompt),
      final_response=_content(expected_response),
  )
  metric_result = EvalMetricResult(
      metric_name=metric_name,
      threshold=threshold,
      score=score,
      eval_status=eval_status,
  )
  return _EvalMetricResultWithInvocation(
      actual_invocation=actual,
      expected_invocation=expected,
      eval_metric_result=metric_result,
  )


def test_get_results_as_rows_flattens_metrics_and_invocations():
  """A metric with one passing and one failing invocation yields two rows."""
  metric = "response_match_score"
  passing = _make_result_with_invocation(
      metric_name=metric,
      score=1.0,
      threshold=0.8,
      eval_status=EvalStatus.PASSED,
      prompt="What is 2 + 2?",
      expected_response="4",
      actual_response="4",
  )
  failing = _make_result_with_invocation(
      metric_name=metric,
      score=0.0,
      threshold=0.8,
      eval_status=EvalStatus.FAILED,
      prompt="Capital of France?",
      expected_response="Paris",
      actual_response="London",
  )

  rows = AgentEvaluator._get_results_as_rows(
      eval_set_id="my_eval_set",
      eval_id="my_eval_case",
      eval_metric_results={metric: [passing, failing]},
  )

  assert len(rows) == 2
  passed_row, failed_row = rows

  expected_columns = {
      "eval_set_id": "my_eval_set",
      "eval_id": "my_eval_case",
      "metric_name": "response_match_score",
      "threshold": 0.8,
      "score": 1.0,
      "eval_status": "PASSED",
      "prompt": "What is 2 + 2?",
      "expected_response": "4",
      "actual_response": "4",
  }
  for column, value in expected_columns.items():
    assert passed_row[column] == value

  # Failing invocation should still be captured.
  assert failed_row["eval_status"] == "FAILED"
  assert failed_row["actual_response"] == "London"


def test_get_results_as_rows_handles_missing_expected_invocation():
  """Without an expected invocation the prompt comes from the actual one."""
  actual = Invocation(
      user_content=_content("hi"),
      final_response=_content("hello"),
  )
  metric_result = EvalMetricResult(
      metric_name="safety_v1",
      threshold=0.5,
      score=1.0,
      eval_status=EvalStatus.PASSED,
  )
  result = _EvalMetricResultWithInvocation(
      actual_invocation=actual,
      expected_invocation=None,
      eval_metric_result=metric_result,
  )

  rows = AgentEvaluator._get_results_as_rows(
      eval_set_id="s",
      eval_id="c",
      eval_metric_results={"safety_v1": [result]},
  )

  assert len(rows) == 1
  (only_row,) = rows
  assert only_row["prompt"] == "hi"
  assert only_row["expected_response"] == ""
  assert only_row["actual_response"] == "hello"


def test_write_results_to_csv_writes_expected_file(tmp_path):
  """A single row is written with its keys as columns, creating parent dirs."""
  row = {
      "eval_set_id": "s",
      "eval_id": "c",
      "metric_name": "response_match_score",
      "threshold": 0.8,
      "score": 1.0,
      "eval_status": "PASSED",
      "prompt": "What is 2 + 2?",
      "expected_response": "4",
      "actual_response": "4",
      "expected_tool_calls": "",
      "actual_tool_calls": "",
  }
  output_file = os.path.join(str(tmp_path), "nested", "eval_results.csv")

  AgentEvaluator._write_results_to_csv(rows=[row], output_file=output_file)

  # The nested directory should have been created.
  assert os.path.isfile(output_file)

  df = pd.read_csv(output_file)
  assert list(df.columns) == list(row.keys())
  assert len(df) == 1

  written = df.iloc[0]
  assert written["metric_name"] == "response_match_score"
  assert written["eval_status"] == "PASSED"
  assert written["score"] == 1.0


def test_write_results_to_csv_appends_without_duplicate_header(tmp_path):
  """Two successive writes accumulate two rows under a single header."""
  output_file = os.path.join(str(tmp_path), "eval_results.csv")

  # Columns shared by both rows; the per-case values are filled in below.
  # Overriding existing keys keeps the column order unchanged.
  template = {
      "eval_set_id": "s",
      "eval_id": None,
      "metric_name": "response_match_score",
      "threshold": 0.8,
      "score": None,
      "eval_status": None,
      "prompt": "p",
      "expected_response": "e",
      "actual_response": "a",
      "expected_tool_calls": "",
      "actual_tool_calls": "",
  }
  cases = [("case_1", 1.0, "PASSED"), ("case_2", 0.0, "FAILED")]

  for eval_id, score, status in cases:
    row = {**template, "eval_id": eval_id, "score": score, "eval_status": status}
    AgentEvaluator._write_results_to_csv(rows=[row], output_file=output_file)

  df = pd.read_csv(output_file)
  eval_ids = df["eval_id"].tolist()
  # Two appends should accumulate two rows, with the header written only once.
  assert len(df) == 2
  assert sorted(eval_ids) == ["case_1", "case_2"]
  assert "eval_id" not in eval_ids


if __name__ == "__main__":
  # Running this module directly executes its tests in verbose mode and
  # exits with pytest's return code.
  exit_code = pytest.main([__file__, "-v"])
  raise SystemExit(exit_code)