Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions src/google/adk/evaluation/local_eval_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,29 @@ async def _evaluate_single_inference_result(
else 'test_user_id'
)

if inference_result.inferences is None:
session_details = None
if inference_result.session_id is not None:
session_details = await self._session_service.get_session(
app_name=inference_result.app_name,
user_id=user_id,
session_id=inference_result.session_id,
)
return (
inference_result,
EvalCaseResult(
eval_set_file=inference_result.eval_set_id,
eval_set_id=inference_result.eval_set_id,
eval_id=inference_result.eval_case_id,
final_eval_status=EvalStatus.FAILED,
overall_eval_metric_results=[],
eval_metric_result_per_invocation=[],
session_id=inference_result.session_id or '',
session_details=session_details,
user_id=user_id,
),
)

if eval_case.conversation_scenario is None and len(
inference_result.inferences
) != len(eval_case.conversation):
Expand Down
36 changes: 34 additions & 2 deletions tests/unittests/evaluation/test_local_eval_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from __future__ import annotations

import asyncio
import sys
from typing import Optional

from google.adk.agents.llm_agent import LlmAgent
Expand Down Expand Up @@ -465,6 +464,39 @@ async def test_evaluate_single_inference_result(
assert metric_result.eval_status == EvalStatus.PASSED


@pytest.mark.asyncio
async def test_evaluate_single_inference_result_failed_without_inferences(
    eval_service, mock_eval_sets_manager, mocker
):
  """Checks the eval result produced for an inference result with no inferences.

  The inference result under test has `inferences=None` and a FAILURE status.
  The returned eval case result is expected to be marked FAILED, to keep the
  eval case and session ids, and to carry no metric results.
  """
  # Eval case handed back by the (mocked) eval sets manager. Only the
  # attributes assigned below are given explicit values.
  stub_eval_case = mocker.MagicMock(spec=EvalCase)
  stub_eval_case.conversation = []
  stub_eval_case.conversation_scenario = None
  stub_eval_case.session_input = None
  mock_eval_sets_manager.get_eval_case.return_value = stub_eval_case

  # Inference that failed before producing any invocations.
  failed_inference = InferenceResult(
      app_name="test_app",
      eval_set_id="test_eval_set",
      eval_case_id="case1",
      inferences=None,
      session_id="session1",
      status=InferenceStatus.FAILURE,
      error_message="auth failed",
  )
  fake_metric = EvalMetric(metric_name="fake_metric", threshold=0.5)
  config = EvaluateConfig(eval_metrics=[fake_metric], parallelism=1)

  _, case_result = await eval_service._evaluate_single_inference_result(
      inference_result=failed_inference, evaluate_config=config
  )

  # Identity of the eval case and session is carried over to the result.
  assert case_result.eval_id == "case1"
  assert case_result.session_id == "session1"
  # The case is reported as failed, with no metric results of either kind.
  assert case_result.final_eval_status == EvalStatus.FAILED
  assert case_result.overall_eval_metric_results == []
  assert case_result.eval_metric_result_per_invocation == []


@pytest.mark.asyncio
async def test_evaluate_single_inference_result_for_conversation_scenario(
eval_service, mock_eval_sets_manager, mocker
Expand Down Expand Up @@ -520,7 +552,7 @@ async def test_evaluate_single_inference_result_for_conversation_scenario(
for i in range(3):
invocation_result = result.eval_metric_result_per_invocation[i]
assert invocation_result.actual_invocation == inference_result.inferences[i]
assert invocation_result.expected_invocation == None
assert invocation_result.expected_invocation is None
assert len(invocation_result.eval_metric_results) == 1
metric_result = invocation_result.eval_metric_results[0]
assert metric_result.metric_name == "fake_single_sided_metric"
Expand Down
Loading