diff --git a/src/google/adk/cli/cli_eval.py b/src/google/adk/cli/cli_eval.py index 33c1693208..5aa0ecb848 100644 --- a/src/google/adk/cli/cli_eval.py +++ b/src/google/adk/cli/cli_eval.py @@ -24,7 +24,9 @@ import click from google.genai import types as genai_types +from ..agents.base_agent import BaseAgent from ..agents.llm_agent import Agent +from ..apps.app import App from ..evaluation.base_eval_service import BaseEvalService from ..evaluation.base_eval_service import EvaluateConfig from ..evaluation.base_eval_service import EvaluateRequest @@ -86,11 +88,33 @@ def get_default_metric_info( ) -def get_root_agent(agent_module_file_path: str) -> Agent: - """Returns root agent given the agent module.""" +def get_app_or_root_agent( + agent_module_file_path: str, +) -> tuple[Optional[App], BaseAgent]: + """Returns the (app, root_agent) pair for the given agent module. + + Resolution order mirrors `AgentLoader._load_from_module_or_package`: + if the module exposes an `App` instance via `agent.app`, that App and its + `root_agent` are returned. Otherwise `app` is None and the bare + `agent.root_agent` is returned. This lets eval flows participate in the + App's plugin / cache / resumability lifecycle when one is defined, while + preserving the bare-`root_agent` path for projects that don't use App. + """ agent_module = _get_agent_module(agent_module_file_path) - root_agent = agent_module.agent.root_agent - return root_agent + app = getattr(agent_module.agent, "app", None) + if isinstance(app, App): + return app, app.root_agent + return None, agent_module.agent.root_agent + + +def get_root_agent(agent_module_file_path: str) -> Agent: + """Returns root agent given the agent module. + + Kept for backward compatibility. New callers should prefer + `get_app_or_root_agent`, which also surfaces the wrapping `App` (if any) + so plugins, context-cache, and resumability configs are honored. + """ + return get_app_or_root_agent(agent_module_file_path)[1] def try_get_reset_func(agent_module_file_path: str) -> Any: diff --git a/src/google/adk/cli/cli_tools_click.py b/src/google/adk/cli/cli_tools_click.py index c6a71175a1..76764cfbb4 100644 --- a/src/google/adk/cli/cli_tools_click.py +++ b/src/google/adk/cli/cli_tools_click.py @@ -979,8 +979,8 @@ def cli_eval( from ..evaluation.simulation.user_simulator_provider import UserSimulatorProvider from .cli_eval import _collect_eval_results from .cli_eval import _collect_inferences + from .cli_eval import get_app_or_root_agent from .cli_eval import get_default_metric_info - from .cli_eval import get_root_agent from .cli_eval import parse_and_get_evals_to_run from .cli_eval import pretty_print_eval_result except ModuleNotFoundError as mnf: @@ -990,7 +990,7 @@ def cli_eval( print(f"Using evaluation criteria: {eval_config}") eval_metrics = get_eval_metrics_from_config(eval_config) - root_agent = get_root_agent(agent_module_file_path) + app, root_agent = get_app_or_root_agent(agent_module_file_path) app_name = os.path.basename(agent_module_file_path) agents_dir = os.path.dirname(agent_module_file_path) eval_sets_manager = None @@ -1098,6 +1098,7 @@ def cli_eval( eval_set_results_manager=eval_set_results_manager, user_simulator_provider=user_simulator_provider, metric_evaluator_registry=metric_evaluator_registry, + app=app, ) inference_results = asyncio.run( @@ -1121,8 +1122,6 @@ def cli_eval( eval_run_summary = {} for eval_result in eval_results: - eval_result: EvalCaseResult - if eval_result.eval_set_id not in eval_run_summary: eval_run_summary[eval_result.eval_set_id] = [0, 0] @@ -1139,7 +1138,6 @@ def cli_eval( if print_detailed_results: for eval_result in eval_results: - eval_result: EvalCaseResult click.echo( "********************************************************************" ) diff --git a/src/google/adk/evaluation/evaluation_generator.py b/src/google/adk/evaluation/evaluation_generator.py index 5b0100818c..f6e88bdb2f 100644 --- a/src/google/adk/evaluation/evaluation_generator.py +++ b/src/google/adk/evaluation/evaluation_generator.py @@ -36,6 +36,7 @@ from ..agents.llm_agent import Agent from ..agents.run_config import RunConfig from ..agents.run_config import StreamingMode +from ..apps.app import App from ..artifacts.base_artifact_service import BaseArtifactService from ..artifacts.in_memory_artifact_service import InMemoryArtifactService from ..events.event import Event @@ -331,20 +332,30 @@ async def _process_query( """Process a query using the agent and evaluation dataset.""" module_path = f"{module_name}" agent_module = importlib.import_module(module_path) - root_agent = agent_module.agent.root_agent + # Prefer the wrapping `App` when the module exposes one, so that + # `app.plugins`, context-cache, and resumability configs participate + # in eval runs the same way they do for `adk web` / `adk run`. + app_obj = getattr(agent_module.agent, "app", None) + if isinstance(app_obj, App): + root_agent = app_obj.root_agent + else: + app_obj = None + root_agent = agent_module.agent.root_agent reset_func = getattr(agent_module.agent, "reset_data", None) agent_to_evaluate = root_agent if agent_name: - agent_to_evaluate = root_agent.find_agent(agent_name) - assert agent_to_evaluate, f"Sub-Agent `{agent_name}` not found." + found_agent = root_agent.find_agent(agent_name) + assert found_agent, f"Sub-Agent `{agent_name}` not found." + agent_to_evaluate = found_agent return await EvaluationGenerator._generate_inferences_from_root_agent( agent_to_evaluate, user_simulator=user_simulator, reset_func=reset_func, initial_session=initial_session, + app=app_obj, ) @staticmethod @@ -543,8 +554,17 @@ async def _generate_inferences_from_root_agent( session_service: Optional[BaseSessionService] = None, artifact_service: Optional[BaseArtifactService] = None, memory_service: Optional[BaseMemoryService] = None, + app: Optional[App] = None, ) -> list[Invocation]: - """Scrapes the root agent in coordination with the user simulator.""" + """Scrapes the root agent in coordination with the user simulator. + + If `app` is provided, the eval Runner is built from a copy of the App + with internal eval plugins merged into `app.plugins`, preserving the + App's `context_cache_config`, `resumability_config`, and any other + application-wide configuration. Otherwise the Runner is built from + the bare `root_agent` with only the internal eval plugins, matching + the legacy behavior. + """ if not session_service: session_service = InMemorySessionService() @@ -581,13 +601,39 @@ async def _generate_inferences_from_root_agent( ensure_retry_options_plugin = EnsureRetryOptionsPlugin( name="ensure_retry_options" ) + internal_eval_plugins = [ + request_intercepter_plugin, + ensure_retry_options_plugin, + ] + + if app is not None: + # Copy the App so we don't mutate the user's instance, and merge our + # internal eval plugins with the user's. Override `root_agent` so the + # Runner targets the agent the caller actually asked us to evaluate + # (e.g., a sub-agent), while still carrying the App's plugins, + # context_cache_config, and resumability_config. + runner_app = app.model_copy( + update={ + "plugins": list(app.plugins) + internal_eval_plugins, + "root_agent": root_agent, + } + ) + runner_kwargs: dict[str, Any] = { + "app": runner_app, + "app_name": app_name, + } + else: + runner_kwargs = { + "app_name": app_name, + "agent": root_agent, + "plugins": internal_eval_plugins, + } + async with Runner( - app_name=app_name, - agent=root_agent, + **runner_kwargs, artifact_service=artifact_service, session_service=session_service, memory_service=memory_service, - plugins=[request_intercepter_plugin, ensure_retry_options_plugin], ) as runner: events = [] while True: diff --git a/src/google/adk/evaluation/local_eval_service.py b/src/google/adk/evaluation/local_eval_service.py index 1a032bad64..bb2cc0d38c 100644 --- a/src/google/adk/evaluation/local_eval_service.py +++ b/src/google/adk/evaluation/local_eval_service.py @@ -25,6 +25,7 @@ from typing_extensions import override from ..agents.base_agent import BaseAgent +from ..apps.app import App from ..artifacts.base_artifact_service import BaseArtifactService from ..artifacts.in_memory_artifact_service import InMemoryArtifactService from ..errors.not_found_error import NotFoundError @@ -123,8 +124,20 @@ def __init__( session_id_supplier: Callable[[], str] = _get_session_id, user_simulator_provider: UserSimulatorProvider = UserSimulatorProvider(), memory_service: Optional[BaseMemoryService] = None, + *, + app: Optional[App] = None, ): + """Initializes a LocalEvalService. + + Args: + app: Optional `App` that wraps `root_agent`. When provided, eval runs + are executed through a Runner built from the App, so `app.plugins`, + `app.context_cache_config`, and `app.resumability_config` are + honored during inference. When None, the legacy bare-agent path is + used. + """ self._root_agent = root_agent + self._app = app self._eval_sets_manager = eval_sets_manager metric_evaluator_registry = ( metric_evaluator_registry or DEFAULT_METRIC_EVALUATOR_REGISTRY @@ -516,6 +529,7 @@ async def _perform_inference_single_eval_item( session_service=self._session_service, artifact_service=self._artifact_service, memory_service=self._memory_service, + app=self._app, ) ) diff --git a/tests/unittests/cli/utils/test_cli_eval.py b/tests/unittests/cli/utils/test_cli_eval.py index c6d21fa707..368a943dc6 100644 --- a/tests/unittests/cli/utils/test_cli_eval.py +++ b/tests/unittests/cli/utils/test_cli_eval.py @@ -19,6 +19,9 @@ from types import SimpleNamespace from unittest import mock +from google.adk.agents.base_agent import BaseAgent +from google.adk.apps.app import App + def test_get_eval_sets_manager_local(monkeypatch): mock_local_manager = mock.MagicMock() @@ -49,3 +52,66 @@ def test_get_eval_sets_manager_gcs(monkeypatch): ) assert manager == mock_gcs_manager mock_create_gcs.assert_called_once_with("gs://bucket") + + +def _patch_agent_module(monkeypatch, agent_namespace): + """Patches `_get_agent_module` to return a stub whose `.agent` matches.""" + monkeypatch.setattr( + "google.adk.cli.cli_eval._get_agent_module", + lambda _path: SimpleNamespace(agent=agent_namespace), + ) + + +def test_get_app_or_root_agent_with_app(monkeypatch): + """When the module exposes an App, both app and its root_agent are returned.""" + root_agent = BaseAgent(name="root_agent") + app = App(name="my_app", root_agent=root_agent) + _patch_agent_module( + monkeypatch, SimpleNamespace(root_agent=root_agent, app=app) + ) + + from google.adk.cli.cli_eval import get_app_or_root_agent + + resolved_app, resolved_root = get_app_or_root_agent("some/path") + assert resolved_app is app + assert resolved_root is root_agent + + +def test_get_app_or_root_agent_without_app(monkeypatch): + """When only `root_agent` is exposed, app is None.""" + root_agent = BaseAgent(name="root_agent") + _patch_agent_module(monkeypatch, SimpleNamespace(root_agent=root_agent)) + + from google.adk.cli.cli_eval import get_app_or_root_agent + + resolved_app, resolved_root = get_app_or_root_agent("some/path") + assert resolved_app is None + assert resolved_root is root_agent + + +def test_get_app_or_root_agent_app_attribute_not_an_app_instance(monkeypatch): + """If `app` exists but is not an App, it is ignored and we fall back.""" + root_agent = BaseAgent(name="root_agent") + _patch_agent_module( + monkeypatch, + SimpleNamespace(root_agent=root_agent, app="not-an-app"), + ) + + from google.adk.cli.cli_eval import get_app_or_root_agent + + resolved_app, resolved_root = get_app_or_root_agent("some/path") + assert resolved_app is None + assert resolved_root is root_agent + + +def test_get_root_agent_back_compat(monkeypatch): + """Existing `get_root_agent` callers keep getting the bare agent back.""" + root_agent = BaseAgent(name="root_agent") + app = App(name="my_app", root_agent=root_agent) + _patch_agent_module( + monkeypatch, SimpleNamespace(root_agent=root_agent, app=app) + ) + + from google.adk.cli.cli_eval import get_root_agent + + assert get_root_agent("some/path") is root_agent diff --git a/tests/unittests/cli/utils/test_cli_tools_click.py b/tests/unittests/cli/utils/test_cli_tools_click.py index 4f77a71f16..459b7ab673 100644 --- a/tests/unittests/cli/utils/test_cli_tools_click.py +++ b/tests/unittests/cli/utils/test_cli_tools_click.py @@ -60,8 +60,14 @@ def mock_load_eval_set_from_file(): @pytest.fixture def mock_get_root_agent(): - with mock.patch("google.adk.cli.cli_eval.get_root_agent") as mock_func: - mock_func.return_value = root_agent + """Patches the agent resolver used by the eval CLI. + + `cli_eval` resolves agents via `get_app_or_root_agent` (which returns + `(app, root_agent)`); the eval-set tests don't exercise the App path, + so we yield `(None, root_agent)`. + """ + with mock.patch("google.adk.cli.cli_eval.get_app_or_root_agent") as mock_func: + mock_func.return_value = (None, root_agent) yield mock_func diff --git a/tests/unittests/evaluation/test_evaluation_generator.py b/tests/unittests/evaluation/test_evaluation_generator.py index 05ab25cc72..9cefb387dc 100644 --- a/tests/unittests/evaluation/test_evaluation_generator.py +++ b/tests/unittests/evaluation/test_evaluation_generator.py @@ -16,6 +16,8 @@ import asyncio +from google.adk.agents.base_agent import BaseAgent +from google.adk.apps.app import App from google.adk.evaluation.app_details import AgentDetails from google.adk.evaluation.app_details import AppDetails from google.adk.evaluation.evaluation_generator import _LiveSession @@ -26,6 +28,7 @@ from google.adk.evaluation.simulation.user_simulator import UserSimulator from google.adk.events.event import Event from google.adk.models.llm_request import LlmRequest +from google.adk.plugins.base_plugin import BasePlugin from google.genai import types import pytest @@ -860,3 +863,133 @@ async def mock_run_live(*args, **kwargs): ) assert isinstance(called_after_args.kwargs["llm_response"], Event) assert called_after_args.kwargs["llm_response"] == mock_event + + +class _SpyPlugin(BasePlugin): + """A user-defined plugin used to assert merge behavior.""" + + pass + + +class TestGenerateInferencesFromRootAgentWithApp: + """Tests that App.plugins / configs are honored when an App is provided.""" + + @pytest.fixture + def runner_cls(self, mocker): + """Patches Runner and returns the patched class for kwargs inspection.""" + mock_runner_cls = mocker.patch( + "google.adk.evaluation.evaluation_generator.Runner" + ) + mock_runner_instance = mocker.AsyncMock() + mock_runner_instance.__aenter__.return_value = mock_runner_instance + mock_runner_cls.return_value = mock_runner_instance + yield mock_runner_cls + + @pytest.fixture + def stop_immediately_simulator(self, mocker): + """Returns a UserSimulator that stops on first call (no inference work).""" + sim = mocker.MagicMock(spec=UserSimulator) + sim.get_next_user_message = mocker.AsyncMock( + return_value=NextUserMessage( + status=UserSimulatorStatus.STOP_SIGNAL_DETECTED + ) + ) + return sim + + @pytest.mark.asyncio + async def test_runner_built_from_app_when_provided( + self, runner_cls, mock_session_service, stop_immediately_simulator + ): + """When `app` is passed, Runner is built with `app=` (merged) instead of `agent=`.""" + root_agent = BaseAgent(name="root_agent") + user_plugin = _SpyPlugin(name="user_plugin") + app = App(name="my_app", root_agent=root_agent, plugins=[user_plugin]) + + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=root_agent, + user_simulator=stop_immediately_simulator, + app=app, + ) + + runner_cls.assert_called_once() + kwargs = runner_cls.call_args.kwargs + assert "agent" not in kwargs, ( + "Runner must not receive `agent=` when `app=` is provided " + "(would raise ValueError)." + ) + assert "plugins" not in kwargs, ( + "Runner must not receive `plugins=` when `app=` is provided " + "(would raise ValueError)." + ) + runner_app = kwargs["app"] + assert isinstance(runner_app, App) + plugin_names = [p.name for p in runner_app.plugins] + assert ( + "user_plugin" in plugin_names + ), "User plugin must be preserved in the merged App passed to Runner." + assert "request_intercepter_plugin" in plugin_names + assert "ensure_retry_options" in plugin_names + + @pytest.mark.asyncio + async def test_user_app_is_not_mutated( + self, runner_cls, mock_session_service, stop_immediately_simulator + ): + """The user's App instance must not be mutated across eval runs.""" + root_agent = BaseAgent(name="root_agent") + user_plugin = _SpyPlugin(name="user_plugin") + app = App(name="my_app", root_agent=root_agent, plugins=[user_plugin]) + original_plugins_id = id(app.plugins) + + for _ in range(3): + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=root_agent, + user_simulator=stop_immediately_simulator, + app=app, + ) + + # The user's App instance must still hold exactly its original plugin set, + # regardless of how many eval runs reused it. + assert app.plugins == [user_plugin] + assert id(app.plugins) == original_plugins_id + + @pytest.mark.asyncio + async def test_runner_falls_back_to_bare_agent_when_no_app( + self, runner_cls, mock_session_service, stop_immediately_simulator + ): + """When `app` is None, Runner is built with the legacy `agent=`/`plugins=` shape.""" + root_agent = BaseAgent(name="root_agent") + + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=root_agent, + user_simulator=stop_immediately_simulator, + ) + + runner_cls.assert_called_once() + kwargs = runner_cls.call_args.kwargs + assert "app" not in kwargs + assert kwargs["agent"] is root_agent + plugin_names = [p.name for p in kwargs["plugins"]] + assert plugin_names == [ + "request_intercepter_plugin", + "ensure_retry_options", + ] + + @pytest.mark.asyncio + async def test_root_agent_override_propagates_to_merged_app( + self, runner_cls, mock_session_service, stop_immediately_simulator + ): + """If a sub-agent is passed as root_agent, the merged App reflects that.""" + full_root = BaseAgent(name="full_root") + sub_agent = BaseAgent(name="sub_agent") + app = App(name="my_app", root_agent=full_root) + + await EvaluationGenerator._generate_inferences_from_root_agent( + root_agent=sub_agent, + user_simulator=stop_immediately_simulator, + app=app, + ) + + runner_app = runner_cls.call_args.kwargs["app"] + assert runner_app.root_agent is sub_agent + # User's App must be untouched. + assert app.root_agent is full_root diff --git a/tests/unittests/evaluation/test_local_eval_service.py b/tests/unittests/evaluation/test_local_eval_service.py index 3bbfafc5be..7c5755ae84 100644 --- a/tests/unittests/evaluation/test_local_eval_service.py +++ b/tests/unittests/evaluation/test_local_eval_service.py @@ -19,6 +19,7 @@ from typing import Optional from google.adk.agents.llm_agent import LlmAgent +from google.adk.apps.app import App from google.adk.errors.not_found_error import NotFoundError from google.adk.evaluation.base_eval_service import EvaluateConfig from google.adk.evaluation.base_eval_service import EvaluateRequest @@ -906,6 +907,10 @@ async def test_perform_inference_single_eval_item_non_live( live_timeout_seconds=300, ) + # The non-live branch forwards `app=self._app` to the underlying + # `_generate_inferences_from_root_agent` (see fix in + # `local_eval_service.py`). The `eval_service` fixture builds the service + # without an `app`, so we expect `app=None`. mock_generate.assert_called_once_with( root_agent=dummy_agent, user_simulator=mock_user_sim, @@ -914,4 +919,76 @@ async def test_perform_inference_single_eval_item_non_live( session_service=eval_service._session_service, artifact_service=eval_service._artifact_service, memory_service=eval_service._memory_service, + app=None, ) + + +@pytest.mark.asyncio +async def test_perform_inference_forwards_app_to_evaluation_generator( + dummy_agent, mock_eval_sets_manager, mocker +): + """LocalEvalService passes its `app` through to _generate_inferences_from_root_agent.""" + app = App(name="test_app", root_agent=dummy_agent) + + eval_case = EvalCase(eval_id="case-1", conversation=[]) + mock_eval_sets_manager.get_eval_set.return_value = EvalSet( + eval_set_id="set-1", + eval_cases=[eval_case], + ) + + mock_generate = mocker.patch( + "google.adk.evaluation.local_eval_service.EvaluationGenerator._generate_inferences_from_root_agent", + new=mocker.AsyncMock(return_value=[]), + ) + + service = LocalEvalService( + root_agent=dummy_agent, + eval_sets_manager=mock_eval_sets_manager, + app=app, + ) + + request = InferenceRequest( + app_name="test_app", + eval_set_id="set-1", + eval_case_ids=["case-1"], + inference_config=InferenceConfig(), + ) + async for _ in service.perform_inference(inference_request=request): + pass + + mock_generate.assert_awaited_once() + assert mock_generate.await_args.kwargs["app"] is app + + +@pytest.mark.asyncio +async def test_perform_inference_passes_none_when_no_app( + dummy_agent, mock_eval_sets_manager, mocker +): + """When LocalEvalService has no `app`, it forwards None (legacy behavior).""" + eval_case = EvalCase(eval_id="case-1", conversation=[]) + mock_eval_sets_manager.get_eval_set.return_value = EvalSet( + eval_set_id="set-1", + eval_cases=[eval_case], + ) + + mock_generate = mocker.patch( + "google.adk.evaluation.local_eval_service.EvaluationGenerator._generate_inferences_from_root_agent", + new=mocker.AsyncMock(return_value=[]), + ) + + service = LocalEvalService( + root_agent=dummy_agent, + eval_sets_manager=mock_eval_sets_manager, + ) + + request = InferenceRequest( + app_name="test_app", + eval_set_id="set-1", + eval_case_ids=["case-1"], + inference_config=InferenceConfig(), + ) + async for _ in service.perform_inference(inference_request=request): + pass + + mock_generate.assert_awaited_once() + assert mock_generate.await_args.kwargs["app"] is None