From 1c50bab2742c21d66f7c7f0147b7f4320d415855 Mon Sep 17 00:00:00 2001 From: Jason Dai Date: Thu, 19 Mar 2026 15:37:02 -0700 Subject: [PATCH] chore: GenAI Client(evals) - simplify create eval run evaluation interface for user simulation PiperOrigin-RevId: 886409456 --- .../replays/test_create_evaluation_run.py | 105 ++++++++++++++- vertexai/_genai/_evals_common.py | 121 +++++++++++++++--- vertexai/_genai/_evals_data_converters.py | 24 ++-- vertexai/_genai/evals.py | 64 ++++++++- 4 files changed, 278 insertions(+), 36 deletions(-) diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py index 33521f0c8f..e39953eb39 100644 --- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py +++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py @@ -17,8 +17,8 @@ from tests.unit.vertexai.genai.replays import pytest_helper from vertexai import types from google.genai import types as genai_types -import pytest import pandas as pd +import pytest GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output" GENERAL_QUALITY_METRIC = types.EvaluationRunMetric( @@ -42,7 +42,7 @@ metric_config=types.UnifiedMetric( llm_based_metric_spec=genai_types.LLMBasedMetricSpec( metric_prompt_template=( - "\nEvaluate the fluency of the response. Provide a score from 1-5." + "\nEvaluate the fluency of the response. Provide a score from" " 1-5." ) ) ), @@ -80,7 +80,7 @@ ] ) AGENT_INFO = types.evals.AgentInfo( - agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456", + agent_resource_name=("projects/123/locations/us-central1/reasoningEngines/456"), name="agent-1", agents={ "agent-1": types.evals.AgentConfig( @@ -147,6 +147,10 @@ def test_create_eval_run_data_source_evaluation_set(client): AGENT_INFO.name ] == types.EvaluationRunInferenceConfig( agent_configs=AGENT_INFO.agents, + agent_run_config=types.AgentRunConfig( + agent_engine=AGENT_INFO.agent_resource_name, + user_simulator_config={"max_turn": 5}, + ), ) assert evaluation_run.labels == { "vertex-ai-evaluation-agent-engine-id": "456", @@ -203,6 +207,53 @@ def test_create_eval_run_data_source_bigquery_request_set(client): assert evaluation_run.error is None +def test_create_eval_run_with_user_simulator_config(client): + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with user_simulator_config.""" + client._api_client._http_options.api_version = "v1beta1" + evaluation_run = client.evals.create_evaluation_run( + name="test_user_simulator_config", + display_name="test_user_simulator_config", + dataset=types.EvaluationRunDataSource( + evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040" + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + agent_info=AGENT_INFO, + user_simulator_config=types.evals.UserSimulatorConfig( + max_turn=5, + ), + labels={"label1": "value1"}, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test_user_simulator_config" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert ( + evaluation_run.data_source.evaluation_set + == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040" + ) + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + metrics=[GENERAL_QUALITY_METRIC], + ) + assert evaluation_run.inference_configs[ + AGENT_INFO.name + ] == types.EvaluationRunInferenceConfig( + agent_configs=AGENT_INFO.agents, + agent_run_config=types.AgentRunConfig( + agent_engine=AGENT_INFO.agent_resource_name, + user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5), + ), + ) + assert evaluation_run.labels == { + "vertex-ai-evaluation-agent-engine-id": "456", + "label1": "value1", + } + assert evaluation_run.error is None + + def test_create_eval_run_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs.""" client._api_client._http_options.api_version = "v1beta1" @@ -669,6 +720,54 @@ async def test_create_eval_run_async(client): assert evaluation_run.error is None +@pytest.mark.asyncio +async def test_create_eval_run_async_with_user_simulator_config(client): + """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with user_simulator_config asynchronously.""" + client._api_client._http_options.api_version = "v1beta1" + evaluation_run = await client.aio.evals.create_evaluation_run( + name="test_user_simulator_config_async", + display_name="test_user_simulator_config_async", + dataset=types.EvaluationRunDataSource( + evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040" + ), + dest=GCS_DEST, + metrics=[GENERAL_QUALITY_METRIC], + agent_info=AGENT_INFO, + user_simulator_config=types.evals.UserSimulatorConfig( + max_turn=5, + ), + labels={"label1": "value1"}, + ) + assert isinstance(evaluation_run, types.EvaluationRun) + assert evaluation_run.display_name == "test_user_simulator_config_async" + assert evaluation_run.state == types.EvaluationRunState.PENDING + assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource) + assert ( + evaluation_run.data_source.evaluation_set + == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040" + ) + assert evaluation_run.evaluation_config == types.EvaluationRunConfig( + output_config=genai_types.OutputConfig( + gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST) + ), + metrics=[GENERAL_QUALITY_METRIC], + ) + assert evaluation_run.inference_configs[ + AGENT_INFO.name + ] == types.EvaluationRunInferenceConfig( + agent_configs=AGENT_INFO.agents, + agent_run_config=types.AgentRunConfig( + agent_engine=AGENT_INFO.agent_resource_name, + user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5), + ), + ) + assert evaluation_run.labels == { + "label1": "value1", + "vertex-ai-evaluation-agent-engine-id": "456", + } + assert evaluation_run.error is None + + @pytest.mark.asyncio async def test_create_eval_run_async_with_inference_configs(client): """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously.""" diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py index c3a488eb06..f2ca33648c 100644 --- a/vertexai/_genai/_evals_common.py +++ b/vertexai/_genai/_evals_common.py @@ -283,15 +283,66 @@ def _resolve_dataset( api_client: BaseApiClient, dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset], dest: str, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + parsed_agent_info: Optional[types.evals.AgentInfo] = None, ) -> types.EvaluationRunDataSource: """Resolves dataset for the evaluation run.""" if isinstance(dataset, types.EvaluationDataset): - candidate_name = _get_candidate_name(dataset, agent_info_pydantic) + candidate_name = _get_candidate_name(dataset, parsed_agent_info) + eval_df = dataset.eval_dataset_df + if eval_df is None and dataset.eval_cases: + rows = [] + for case in dataset.eval_cases: + row: dict[str, Any] = {} + if case.prompt: + row[_evals_constant.PROMPT] = ( + _evals_data_converters._get_content_text(case.prompt) + ) + + if ( + case.responses + and len(case.responses) > 0 + and case.responses[0].response + ): + row[_evals_constant.RESPONSE] = ( + _evals_data_converters._get_content_text( + case.responses[0].response + ) + ) + + if case.reference and case.reference.response: + row[_evals_constant.REFERENCE] = ( + _evals_data_converters._get_content_text( + case.reference.response + ) + ) + + if case.agent_data: + row[AGENT_DATA] = case.agent_data + + if case.intermediate_events: + row[_evals_constant.INTERMEDIATE_EVENTS] = [ + {CONTENT: event.content} + for event in case.intermediate_events + if event.content + ] + + if case.user_scenario: + if case.user_scenario.starting_prompt: + row[_evals_constant.STARTING_PROMPT] = ( + case.user_scenario.starting_prompt + ) + if case.user_scenario.conversation_plan: + row[_evals_constant.CONVERSATION_PLAN] = ( + case.user_scenario.conversation_plan + ) + + rows.append(row) + eval_df = pd.DataFrame(rows) + eval_set = _create_evaluation_set_from_dataframe( api_client, dest, - dataset.eval_dataset_df, + eval_df, candidate_name, ) dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name) @@ -339,15 +390,34 @@ def _resolve_inference_configs( inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + parsed_agent_info: Optional[types.evals.AgentInfo] = None, ) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]: """Resolves inference configs for the evaluation run.""" # Resolve agent config - if agent_info_pydantic and agent_info_pydantic.name: - inference_configs = {} - inference_configs[agent_info_pydantic.name] = ( - types.EvaluationRunInferenceConfig(agent_configs=agent_info_pydantic.agents) - ) + if parsed_agent_info and parsed_agent_info.name: + if inference_configs is None: + inference_configs = {} + + # We might have used "candidate-1" as a placeholder key in the caller, + # let's migrate it to the agent name, or if it doesn't exist, just create it. + if "candidate-1" in inference_configs: + inference_configs[parsed_agent_info.name] = inference_configs.pop( + "candidate-1" + ) + + if parsed_agent_info.name not in inference_configs: + inference_configs[parsed_agent_info.name] = ( + types.EvaluationRunInferenceConfig( + agent_configs=parsed_agent_info.agents + ) + ) + else: + config = inference_configs[parsed_agent_info.name] + if isinstance(config, dict): + config["agent_configs"] = parsed_agent_info.agents + else: + config.agent_configs = parsed_agent_info.agents + # Resolve prompt template data if inference_configs: for inference_config in inference_configs.values(): @@ -381,33 +451,33 @@ def _resolve_inference_configs( def _add_evaluation_run_labels( labels: Optional[dict[str, str]] = None, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + parsed_agent_info: Optional[types.evals.AgentInfo] = None, ) -> Optional[dict[str, str]]: """Adds labels to the evaluation run.""" - if agent_info_pydantic and agent_info_pydantic.agent_resource_name: + if parsed_agent_info and parsed_agent_info.agent_resource_name: labels = labels or {} labels["vertex-ai-evaluation-agent-engine-id"] = ( - agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[-1] + parsed_agent_info.agent_resource_name.split("reasoningEngines/")[-1] ) return labels def _get_candidate_name( dataset: types.EvaluationDataset, - agent_info_pydantic: Optional[types.evals.AgentInfo] = None, + parsed_agent_info: Optional[types.evals.AgentInfo] = None, ) -> Optional[str]: """Internal helper to get candidate name.""" - if agent_info_pydantic is not None and ( + if parsed_agent_info is not None and ( dataset.candidate_name - and agent_info_pydantic - and agent_info_pydantic.name - and dataset.candidate_name != agent_info_pydantic.name + and parsed_agent_info + and parsed_agent_info.name + and dataset.candidate_name != parsed_agent_info.name ): logger.warning( "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended." ) - elif dataset.candidate_name is None and agent_info_pydantic: - return agent_info_pydantic.name + elif dataset.candidate_name is None and parsed_agent_info: + return parsed_agent_info.name return dataset.candidate_name or None @@ -2406,10 +2476,21 @@ def _create_evaluation_set_from_dataframe( candidate_responses = [] if _evals_constant.RESPONSE in row or agent_data_obj or intermediate_events: + # Resolve the oneof conflict: prioritize agent_data over flat text + response_text = row.get(_evals_constant.RESPONSE) or None + + if agent_data_obj and response_text: + logger.info( + "Both 'response' and 'agent_data' columns found in the evaluation dataset. " + "Prioritizing 'agent_data' and omitting 'response' text to satisfy " + "CandidateResponse protobuf oneof constraints." + ) + response_text = None + candidate_responses.append( types.CandidateResponse( candidate=candidate_name or "Candidate 1", - text=row.get(_evals_constant.RESPONSE) or None, + text=response_text, events=intermediate_events or None, agent_data=agent_data_obj, ) diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py index 33f18b1519..c194c3a9d5 100644 --- a/vertexai/_genai/_evals_data_converters.py +++ b/vertexai/_genai/_evals_data_converters.py @@ -672,18 +672,24 @@ def get_dataset_converter( raise ValueError("Unsupported dataset schema: %s" % dataset_schema) -def _get_first_part_text(content: genai_types.Content) -> str: - """Safely extracts text from the first part of a content.""" +def _get_content_text(content: genai_types.Content) -> str: + """Safely extracts text from all parts of a content. + + If the content has multiple parts, text from all parts is concatenated. + If a part is not text, it is ignored. If no text parts are found, + an empty string is returned. + """ + text_parts = [] if ( content and hasattr(content, "parts") and isinstance(content.parts, list) and content.parts ): - first_part = content.parts[0] - if hasattr(first_part, "text"): - return str(first_part.text) - return "" + for part in content.parts: + if hasattr(part, "text") and part.text is not None: + text_parts.append(str(part.text)) + return "".join(text_parts) def _get_text_from_reference( @@ -691,7 +697,7 @@ def _get_text_from_reference( ) -> Optional[str]: """Safely extracts text from a reference field.""" if reference and hasattr(reference, "response") and reference.response: - return _get_first_part_text(reference.response) + return _get_content_text(reference.response) return None @@ -703,8 +709,8 @@ def _validate_case_consistency( ) -> None: """Logs warnings if prompt or reference mismatches occur.""" if base_case.prompt != current_case.prompt: - base_prompt_text_preview = _get_first_part_text(base_case.prompt)[:50] - current_prompt_text_preview = _get_first_part_text(current_case.prompt)[:50] + base_prompt_text_preview = _get_content_text(base_case.prompt)[:50] + current_prompt_text_preview = _get_content_text(current_case.prompt)[:50] logger.warning( "Prompt mismatch for case index %d between base dataset (0)" " and dataset %d. Using prompt from base. Base prompt" diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py index 0881bd571e..6c80482227 100644 --- a/vertexai/_genai/evals.py +++ b/vertexai/_genai/evals.py @@ -2099,6 +2099,7 @@ def create_evaluation_run( name: Optional[str] = None, display_name: Optional[str] = None, agent_info: Optional[evals_types.AgentInfoOrDict] = None, + user_simulator_config: Optional[evals_types.UserSimulatorConfigOrDict] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, @@ -2113,10 +2114,16 @@ def create_evaluation_run( metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - agent_info: The agent info to evaluate. + agent_info: The agent info to evaluate. Mutually exclusive with + `inference_configs`. + user_simulator_config: The user simulator configuration for agent evaluation. + If `agent_info` is provided without `inference_configs`, this config is used + to automatically construct the inference configuration. If not specified, + or if `max_turn` is not set, `max_turn` defaults to 5. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. - If provided, agent_info must be None. + If provided, `agent_info` must be None. If omitted and `agent_info` is provided, + this will be automatically constructed using `agent_info` and `user_simulator_config`. Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. @@ -2134,6 +2141,27 @@ def create_evaluation_run( if isinstance(agent_info, dict) else (agent_info or evals_types.AgentInfo()) ) + + if agent_info and not inference_configs: + parsed_user_simulator_config = ( + evals_types.UserSimulatorConfig.model_validate(user_simulator_config) + if isinstance(user_simulator_config, dict) + else (user_simulator_config or evals_types.UserSimulatorConfig()) + ) + if getattr(parsed_user_simulator_config, "max_turn", None) is None: + parsed_user_simulator_config.max_turn = 5 + + candidate_name = parsed_agent_info.name or "candidate-1" + inference_configs = { + candidate_name: types.EvaluationRunInferenceConfig( + agent_configs=parsed_agent_info.agents, + agent_run_config=types.AgentRunConfig( + agent_engine=parsed_agent_info.agent_resource_name, + user_simulator_config=parsed_user_simulator_config, + ), + ) + } + if isinstance(dataset, types.EvaluationDataset): _evals_utils._validate_dataset_agent_data(dataset, inference_configs) resolved_dataset = _evals_common._resolve_dataset( @@ -3277,6 +3305,7 @@ async def create_evaluation_run( name: Optional[str] = None, display_name: Optional[str] = None, agent_info: Optional[evals_types.AgentInfo] = None, + user_simulator_config: Optional[evals_types.UserSimulatorConfigOrDict] = None, inference_configs: Optional[ dict[str, types.EvaluationRunInferenceConfigOrDict] ] = None, @@ -3291,10 +3320,16 @@ async def create_evaluation_run( metrics: The list of metrics to evaluate. name: The name of the evaluation run. display_name: The display name of the evaluation run. - agent_info: The agent info to evaluate. + agent_info: The agent info to evaluate. Mutually exclusive with + `inference_configs`. + user_simulator_config: The user simulator configuration for agent evaluation. + If `agent_info` is provided without `inference_configs`, this config is used + to automatically construct the inference configuration. If not specified, + or if `max_turn` is not set, `max_turn` defaults to 5. inference_configs: The candidate to inference config map for the evaluation run. The key is the candidate name, and the value is the inference config. - If provided, agent_info must be None. + If provided, `agent_info` must be None. If omitted and `agent_info` is provided, + this will be automatically constructed using `agent_info` and `user_simulator_config`. Example: {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")} labels: The labels to apply to the evaluation run. @@ -3312,6 +3347,27 @@ async def create_evaluation_run( if isinstance(agent_info, dict) else (agent_info or evals_types.AgentInfo()) ) + + if agent_info and not inference_configs: + parsed_user_simulator_config = ( + evals_types.UserSimulatorConfig.model_validate(user_simulator_config) + if isinstance(user_simulator_config, dict) + else (user_simulator_config or evals_types.UserSimulatorConfig()) + ) + if getattr(parsed_user_simulator_config, "max_turn", None) is None: + parsed_user_simulator_config.max_turn = 5 + + candidate_name = parsed_agent_info.name or "candidate-1" + inference_configs = { + candidate_name: types.EvaluationRunInferenceConfig( + agent_configs=parsed_agent_info.agents, + agent_run_config=types.AgentRunConfig( + agent_engine=parsed_agent_info.agent_resource_name, + user_simulator_config=parsed_user_simulator_config, + ), + ) + } + if isinstance(dataset, types.EvaluationDataset): _evals_utils._validate_dataset_agent_data(dataset, inference_configs) resolved_dataset = _evals_common._resolve_dataset(