From 1c50bab2742c21d66f7c7f0147b7f4320d415855 Mon Sep 17 00:00:00 2001
From: Jason Dai <jsndai@google.com>
Date: Thu, 19 Mar 2026 15:37:02 -0700
Subject: [PATCH] chore: GenAI Client(evals) - simplify create eval run
 evaluation interface for user simulation

PiperOrigin-RevId: 886409456
---
 .../replays/test_create_evaluation_run.py     | 105 ++++++++++++++-
 vertexai/_genai/_evals_common.py              | 121 +++++++++++++++---
 vertexai/_genai/_evals_data_converters.py     |  24 ++--
 vertexai/_genai/evals.py                      |  64 ++++++++-
 4 files changed, 278 insertions(+), 36 deletions(-)

diff --git a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
index 33521f0c8f..e39953eb39 100644
--- a/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
+++ b/tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
@@ -17,8 +17,8 @@
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
 from google.genai import types as genai_types
-import pytest
 import pandas as pd
+import pytest
 
 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -42,7 +42,7 @@
     metric_config=types.UnifiedMetric(
         llm_based_metric_spec=genai_types.LLMBasedMetricSpec(
             metric_prompt_template=(
-                "\nEvaluate the fluency of the response. Provide a score from 1-5."
+                "\nEvaluate the fluency of the response. Provide a score from" " 1-5."
             )
         )
     ),
@@ -80,7 +80,7 @@
     ]
 )
 AGENT_INFO = types.evals.AgentInfo(
-    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
+    agent_resource_name=("projects/123/locations/us-central1/reasoningEngines/456"),
     name="agent-1",
     agents={
         "agent-1": types.evals.AgentConfig(
@@ -147,6 +147,10 @@ def test_create_eval_run_data_source_evaluation_set(client):
         AGENT_INFO.name
     ] == types.EvaluationRunInferenceConfig(
         agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config={"max_turn": 5},
+        ),
     )
     assert evaluation_run.labels == {
         "vertex-ai-evaluation-agent-engine-id": "456",
@@ -203,6 +207,53 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None
 
 
+def test_create_eval_run_with_user_simulator_config(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun when agent_info and user_simulator_config are given without inference_configs."""
+    client._api_client._http_options.api_version = "v1beta1"
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test_user_simulator_config",
+        display_name="test_user_simulator_config",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=AGENT_INFO,  # No inference_configs is passed alongside it.
+        user_simulator_config=types.evals.UserSimulatorConfig(
+            max_turn=5,
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_user_simulator_config"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert (
+        evaluation_run.data_source.evaluation_set
+        == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        AGENT_INFO.name  # The expected config is keyed by the agent name.
+    ] == types.EvaluationRunInferenceConfig(
+        agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
+        ),
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 def test_create_eval_run_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
     client._api_client._http_options.api_version = "v1beta1"
@@ -669,6 +720,54 @@ async def test_create_eval_run_async(client):
     assert evaluation_run.error is None
 
 
+@pytest.mark.asyncio
+async def test_create_eval_run_async_with_user_simulator_config(client):
+    """Tests that the async create_evaluation_run() creates a correctly structured EvaluationRun when agent_info and user_simulator_config are given without inference_configs."""
+    client._api_client._http_options.api_version = "v1beta1"
+    evaluation_run = await client.aio.evals.create_evaluation_run(
+        name="test_user_simulator_config_async",
+        display_name="test_user_simulator_config_async",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=AGENT_INFO,  # No inference_configs is passed alongside it.
+        user_simulator_config=types.evals.UserSimulatorConfig(
+            max_turn=5,
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_user_simulator_config_async"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert (
+        evaluation_run.data_source.evaluation_set
+        == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        AGENT_INFO.name  # The expected config is keyed by the agent name.
+    ] == types.EvaluationRunInferenceConfig(
+        agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
+        ),
+    )
+    assert evaluation_run.labels == {
+        "label1": "value1",
+        "vertex-ai-evaluation-agent-engine-id": "456",
+    }
+    assert evaluation_run.error is None
+
+
 @pytest.mark.asyncio
 async def test_create_eval_run_async_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index c3a488eb06..f2ca33648c 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -283,15 +283,66 @@ def _resolve_dataset(
     api_client: BaseApiClient,
     dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
     dest: str,
-    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+    parsed_agent_info: Optional[types.evals.AgentInfo] = None,
 ) -> types.EvaluationRunDataSource:
     """Resolves dataset for the evaluation run."""
     if isinstance(dataset, types.EvaluationDataset):
-        candidate_name = _get_candidate_name(dataset, agent_info_pydantic)
+        candidate_name = _get_candidate_name(dataset, parsed_agent_info)
+        eval_df = dataset.eval_dataset_df
+        if eval_df is None and dataset.eval_cases:
+            rows = []
+            for case in dataset.eval_cases:
+                row: dict[str, Any] = {}
+                if case.prompt:
+                    row[_evals_constant.PROMPT] = (
+                        _evals_data_converters._get_content_text(case.prompt)
+                    )
+
+                if (
+                    case.responses
+                    and len(case.responses) > 0
+                    and case.responses[0].response
+                ):
+                    row[_evals_constant.RESPONSE] = (
+                        _evals_data_converters._get_content_text(
+                            case.responses[0].response
+                        )
+                    )
+
+                if case.reference and case.reference.response:
+                    row[_evals_constant.REFERENCE] = (
+                        _evals_data_converters._get_content_text(
+                            case.reference.response
+                        )
+                    )
+
+                if case.agent_data:
+                    row[AGENT_DATA] = case.agent_data
+
+                if case.intermediate_events:
+                    row[_evals_constant.INTERMEDIATE_EVENTS] = [
+                        {CONTENT: event.content}
+                        for event in case.intermediate_events
+                        if event.content
+                    ]
+
+                if case.user_scenario:
+                    if case.user_scenario.starting_prompt:
+                        row[_evals_constant.STARTING_PROMPT] = (
+                            case.user_scenario.starting_prompt
+                        )
+                    if case.user_scenario.conversation_plan:
+                        row[_evals_constant.CONVERSATION_PLAN] = (
+                            case.user_scenario.conversation_plan
+                        )
+
+                rows.append(row)
+            eval_df = pd.DataFrame(rows)
+
         eval_set = _create_evaluation_set_from_dataframe(
             api_client,
             dest,
-            dataset.eval_dataset_df,
+            eval_df,
             candidate_name,
         )
         dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name)
@@ -339,15 +390,34 @@ def _resolve_inference_configs(
     inference_configs: Optional[
         dict[str, types.EvaluationRunInferenceConfigOrDict]
     ] = None,
-    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+    parsed_agent_info: Optional[types.evals.AgentInfo] = None,
 ) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]:
     """Resolves inference configs for the evaluation run."""
     # Resolve agent config
-    if agent_info_pydantic and agent_info_pydantic.name:
-        inference_configs = {}
-        inference_configs[agent_info_pydantic.name] = (
-            types.EvaluationRunInferenceConfig(agent_configs=agent_info_pydantic.agents)
-        )
+    if parsed_agent_info and parsed_agent_info.name:
+        if inference_configs is None:
+            inference_configs = {}
+
+        # A config stored under the placeholder key "candidate-1" is re-keyed to
+        # the agent name; if no entry exists for the agent, one is created below.
+        if "candidate-1" in inference_configs:
+            inference_configs[parsed_agent_info.name] = inference_configs.pop(
+                "candidate-1"
+            )
+
+        if parsed_agent_info.name not in inference_configs:
+            inference_configs[parsed_agent_info.name] = (
+                types.EvaluationRunInferenceConfig(
+                    agent_configs=parsed_agent_info.agents
+                )
+            )
+        else:  # An entry already exists: overwrite only its agent_configs.
+            config = inference_configs[parsed_agent_info.name]
+            if isinstance(config, dict):
+                config["agent_configs"] = parsed_agent_info.agents
+            else:
+                config.agent_configs = parsed_agent_info.agents
+
     # Resolve prompt template data
     if inference_configs:
         for inference_config in inference_configs.values():
@@ -381,33 +451,33 @@ def _resolve_inference_configs(
 
 def _add_evaluation_run_labels(
     labels: Optional[dict[str, str]] = None,
-    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+    parsed_agent_info: Optional[types.evals.AgentInfo] = None,
 ) -> Optional[dict[str, str]]:
     """Adds labels to the evaluation run."""
-    if agent_info_pydantic and agent_info_pydantic.agent_resource_name:
+    if parsed_agent_info and parsed_agent_info.agent_resource_name:
         labels = labels or {}
         labels["vertex-ai-evaluation-agent-engine-id"] = (
-            agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[-1]
+            parsed_agent_info.agent_resource_name.split("reasoningEngines/")[-1]
         )
     return labels
 
 
 def _get_candidate_name(
     dataset: types.EvaluationDataset,
-    agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
+    parsed_agent_info: Optional[types.evals.AgentInfo] = None,
 ) -> Optional[str]:
     """Internal helper to get candidate name."""
-    if agent_info_pydantic is not None and (
+    if parsed_agent_info is not None and (
         dataset.candidate_name
-        and agent_info_pydantic
-        and agent_info_pydantic.name
-        and dataset.candidate_name != agent_info_pydantic.name
+        and parsed_agent_info
+        and parsed_agent_info.name
+        and dataset.candidate_name != parsed_agent_info.name
     ):
         logger.warning(
             "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
         )
-    elif dataset.candidate_name is None and agent_info_pydantic:
-        return agent_info_pydantic.name
+    elif dataset.candidate_name is None and parsed_agent_info:
+        return parsed_agent_info.name
     return dataset.candidate_name or None
 
 
@@ -2406,10 +2476,21 @@ def _create_evaluation_set_from_dataframe(
 
         candidate_responses = []
         if _evals_constant.RESPONSE in row or agent_data_obj or intermediate_events:
+            # Resolve the oneof conflict: prioritize agent_data over flat text
+            response_text = row.get(_evals_constant.RESPONSE) or None
+
+            if agent_data_obj and response_text:
+                logger.info(
+                    "Both 'response' and 'agent_data' columns found in the evaluation dataset. "
+                    "Prioritizing 'agent_data' and omitting 'response' text to satisfy "
+                    "CandidateResponse protobuf oneof constraints."
+                )
+                response_text = None
+
             candidate_responses.append(
                 types.CandidateResponse(
                     candidate=candidate_name or "Candidate 1",
-                    text=row.get(_evals_constant.RESPONSE) or None,
+                    text=response_text,
                     events=intermediate_events or None,
                     agent_data=agent_data_obj,
                 )
diff --git a/vertexai/_genai/_evals_data_converters.py b/vertexai/_genai/_evals_data_converters.py
index 33f18b1519..c194c3a9d5 100644
--- a/vertexai/_genai/_evals_data_converters.py
+++ b/vertexai/_genai/_evals_data_converters.py
@@ -672,18 +672,24 @@ def get_dataset_converter(
         raise ValueError("Unsupported dataset schema: %s" % dataset_schema)
 
 
-def _get_first_part_text(content: genai_types.Content) -> str:
-    """Safely extracts text from the first part of a content."""
+def _get_content_text(content: genai_types.Content) -> str:
+    """Returns the concatenated text of every text part in `content`.
+
+    Parts are joined in order with no separator. A part that has no `text`
+    attribute, or whose `text` is None, is skipped. An empty string is
+    returned when `content` is falsy, lacks a non-empty `parts` list, or
+    holds no text parts.
+    """
     if (
         content
         and hasattr(content, "parts")
         and isinstance(content.parts, list)
         and content.parts
     ):
-        first_part = content.parts[0]
-        if hasattr(first_part, "text"):
-            return str(first_part.text)
-    return ""
+        return "".join(
+            str(p.text) for p in content.parts if getattr(p, "text", None) is not None
+        )
+    return ""
 
 
 def _get_text_from_reference(
@@ -691,7 +697,7 @@ def _get_text_from_reference(
 ) -> Optional[str]:
     """Safely extracts text from a reference field."""
     if reference and hasattr(reference, "response") and reference.response:
-        return _get_first_part_text(reference.response)
+        return _get_content_text(reference.response)
     return None
 
 
@@ -703,8 +709,8 @@ def _validate_case_consistency(
 ) -> None:
     """Logs warnings if prompt or reference mismatches occur."""
     if base_case.prompt != current_case.prompt:
-        base_prompt_text_preview = _get_first_part_text(base_case.prompt)[:50]
-        current_prompt_text_preview = _get_first_part_text(current_case.prompt)[:50]
+        base_prompt_text_preview = _get_content_text(base_case.prompt)[:50]
+        current_prompt_text_preview = _get_content_text(current_case.prompt)[:50]
         logger.warning(
             "Prompt mismatch for case index %d between base dataset (0)"
             " and dataset %d. Using prompt from base. Base prompt"
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index 0881bd571e..6c80482227 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -2099,6 +2099,7 @@ def create_evaluation_run(
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[evals_types.AgentInfoOrDict] = None,
+        user_simulator_config: Optional[evals_types.UserSimulatorConfigOrDict] = None,
         inference_configs: Optional[
             dict[str, types.EvaluationRunInferenceConfigOrDict]
         ] = None,
@@ -2113,10 +2114,16 @@ def create_evaluation_run(
           metrics: The list of metrics to evaluate.
           name: The name of the evaluation run.
           display_name: The display name of the evaluation run.
-          agent_info: The agent info to evaluate.
+          agent_info: The agent info to evaluate. Mutually exclusive with
+              `inference_configs`.
+          user_simulator_config: The user simulator configuration for agent evaluation.
+              If `agent_info` is provided without `inference_configs`, this config is used
+              to automatically construct the inference configuration. If not specified,
+              or if `max_turn` is not set, `max_turn` defaults to 5.
           inference_configs: The candidate to inference config map for the evaluation run.
               The key is the candidate name, and the value is the inference config.
-              If provided, agent_info must be None.
+              If provided, `agent_info` must be None. If omitted and `agent_info` is provided,
+              this will be automatically constructed using `agent_info` and `user_simulator_config`.
               Example:
               {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")}
           labels: The labels to apply to the evaluation run.
@@ -2134,6 +2141,27 @@ def create_evaluation_run(
             if isinstance(agent_info, dict)
             else (agent_info or evals_types.AgentInfo())
         )
+
+        if agent_info and not inference_configs:
+            parsed_user_simulator_config = (
+                evals_types.UserSimulatorConfig.model_validate(user_simulator_config)
+                if isinstance(user_simulator_config, dict)
+                else (user_simulator_config or evals_types.UserSimulatorConfig())
+            ).model_copy()  # Copy so the caller's object is never mutated below.
+            if getattr(parsed_user_simulator_config, "max_turn", None) is None:
+                parsed_user_simulator_config.max_turn = 5
+            # Key by agent name; fall back to "candidate-1" for an unnamed agent.
+            candidate_name = parsed_agent_info.name or "candidate-1"
+            inference_configs = {
+                candidate_name: types.EvaluationRunInferenceConfig(
+                    agent_configs=parsed_agent_info.agents,
+                    agent_run_config=types.AgentRunConfig(
+                        agent_engine=parsed_agent_info.agent_resource_name,
+                        user_simulator_config=parsed_user_simulator_config,
+                    ),
+                )
+            }
+
         if isinstance(dataset, types.EvaluationDataset):
             _evals_utils._validate_dataset_agent_data(dataset, inference_configs)
         resolved_dataset = _evals_common._resolve_dataset(
@@ -3277,6 +3305,7 @@ async def create_evaluation_run(
         name: Optional[str] = None,
         display_name: Optional[str] = None,
         agent_info: Optional[evals_types.AgentInfo] = None,
+        user_simulator_config: Optional[evals_types.UserSimulatorConfigOrDict] = None,
         inference_configs: Optional[
             dict[str, types.EvaluationRunInferenceConfigOrDict]
         ] = None,
@@ -3291,10 +3320,16 @@ async def create_evaluation_run(
           metrics: The list of metrics to evaluate.
           name: The name of the evaluation run.
           display_name: The display name of the evaluation run.
-          agent_info: The agent info to evaluate.
+          agent_info: The agent info to evaluate. Mutually exclusive with
+              `inference_configs`.
+          user_simulator_config: The user simulator configuration for agent evaluation.
+              If `agent_info` is provided without `inference_configs`, this config is used
+              to automatically construct the inference configuration. If not specified,
+              or if `max_turn` is not set, `max_turn` defaults to 5.
           inference_configs: The candidate to inference config map for the evaluation run.
               The key is the candidate name, and the value is the inference config.
-              If provided, agent_info must be None.
+              If provided, `agent_info` must be None. If omitted and `agent_info` is provided,
+              this will be automatically constructed using `agent_info` and `user_simulator_config`.
               Example:
               {"candidate-1": types.EvaluationRunInferenceConfig(model="gemini-2.5-flash")}
           labels: The labels to apply to the evaluation run.
@@ -3312,6 +3347,27 @@ async def create_evaluation_run(
             if isinstance(agent_info, dict)
             else (agent_info or evals_types.AgentInfo())
         )
+
+        if agent_info and not inference_configs:
+            parsed_user_simulator_config = (
+                evals_types.UserSimulatorConfig.model_validate(user_simulator_config)
+                if isinstance(user_simulator_config, dict)
+                else (user_simulator_config or evals_types.UserSimulatorConfig())
+            ).model_copy()  # Copy so the caller's object is never mutated below.
+            if getattr(parsed_user_simulator_config, "max_turn", None) is None:
+                parsed_user_simulator_config.max_turn = 5
+            # Key by agent name; fall back to "candidate-1" for an unnamed agent.
+            candidate_name = parsed_agent_info.name or "candidate-1"
+            inference_configs = {
+                candidate_name: types.EvaluationRunInferenceConfig(
+                    agent_configs=parsed_agent_info.agents,
+                    agent_run_config=types.AgentRunConfig(
+                        agent_engine=parsed_agent_info.agent_resource_name,
+                        user_simulator_config=parsed_user_simulator_config,
+                    ),
+                )
+            }
+
         if isinstance(dataset, types.EvaluationDataset):
             _evals_utils._validate_dataset_agent_data(dataset, inference_configs)
         resolved_dataset = _evals_common._resolve_dataset(