Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
),
)
INFERENCE_CONFIG = types.EvaluationRunInferenceConfig(
model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
model="projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
)
TOOL = genai_types.Tool(
function_declarations=[
Expand All @@ -82,8 +82,14 @@
AGENT_INFO = types.evals.AgentInfo(
agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
name="agent-1",
instruction="agent-1 instruction",
tool_declarations=[TOOL],
agents={
"agent-1": types.evals.AgentConfig(
agent_id="agent-1",
instruction="agent-1 instruction",
tools=[TOOL],
)
},
root_agent_id="agent-1",
)
DEFAULT_PROMPT_TEMPLATE = "{prompt}"
INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
Expand All @@ -96,9 +102,9 @@
}
)
CANDIDATE_NAME = "candidate_1"
MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
MODEL_NAME = "projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
EVAL_SET_NAME = (
"projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
"projects/977012026409/locations/us-central1/evaluationSets/6619939608513740800"
)


Expand Down Expand Up @@ -140,12 +146,7 @@ def test_create_eval_run_data_source_evaluation_set(client):
assert evaluation_run.inference_configs[
AGENT_INFO.name
] == types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction=genai_types.Content(
parts=[genai_types.Part(text="agent-1 instruction")]
),
tools=[TOOL],
)
agent_configs=AGENT_INFO.agents,
)
assert evaluation_run.labels == {
"vertex-ai-evaluation-agent-engine-id": "456",
Expand Down
163 changes: 24 additions & 139 deletions tests/unit/vertexai/genai/replays/test_get_evaluation_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
from google.genai import types as genai_types
import datetime
import pytest

Expand All @@ -25,13 +24,13 @@ def test_get_eval_run(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
evaluation_run_name = (
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
"projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
)
evaluation_run = client.evals.get_evaluation_run(
name=evaluation_run_name, include_evaluation_items=True
)
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
check_run_5133048044039700480_evaluation_item_results(
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
check_run_3940878372367761408_evaluation_item_results(
client, evaluation_run, evaluation_run_name
)

Expand All @@ -40,10 +39,10 @@ def test_get_eval_run_include_evaluation_items_false(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
evaluation_run_name = (
"projects/503583131166/locations/us-central1/evaluationRuns/5133048044039700480"
"projects/977012026409/locations/us-central1/evaluationRuns/3940878372367761408"
)
evaluation_run = client.evals.get_evaluation_run(name=evaluation_run_name)
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
assert evaluation_run.evaluation_item_results is None


Expand Down Expand Up @@ -103,172 +102,58 @@ def test_get_eval_run_eval_set_source(client):
async def test_get_eval_run_async(client):
"""Tests that get_evaluation_run() returns a correctly structured EvaluationRun."""
client._api_client._http_options.api_version = "v1beta1"
eval_run_id = "5133048044039700480"
eval_run_id = "3940878372367761408"
evaluation_run_name = (
f"projects/503583131166/locations/us-central1/evaluationRuns/{eval_run_id}"
f"projects/977012026409/locations/us-central1/evaluationRuns/{eval_run_id}"
)
evaluation_run = await client.aio.evals.get_evaluation_run(name=eval_run_id)
check_run_5133048044039700480(client, evaluation_run, evaluation_run_name)
check_run_3940878372367761408(client, evaluation_run, evaluation_run_name)
assert evaluation_run.evaluation_item_results is None


def check_run_5133048044039700480(
def check_run_3940878372367761408(
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
assert isinstance(evaluation_run, types.EvaluationRun)
assert evaluation_run.name == evaluation_run_name
assert evaluation_run.display_name == "sdk-test-1"
assert evaluation_run.metadata == {"pipeline_id": "4868043098678099968"}
assert (
evaluation_run.display_name
== "evaluation_run_9a464a39-6d40-4d4e-a5e2-a4ceabea4b15"
)
assert evaluation_run.metadata == {"pipeline_id": "8162140658019074048"}
assert evaluation_run.create_time == datetime.datetime(
2025, 10, 21, 19, 25, 58, 669441, tzinfo=datetime.timezone.utc
2026, 3, 18, 1, 10, 13, 360535, tzinfo=datetime.timezone.utc
)
assert evaluation_run.completion_time == datetime.datetime(
2025, 10, 21, 19, 26, 15, 855568, tzinfo=datetime.timezone.utc
2026, 3, 18, 1, 11, 0, 448191, tzinfo=datetime.timezone.utc
)
assert evaluation_run.state == types.EvaluationRunState.SUCCEEDED
assert evaluation_run.evaluation_set_snapshot == (
"projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
"projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
)
assert (
evaluation_run.data_source.evaluation_set
== "projects/503583131166/locations/us-central1/evaluationSets/3122155626046685184"
== "projects/977012026409/locations/us-central1/evaluationSets/3991900109943078912"
)
assert evaluation_run.evaluation_run_results.evaluation_set == (
"projects/503583131166/locations/us-central1/evaluationSets/129513673658990592"
)
assert evaluation_run.inference_configs == {
"gemini-2.0-flash-001@default": types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction={
"parts": [{"text": "example agent developer instruction"}]
},
tools=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="check_chime",
description="Check chime.",
parameters={
"type": "OBJECT",
"properties": {
"nums": {
"type": "STRING",
"description": "List of numbers to be verified.",
}
},
"required": ["nums"],
},
),
],
)
],
)
),
}
assert evaluation_run.evaluation_run_results.summary_metrics == (
types.SummaryMetric(
metrics={
"gemini-2.0-flash-001@default/safety_v1/VARIANCE": 0.08950617055834077,
"gemini-2.0-flash-001@default/safety_v1/MAXIMUM": 1,
"gemini-2.0-flash-001@default/universal/AVERAGE": 0.7888888915379842,
"gemini-2.0-flash-001@default/universal/P90": 1,
"gemini-2.0-flash-001@default/safety_v1/MEDIAN": 1,
"gemini-2.0-flash-001@default/universal/P95": 1,
"gemini-2.0-flash-001@default/universal/VARIANCE": 0.08950617055834077,
"gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.2991758188061675,
"gemini-2.0-flash-001@default/universal/MEDIAN": 1,
"gemini-2.0-flash-001@default/safety_v1/STANDARD_DEVIATION": 0.2991758188061675,
"gemini-2.0-flash-001@default/universal/MODE": 1,
"gemini-2.0-flash-001@default/safety_v1/MODE": 1,
"gemini-2.0-flash-001@default/safety_v1/MINIMUM": 0.3333333432674408,
"gemini-2.0-flash-001@default/safety_v1/P90": 1,
"gemini-2.0-flash-001@default/safety_v1/P95": 1,
"gemini-2.0-flash-001@default/universal/P99": 1,
"gemini-2.0-flash-001@default/safety_v1/AVERAGE": 0.7888888915379842,
"gemini-2.0-flash-001@default/universal/MINIMUM": 0.3333333432674408,
"gemini-2.0-flash-001@default/universal/MAXIMUM": 1,
"gemini-2.0-flash-001@default/safety_v1/P99": 1,
},
total_items=3,
)
"projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
)
assert evaluation_run.evaluation_run_results.summary_metrics.total_items == 2
assert evaluation_run.error is None


def check_run_5133048044039700480_evaluation_item_results(
def check_run_3940878372367761408_evaluation_item_results(
client, evaluation_run: types.EvaluationRun, evaluation_run_name: str
):
eval_result = evaluation_run.evaluation_item_results
assert isinstance(eval_result, types.EvaluationResult)
assert eval_result.summary_metrics == [
types.AggregatedMetricResult(
metric_name="safety_v1",
mean_score=0.7888888915379842,
stdev_score=0.2991758188061675,
),
types.AggregatedMetricResult(
metric_name="universal",
mean_score=0.7888888915379842,
stdev_score=0.2991758188061675,
metric_name="general_quality_v1",
mean_score=0.13333333656191826,
stdev_score=0.03333333507180214,
),
]
# Check the agent info.
assert eval_result.agent_info == types.evals.AgentInfo(
name="gemini-2.0-flash-001@default",
instruction="example agent developer instruction",
description=None,
tool_declarations=[
genai_types.Tool(
function_declarations=[
genai_types.FunctionDeclaration(
name="check_chime",
description="Check chime.",
parameters={
"type": "OBJECT",
"properties": {
"nums": {
"type": "STRING",
"description": "List of numbers to be verified.",
}
},
"required": ["nums"],
},
),
],
)
],
)
# Check the first eval case result.
eval_case_result = eval_result.eval_case_results[0]
assert isinstance(eval_case_result, types.EvalCaseResult)
# Check the response candidate results.
response_candidate_result = eval_case_result.response_candidate_results[0]
assert response_candidate_result.response_index == 0
universal_metric_result = response_candidate_result.metric_results["universal"]
assert isinstance(universal_metric_result, types.EvalCaseMetricResult)
assert universal_metric_result.metric_name == "universal"
assert universal_metric_result.score > 0
assert universal_metric_result.explanation is None
# Check the first rubric verdict.
rubric_verdict_0 = universal_metric_result.rubric_verdicts[0]
assert isinstance(rubric_verdict_0, types.evals.RubricVerdict)
assert rubric_verdict_0.evaluated_rubric == types.evals.Rubric(
content=types.evals.RubricContent(
property=types.evals.RubricContentProperty(
description="The response is in English."
)
),
importance="HIGH",
type="LANGUAGE:PRIMARY_RESPONSE_LANGUAGE",
)
assert rubric_verdict_0.reasoning is not None
assert rubric_verdict_0.verdict is True
# Check the first evaluation dataset.
eval_dataset = eval_result.evaluation_dataset[0]
assert isinstance(eval_dataset, types.EvaluationDataset)
assert eval_dataset.candidate_name == "gemini-2.0-flash-001@default"
assert eval_dataset.eval_dataset_df.shape[0] == 3
assert eval_dataset.eval_dataset_df.shape[1] > 3


pytestmark = pytest_helper.setup(
Expand Down
Loading
Loading