Skip to content

Commit f6f903d

Browse files
vertex-sdk-botcopybara-github
authored and committed
feat: GenAI Client(evals) - add rich HTML visualization for loss pattern analysis
PiperOrigin-RevId: 894799725
1 parent 09794ba commit f6f903d

File tree

6 files changed

+986
-1
lines changed

6 files changed

+986
-1
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# pylint: disable=protected-access,bad-continuation,missing-function-docstring
16+
17+
from tests.unit.vertexai.genai.replays import pytest_helper
18+
from vertexai import types
19+
import pytest
20+
21+
22+
def test_gen_loss_clusters(client):
    """Tests that generate_loss_clusters() returns GenerateLossClustersResponse."""
    loss_config = types.LossAnalysisConfig(
        metric="multi_turn_task_success_v1",
        candidate="travel-agent",
    )
    response = client.evals.generate_loss_clusters(
        eval_result=types.EvaluationResult(),
        config=loss_config,
    )

    assert isinstance(response, types.GenerateLossClustersResponse)
    assert len(response.results) == 1

    result = response.results[0]
    # The echoed config must match what was requested.
    assert result.config.metric == "multi_turn_task_success_v1"
    assert result.config.candidate == "travel-agent"

    assert len(result.clusters) == 2
    first_cluster = result.clusters[0]
    second_cluster = result.clusters[1]

    assert first_cluster.cluster_id == "cluster-1"
    assert first_cluster.taxonomy_entry.l1_category == "Tool Calling"
    assert first_cluster.taxonomy_entry.l2_category == "Missing Tool Invocation"
    assert first_cluster.item_count == 3

    assert second_cluster.cluster_id == "cluster-2"
    assert second_cluster.taxonomy_entry.l1_category == "Hallucination"
    assert second_cluster.item_count == 2
47+
48+
49+
# Load the pytest-asyncio plugin so the @pytest.mark.asyncio test below runs.
pytest_plugins = ("pytest_asyncio",)
50+
51+
52+
@pytest.mark.asyncio
async def test_gen_loss_clusters_async(client):
    """Tests that generate_loss_clusters() async returns GenerateLossClustersResponse."""
    loss_config = types.LossAnalysisConfig(
        metric="multi_turn_task_success_v1",
        candidate="travel-agent",
    )
    response = await client.aio.evals.generate_loss_clusters(
        eval_result=types.EvaluationResult(),
        config=loss_config,
    )

    assert isinstance(response, types.GenerateLossClustersResponse)
    assert len(response.results) == 1

    result = response.results[0]
    assert result.config.metric == "multi_turn_task_success_v1"

    assert len(result.clusters) == 2
    cluster_ids = [cluster.cluster_id for cluster in result.clusters]
    assert cluster_ids == ["cluster-1", "cluster-2"]
70+
71+
72+
# Register this module with the shared replay test harness; presumably the
# harness uses `test_method` to route recorded/replayed API interactions for
# "evals.generate_loss_clusters" — confirm against pytest_helper.setup.
pytestmark = pytest_helper.setup(
    file=__file__,
    globals_for_file=globals(),
    test_method="evals.generate_loss_clusters",
)

tests/unit/vertexai/genai/test_evals.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from google.cloud.aiplatform import initializer as aiplatform_initializer
3030
from vertexai import _genai
3131
from vertexai._genai import _evals_data_converters
32+
from vertexai._genai import _evals_utils
3233
from vertexai._genai import _evals_metric_handlers
3334
from vertexai._genai import _evals_visualization
3435
from vertexai._genai import _evals_metric_loaders
@@ -265,6 +266,304 @@ def test_t_inline_results(self):
265266
assert payload[0]["candidate_results"][0]["score"] == 0.0
266267

267268

269+
class TestLossAnalysis:
    """Unit tests for loss analysis types and visualization."""

    def test_response_structure(self):
        """Builds a fully-populated GenerateLossClustersResponse and checks field round-trips."""
        response = common_types.GenerateLossClustersResponse(
            analysis_time="2026-04-01T10:00:00Z",
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="multi_turn_task_success_v1",
                        candidate="travel-agent",
                    ),
                    analysis_time="2026-04-01T10:00:00Z",
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="cluster-1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Tool Calling",
                                l2_category="Missing Tool Invocation",
                                description="The agent failed to invoke a required tool.",
                            ),
                            item_count=3,
                        ),
                        common_types.LossCluster(
                            cluster_id="cluster-2",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Hallucination",
                                l2_category="Hallucination of Action",
                                description="Verbally confirmed action without tool.",
                            ),
                            item_count=2,
                        ),
                    ],
                )
            ],
        )
        assert len(response.results) == 1
        assert response.analysis_time == "2026-04-01T10:00:00Z"
        result = response.results[0]
        assert result.config.metric == "multi_turn_task_success_v1"
        assert len(result.clusters) == 2
        assert result.clusters[0].cluster_id == "cluster-1"
        assert result.clusters[0].item_count == 3
        assert result.clusters[1].cluster_id == "cluster-2"

    def test_get_loss_analysis_html(self):
        """Tests that _get_loss_analysis_html generates valid HTML with data."""
        from vertexai._genai import _evals_visualization
        import json

        # Dict mirrors the JSON shape the visualization template consumes:
        # results -> clusters -> taxonomy_entry / examples -> failed_rubrics.
        data = {
            "results": [
                {
                    "config": {
                        "metric": "test_metric",
                        "candidate": "test-candidate",
                    },
                    "clusters": [
                        {
                            "cluster_id": "c1",
                            "taxonomy_entry": {
                                "l1_category": "Tool Calling",
                                "l2_category": "Missing Invocation",
                                "description": "Agent failed to call the tool.",
                            },
                            "item_count": 5,
                            "examples": [
                                {
                                    "failed_rubrics": [
                                        {
                                            "rubric_id": "tool_use",
                                            "classification_rationale": "Did not invoke find_flights.",
                                        }
                                    ]
                                }
                            ],
                        },
                    ],
                }
            ]
        }
        html = _evals_visualization._get_loss_analysis_html(json.dumps(data))
        assert "Loss Pattern Analysis" in html
        assert "test_metric" not in html  # data is Base64-encoded in the HTML
        assert "<!DOCTYPE html>" in html

    def test_display_loss_clusters_response_no_ipython(self):
        """Tests graceful fallback when not in IPython."""
        from vertexai._genai import _evals_visualization
        from unittest import mock

        response = common_types.GenerateLossClustersResponse(
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="test_metric",
                        candidate="test-candidate",
                    ),
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="c1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Cat1",
                                l2_category="SubCat1",
                            ),
                            item_count=5,
                        ),
                    ],
                )
            ],
        )
        with mock.patch.object(
            _evals_visualization, "_is_ipython_env", return_value=False
        ):
            # Should not raise, just log a warning
            response.show()

    def test_display_loss_analysis_result_no_ipython(self):
        """Tests graceful fallback for individual result when not in IPython."""
        from vertexai._genai import _evals_visualization
        from unittest import mock

        result = common_types.LossAnalysisResult(
            config=common_types.LossAnalysisConfig(
                metric="test_metric",
                candidate="test-candidate",
            ),
            clusters=[
                common_types.LossCluster(
                    cluster_id="c1",
                    taxonomy_entry=common_types.LossTaxonomyEntry(
                        l1_category="DirectCat",
                        l2_category="DirectSubCat",
                    ),
                    item_count=7,
                ),
            ],
        )
        with mock.patch.object(
            _evals_visualization, "_is_ipython_env", return_value=False
        ):
            # show() on a single result must also degrade gracefully outside IPython.
            result.show()
411+
412+
413+
def _make_eval_result(
    metrics=None,
    candidate_names=None,
):
    """Helper to create an EvaluationResult with the given metrics and candidates."""
    # Falsy arguments (None or empty) fall back to a single default value.
    metrics = metrics or ["task_success_v1"]
    candidate_names = candidate_names or ["agent-1"]

    # One EvalCaseMetricResult per requested metric, keyed by metric name.
    per_metric_results = {
        name: common_types.EvalCaseMetricResult(metric_name=name) for name in metrics
    }

    case_results = [
        common_types.EvalCaseResult(
            eval_case_index=0,
            response_candidate_results=[
                common_types.ResponseCandidateResult(
                    response_index=0,
                    metric_results=per_metric_results,
                )
            ],
        )
    ]
    return common_types.EvaluationResult(
        eval_case_results=case_results,
        metadata=common_types.EvaluationRunMetadata(
            candidate_names=candidate_names,
        ),
    )
443+
444+
445+
class TestResolveMetricName:
    """Unit tests for _resolve_metric_name."""

    def test_none_returns_none(self):
        """None input resolves to None."""
        resolved = _evals_utils._resolve_metric_name(None)
        assert resolved is None

    def test_string_passes_through(self):
        """A plain string name is returned unchanged."""
        resolved = _evals_utils._resolve_metric_name("task_success_v1")
        assert resolved == "task_success_v1"

    def test_metric_object_extracts_name(self):
        """A Metric object resolves to its name field."""
        metric_obj = common_types.Metric(name="multi_turn_task_success_v1")
        resolved = _evals_utils._resolve_metric_name(metric_obj)
        assert resolved == "multi_turn_task_success_v1"

    def test_object_with_name_attr(self):
        """Tests that any object with a .name attribute works (e.g., LazyLoadedPrebuiltMetric)."""

        class _DuckTypedMetric:
            name = "tool_use_quality_v1"

        resolved = _evals_utils._resolve_metric_name(_DuckTypedMetric())
        assert resolved == "tool_use_quality_v1"
468+
469+
470+
class TestResolveLossAnalysisConfig:
    """Unit tests for _resolve_loss_analysis_config."""

    def test_auto_infer_single_metric_and_candidate(self):
        """With exactly one metric and one candidate, both are inferred automatically."""
        eval_result = _make_eval_result(
            metrics=["task_success_v1"], candidate_names=["agent-1"]
        )
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=eval_result
        )
        assert resolved.metric == "task_success_v1"
        assert resolved.candidate == "agent-1"

    def test_explicit_metric_and_candidate(self):
        """Explicit metric/candidate keyword args select among multiple options."""
        eval_result = _make_eval_result(
            metrics=["m1", "m2"], candidate_names=["c1", "c2"]
        )
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=eval_result, metric="m1", candidate="c2"
        )
        assert resolved.metric == "m1"
        assert resolved.candidate == "c2"

    def test_config_provides_metric_and_candidate(self):
        """A LossAnalysisConfig object supplies metric, candidate, and extra fields."""
        eval_result = _make_eval_result(
            metrics=["m1"], candidate_names=["c1"]
        )
        config = common_types.LossAnalysisConfig(
            metric="m1", candidate="c1", predefined_taxonomy="my_taxonomy"
        )
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=eval_result, config=config
        )
        assert resolved.metric == "m1"
        assert resolved.candidate == "c1"
        # Non-resolved config fields (taxonomy) are carried through untouched.
        assert resolved.predefined_taxonomy == "my_taxonomy"

    def test_explicit_args_override_config(self):
        """Keyword args take precedence over values set in the config object."""
        eval_result = _make_eval_result(
            metrics=["m1", "m2"], candidate_names=["c1", "c2"]
        )
        config = common_types.LossAnalysisConfig(metric="m1", candidate="c1")
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=eval_result, config=config, metric="m2", candidate="c2"
        )
        assert resolved.metric == "m2"
        assert resolved.candidate == "c2"

    def test_error_multiple_metrics_no_explicit(self):
        """Ambiguous metric (several present, none chosen) raises ValueError."""
        eval_result = _make_eval_result(
            metrics=["m1", "m2"], candidate_names=["c1"]
        )
        with pytest.raises(ValueError, match="multiple metrics"):
            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)

    def test_error_multiple_candidates_no_explicit(self):
        """Ambiguous candidate (several present, none chosen) raises ValueError."""
        eval_result = _make_eval_result(
            metrics=["m1"], candidate_names=["c1", "c2"]
        )
        with pytest.raises(ValueError, match="multiple candidates"):
            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)

    def test_error_invalid_metric(self):
        """A metric absent from the eval result raises ValueError."""
        eval_result = _make_eval_result(
            metrics=["m1"], candidate_names=["c1"]
        )
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=eval_result, metric="nonexistent"
            )

    def test_error_invalid_candidate(self):
        """A candidate absent from the eval result raises ValueError."""
        eval_result = _make_eval_result(
            metrics=["m1"], candidate_names=["c1"]
        )
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=eval_result, candidate="nonexistent"
            )

    def test_no_candidates_defaults_to_candidate_1(self):
        """When metadata lists no candidates, the default name "candidate_1" is used."""
        eval_result = _make_eval_result(metrics=["m1"], candidate_names=[])
        # Replace metadata with an empty one so no candidate names are present.
        eval_result = eval_result.model_copy(
            update={"metadata": common_types.EvaluationRunMetadata()}
        )
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=eval_result
        )
        assert resolved.metric == "m1"
        assert resolved.candidate == "candidate_1"

    def test_no_eval_case_results_raises(self):
        """An EvaluationResult with no case results raises ValueError."""
        eval_result = common_types.EvaluationResult()
        with pytest.raises(ValueError, match="no metric results"):
            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)
565+
566+
268567
class TestEvals:
269568
"""Unit tests for the GenAI client."""
270569

0 commit comments

Comments
 (0)