|
29 | 29 | from google.cloud.aiplatform import initializer as aiplatform_initializer |
30 | 30 | from vertexai import _genai |
31 | 31 | from vertexai._genai import _evals_data_converters |
| 32 | +from vertexai._genai import _evals_utils |
32 | 33 | from vertexai._genai import _evals_metric_handlers |
33 | 34 | from vertexai._genai import _evals_visualization |
34 | 35 | from vertexai._genai import _evals_metric_loaders |
@@ -265,6 +266,304 @@ def test_t_inline_results(self): |
265 | 266 | assert payload[0]["candidate_results"][0]["score"] == 0.0 |
266 | 267 |
|
267 | 268 |
|
class TestLossAnalysis:
    """Unit tests for loss analysis types and visualization."""

    def test_response_structure(self):
        """Tests that the nested loss-analysis response types round-trip."""
        response = common_types.GenerateLossClustersResponse(
            analysis_time="2026-04-01T10:00:00Z",
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="multi_turn_task_success_v1",
                        candidate="travel-agent",
                    ),
                    analysis_time="2026-04-01T10:00:00Z",
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="cluster-1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Tool Calling",
                                l2_category="Missing Tool Invocation",
                                description="The agent failed to invoke a required tool.",
                            ),
                            item_count=3,
                        ),
                        common_types.LossCluster(
                            cluster_id="cluster-2",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Hallucination",
                                l2_category="Hallucination of Action",
                                description="Verbally confirmed action without tool.",
                            ),
                            item_count=2,
                        ),
                    ],
                )
            ],
        )
        assert len(response.results) == 1
        assert response.analysis_time == "2026-04-01T10:00:00Z"
        result = response.results[0]
        assert result.config.metric == "multi_turn_task_success_v1"
        assert len(result.clusters) == 2
        assert result.clusters[0].cluster_id == "cluster-1"
        assert result.clusters[0].item_count == 3
        assert result.clusters[1].cluster_id == "cluster-2"

    def test_get_loss_analysis_html(self):
        """Tests that _get_loss_analysis_html generates valid HTML with data."""
        # NOTE: _evals_visualization is already imported at module level, so
        # the previous method-local re-import was redundant and is removed.
        import json

        data = {
            "results": [
                {
                    "config": {
                        "metric": "test_metric",
                        "candidate": "test-candidate",
                    },
                    "clusters": [
                        {
                            "cluster_id": "c1",
                            "taxonomy_entry": {
                                "l1_category": "Tool Calling",
                                "l2_category": "Missing Invocation",
                                "description": "Agent failed to call the tool.",
                            },
                            "item_count": 5,
                            "examples": [
                                {
                                    "failed_rubrics": [
                                        {
                                            "rubric_id": "tool_use",
                                            "classification_rationale": "Did not invoke find_flights.",
                                        }
                                    ]
                                }
                            ],
                        },
                    ],
                }
            ]
        }
        html = _evals_visualization._get_loss_analysis_html(json.dumps(data))
        assert "Loss Pattern Analysis" in html
        assert "test_metric" not in html  # data is Base64-encoded in the HTML
        assert "<!DOCTYPE html>" in html

    def test_display_loss_clusters_response_no_ipython(self):
        """Tests graceful fallback when not in IPython."""
        from unittest import mock

        response = common_types.GenerateLossClustersResponse(
            results=[
                common_types.LossAnalysisResult(
                    config=common_types.LossAnalysisConfig(
                        metric="test_metric",
                        candidate="test-candidate",
                    ),
                    clusters=[
                        common_types.LossCluster(
                            cluster_id="c1",
                            taxonomy_entry=common_types.LossTaxonomyEntry(
                                l1_category="Cat1",
                                l2_category="SubCat1",
                            ),
                            item_count=5,
                        ),
                    ],
                )
            ],
        )
        # Force the non-notebook code path; uses the module-level import of
        # _evals_visualization rather than a redundant local re-import.
        with mock.patch.object(
            _evals_visualization, "_is_ipython_env", return_value=False
        ):
            # Should not raise, just log a warning
            response.show()

    def test_display_loss_analysis_result_no_ipython(self):
        """Tests graceful fallback for individual result when not in IPython."""
        from unittest import mock

        result = common_types.LossAnalysisResult(
            config=common_types.LossAnalysisConfig(
                metric="test_metric",
                candidate="test-candidate",
            ),
            clusters=[
                common_types.LossCluster(
                    cluster_id="c1",
                    taxonomy_entry=common_types.LossTaxonomyEntry(
                        l1_category="DirectCat",
                        l2_category="DirectSubCat",
                    ),
                    item_count=7,
                ),
            ],
        )
        with mock.patch.object(
            _evals_visualization, "_is_ipython_env", return_value=False
        ):
            result.show()
| 412 | + |
def _make_eval_result(
    metrics=None,
    candidate_names=None,
):
    """Helper to create an EvaluationResult with the given metrics and candidates.

    Args:
        metrics: Metric names attached to the single synthetic eval case.
            Defaults to ["task_success_v1"] when None.
        candidate_names: Candidate names recorded in the run metadata.
            Defaults to ["agent-1"] when None. An explicit empty list is
            preserved rather than being replaced by the default.

    Returns:
        A common_types.EvaluationResult with one eval case containing one
        response candidate that carries the given metric results.
    """
    # Use `is None` instead of `or` so a deliberately-passed empty list is
    # respected (the falsy-`[]` pitfall previously forced callers to patch
    # the metadata afterwards via model_copy).
    if metrics is None:
        metrics = ["task_success_v1"]
    if candidate_names is None:
        candidate_names = ["agent-1"]

    metric_results = {
        m: common_types.EvalCaseMetricResult(metric_name=m) for m in metrics
    }

    eval_case_results = [
        common_types.EvalCaseResult(
            eval_case_index=0,
            response_candidate_results=[
                common_types.ResponseCandidateResult(
                    response_index=0,
                    metric_results=metric_results,
                )
            ],
        )
    ]
    metadata = common_types.EvaluationRunMetadata(
        candidate_names=candidate_names,
    )
    return common_types.EvaluationResult(
        eval_case_results=eval_case_results,
        metadata=metadata,
    )
| 443 | + |
| 444 | + |
class TestResolveMetricName:
    """Unit tests for _resolve_metric_name."""

    def test_none_returns_none(self):
        """None resolves to None."""
        resolved = _evals_utils._resolve_metric_name(None)
        assert resolved is None

    def test_string_passes_through(self):
        """A plain string name is returned unchanged."""
        name = "task_success_v1"
        assert _evals_utils._resolve_metric_name(name) == name

    def test_metric_object_extracts_name(self):
        """A Metric instance resolves to its .name attribute."""
        resolved = _evals_utils._resolve_metric_name(
            common_types.Metric(name="multi_turn_task_success_v1")
        )
        assert resolved == "multi_turn_task_success_v1"

    def test_object_with_name_attr(self):
        """Tests that any object with a .name attribute works (e.g., LazyLoadedPrebuiltMetric)."""

        class _DuckMetric:
            name = "tool_use_quality_v1"

        resolved = _evals_utils._resolve_metric_name(_DuckMetric())
        assert resolved == "tool_use_quality_v1"
| 469 | + |
class TestResolveLossAnalysisConfig:
    """Unit tests for _resolve_loss_analysis_config."""

    def test_auto_infer_single_metric_and_candidate(self):
        """A single metric/candidate pair is inferred without explicit args."""
        result = _make_eval_result(
            metrics=["task_success_v1"], candidate_names=["agent-1"]
        )
        resolved = _evals_utils._resolve_loss_analysis_config(eval_result=result)
        assert resolved.metric == "task_success_v1"
        assert resolved.candidate == "agent-1"

    def test_explicit_metric_and_candidate(self):
        """Explicit metric/candidate arguments select among multiple options."""
        result = _make_eval_result(metrics=["m1", "m2"], candidate_names=["c1", "c2"])
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=result, metric="m1", candidate="c2"
        )
        assert (resolved.metric, resolved.candidate) == ("m1", "c2")

    def test_config_provides_metric_and_candidate(self):
        """A LossAnalysisConfig supplies metric, candidate, and taxonomy."""
        result = _make_eval_result(metrics=["m1"], candidate_names=["c1"])
        cfg = common_types.LossAnalysisConfig(
            metric="m1", candidate="c1", predefined_taxonomy="my_taxonomy"
        )
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=result, config=cfg
        )
        assert (resolved.metric, resolved.candidate) == ("m1", "c1")
        assert resolved.predefined_taxonomy == "my_taxonomy"

    def test_explicit_args_override_config(self):
        """Keyword metric/candidate take precedence over config values."""
        result = _make_eval_result(metrics=["m1", "m2"], candidate_names=["c1", "c2"])
        cfg = common_types.LossAnalysisConfig(metric="m1", candidate="c1")
        resolved = _evals_utils._resolve_loss_analysis_config(
            eval_result=result, config=cfg, metric="m2", candidate="c2"
        )
        assert (resolved.metric, resolved.candidate) == ("m2", "c2")

    def test_error_multiple_metrics_no_explicit(self):
        """Ambiguous metric choice raises a ValueError."""
        result = _make_eval_result(metrics=["m1", "m2"], candidate_names=["c1"])
        with pytest.raises(ValueError, match="multiple metrics"):
            _evals_utils._resolve_loss_analysis_config(eval_result=result)

    def test_error_multiple_candidates_no_explicit(self):
        """Ambiguous candidate choice raises a ValueError."""
        result = _make_eval_result(metrics=["m1"], candidate_names=["c1", "c2"])
        with pytest.raises(ValueError, match="multiple candidates"):
            _evals_utils._resolve_loss_analysis_config(eval_result=result)

    def test_error_invalid_metric(self):
        """A metric absent from the eval result raises a ValueError."""
        result = _make_eval_result(metrics=["m1"], candidate_names=["c1"])
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=result, metric="nonexistent"
            )

    def test_error_invalid_candidate(self):
        """A candidate absent from the eval result raises a ValueError."""
        result = _make_eval_result(metrics=["m1"], candidate_names=["c1"])
        with pytest.raises(ValueError, match="not found in eval_result"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=result, candidate="nonexistent"
            )

    def test_no_candidates_defaults_to_candidate_1(self):
        """Missing candidate metadata falls back to the "candidate_1" default."""
        result = _make_eval_result(metrics=["m1"], candidate_names=[])
        # Strip candidate names entirely to exercise the fallback path.
        result = result.model_copy(
            update={"metadata": common_types.EvaluationRunMetadata()}
        )
        resolved = _evals_utils._resolve_loss_analysis_config(eval_result=result)
        assert resolved.metric == "m1"
        assert resolved.candidate == "candidate_1"

    def test_no_eval_case_results_raises(self):
        """An empty EvaluationResult raises for lack of metric results."""
        with pytest.raises(ValueError, match="no metric results"):
            _evals_utils._resolve_loss_analysis_config(
                eval_result=common_types.EvaluationResult()
            )
| 566 | + |
268 | 567 | class TestEvals: |
269 | 568 | """Unit tests for the GenAI client.""" |
270 | 569 |
|
|
0 commit comments