 from google.cloud.aiplatform import initializer as aiplatform_initializer
 from vertexai import _genai
 from vertexai._genai import _evals_data_converters
+from vertexai._genai import _evals_utils
 from vertexai._genai import _evals_metric_handlers
 from vertexai._genai import _evals_visualization
 from vertexai._genai import _evals_metric_loaders
@@ -265,6 +266,254 @@ def test_t_inline_results(self): |
         assert payload[0]["candidate_results"][0]["score"] == 0.0


+class TestLossAnalysis:
+    """Unit tests for loss analysis types and visualization."""
+
+    def test_response_structure(self):
+        response = common_types.GenerateLossClustersResponse(
+            analysis_time="2026-04-01T10:00:00Z",
+            results=[
+                common_types.LossAnalysisResult(
+                    config=common_types.LossAnalysisConfig(
+                        metric="multi_turn_task_success_v1",
+                        candidate="travel-agent",
+                    ),
+                    analysis_time="2026-04-01T10:00:00Z",
+                    clusters=[
+                        common_types.LossCluster(
+                            cluster_id="cluster-1",
+                            taxonomy_entry=common_types.LossTaxonomyEntry(
+                                l1_category="Tool Calling",
+                                l2_category="Missing Tool Invocation",
+                                description="The agent failed to invoke a required tool.",
+                            ),
+                            item_count=3,
+                        ),
+                        common_types.LossCluster(
+                            cluster_id="cluster-2",
+                            taxonomy_entry=common_types.LossTaxonomyEntry(
+                                l1_category="Hallucination",
+                                l2_category="Hallucination of Action",
+                                description="Verbally confirmed action without tool.",
+                            ),
+                            item_count=2,
+                        ),
+                    ],
+                )
+            ],
+        )
+        assert len(response.results) == 1
+        assert response.analysis_time == "2026-04-01T10:00:00Z"
+        result = response.results[0]
+        assert result.config.metric == "multi_turn_task_success_v1"
+        assert len(result.clusters) == 2
+        assert result.clusters[0].cluster_id == "cluster-1"
+        assert result.clusters[0].item_count == 3
+        assert result.clusters[1].cluster_id == "cluster-2"
+
+    def test_response_show_with_results(self, capsys):
+        response = common_types.GenerateLossClustersResponse(
+            results=[
+                common_types.LossAnalysisResult(
+                    config=common_types.LossAnalysisConfig(
+                        metric="test_metric",
+                        candidate="test-candidate",
+                    ),
+                    clusters=[
+                        common_types.LossCluster(
+                            cluster_id="c1",
+                            taxonomy_entry=common_types.LossTaxonomyEntry(
+                                l1_category="Cat1",
+                                l2_category="SubCat1",
+                            ),
+                            item_count=5,
+                        ),
+                    ],
+                )
+            ],
+        )
+        response.show()
+        captured = capsys.readouterr()
+        assert "test_metric" in captured.out
+        assert "c1" in captured.out
+
+    def test_loss_analysis_result_show(self, capsys):
+        result = common_types.LossAnalysisResult(
+            config=common_types.LossAnalysisConfig(
+                metric="test_metric",
+                candidate="test-candidate",
+            ),
+            clusters=[
+                common_types.LossCluster(
+                    cluster_id="c1",
+                    taxonomy_entry=common_types.LossTaxonomyEntry(
+                        l1_category="DirectCat",
+                        l2_category="DirectSubCat",
+                    ),
+                    item_count=7,
+                ),
+            ],
+        )
+        result.show()
+        captured = capsys.readouterr()
+        assert "test_metric" in captured.out
+        assert "c1" in captured.out
+
+
+def _make_eval_result(
+    metrics=None,
+    candidate_names=None,
+):
+    """Helper to create an EvaluationResult with the given metrics and candidates."""
+    metrics = metrics or ["task_success_v1"]
+    candidate_names = candidate_names or ["agent-1"]
+
+    metric_results = {}
+    for m in metrics:
+        metric_results[m] = common_types.EvalCaseMetricResult(metric_name=m)
+
+    eval_case_results = [
+        common_types.EvalCaseResult(
+            eval_case_index=0,
+            response_candidate_results=[
+                common_types.ResponseCandidateResult(
+                    response_index=0,
+                    metric_results=metric_results,
+                )
+            ],
+        )
+    ]
+    metadata = common_types.EvaluationRunMetadata(
+        candidate_names=candidate_names,
+    )
+    return common_types.EvaluationResult(
+        eval_case_results=eval_case_results,
+        metadata=metadata,
+    )
+
+
+class TestResolveMetricName:
+    """Unit tests for _resolve_metric_name."""
+
+    def test_none_returns_none(self):
+        assert _evals_utils._resolve_metric_name(None) is None
+
+    def test_string_passes_through(self):
+        assert _evals_utils._resolve_metric_name("task_success_v1") == "task_success_v1"
+
+    def test_metric_object_extracts_name(self):
+        metric = common_types.Metric(name="multi_turn_task_success_v1")
+        assert (
+            _evals_utils._resolve_metric_name(metric)
+            == "multi_turn_task_success_v1"
+        )
+
+    def test_object_with_name_attr(self):
+        """Tests that any object with a .name attribute works (e.g., LazyLoadedPrebuiltMetric)."""
+
+        class FakeMetric:
+            name = "tool_use_quality_v1"
+
+        assert _evals_utils._resolve_metric_name(FakeMetric()) == "tool_use_quality_v1"
+
+
+class TestResolveLossAnalysisConfig:
+    """Unit tests for _resolve_loss_analysis_config."""
+
+    def test_auto_infer_single_metric_and_candidate(self):
+        eval_result = _make_eval_result(
+            metrics=["task_success_v1"], candidate_names=["agent-1"]
+        )
+        resolved = _evals_utils._resolve_loss_analysis_config(
+            eval_result=eval_result
+        )
+        assert resolved.metric == "task_success_v1"
+        assert resolved.candidate == "agent-1"
+
+    def test_explicit_metric_and_candidate(self):
+        eval_result = _make_eval_result(
+            metrics=["m1", "m2"], candidate_names=["c1", "c2"]
+        )
+        resolved = _evals_utils._resolve_loss_analysis_config(
+            eval_result=eval_result, metric="m1", candidate="c2"
+        )
+        assert resolved.metric == "m1"
+        assert resolved.candidate == "c2"
+
+    def test_config_provides_metric_and_candidate(self):
+        eval_result = _make_eval_result(
+            metrics=["m1"], candidate_names=["c1"]
+        )
+        config = common_types.LossAnalysisConfig(
+            metric="m1", candidate="c1", predefined_taxonomy="my_taxonomy"
+        )
+        resolved = _evals_utils._resolve_loss_analysis_config(
+            eval_result=eval_result, config=config
+        )
+        assert resolved.metric == "m1"
+        assert resolved.candidate == "c1"
+        assert resolved.predefined_taxonomy == "my_taxonomy"
+
+    def test_explicit_args_override_config(self):
+        eval_result = _make_eval_result(
+            metrics=["m1", "m2"], candidate_names=["c1", "c2"]
+        )
+        config = common_types.LossAnalysisConfig(metric="m1", candidate="c1")
+        resolved = _evals_utils._resolve_loss_analysis_config(
+            eval_result=eval_result, config=config, metric="m2", candidate="c2"
+        )
+        assert resolved.metric == "m2"
+        assert resolved.candidate == "c2"
+
+    def test_error_multiple_metrics_no_explicit(self):
+        eval_result = _make_eval_result(
+            metrics=["m1", "m2"], candidate_names=["c1"]
+        )
+        with pytest.raises(ValueError, match="multiple metrics"):
+            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)
+
+    def test_error_multiple_candidates_no_explicit(self):
+        eval_result = _make_eval_result(
+            metrics=["m1"], candidate_names=["c1", "c2"]
+        )
+        with pytest.raises(ValueError, match="multiple candidates"):
+            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)
+
+    def test_error_invalid_metric(self):
+        eval_result = _make_eval_result(
+            metrics=["m1"], candidate_names=["c1"]
+        )
+        with pytest.raises(ValueError, match="not found in eval_result"):
+            _evals_utils._resolve_loss_analysis_config(
+                eval_result=eval_result, metric="nonexistent"
+            )
+
+    def test_error_invalid_candidate(self):
+        eval_result = _make_eval_result(
+            metrics=["m1"], candidate_names=["c1"]
+        )
+        with pytest.raises(ValueError, match="not found in eval_result"):
+            _evals_utils._resolve_loss_analysis_config(
+                eval_result=eval_result, candidate="nonexistent"
+            )
+
+    def test_no_candidates_defaults_to_candidate_1(self):
+        # An empty candidate_names list falls back to the helper's default,
+        # so replace the metadata wholesale to simulate a run that recorded
+        # no candidate names.
+        eval_result = _make_eval_result(metrics=["m1"], candidate_names=[])
+        eval_result = eval_result.model_copy(
+            update={"metadata": common_types.EvaluationRunMetadata()}
+        )
+        resolved = _evals_utils._resolve_loss_analysis_config(
+            eval_result=eval_result
+        )
+        assert resolved.metric == "m1"
+        assert resolved.candidate == "candidate_1"
+
+    def test_no_eval_case_results_raises(self):
+        eval_result = common_types.EvaluationResult()
+        with pytest.raises(ValueError, match="no metric results"):
+            _evals_utils._resolve_loss_analysis_config(eval_result=eval_result)
+
+
 class TestEvals:
     """Unit tests for the GenAI client."""

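The TestResolveMetricName cases above fully specify the helper's contract: None passes through, a plain string is already a name, and any object with a .name attribute (Metric, LazyLoadedPrebuiltMetric, or a test fake) contributes its name. A minimal sketch consistent with those assertions, inferred from the tests rather than taken from the actual _evals_utils source:

    from typing import Any, Optional

    def _resolve_metric_name(metric: Any) -> Optional[str]:
        """Normalizes a metric reference (None, str, or object) to a name."""
        if metric is None:
            # test_none_returns_none: None is passed through unchanged.
            return None
        if isinstance(metric, str):
            # test_string_passes_through: strings are already metric names.
            return metric
        # test_metric_object_extracts_name / test_object_with_name_attr:
        # any object exposing a .name attribute resolves to that name.
        return metric.name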
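Likewise, the TestResolveLossAnalysisConfig cases encode a clear precedence order: explicit keyword arguments beat fields on a passed-in LossAnalysisConfig, which beat auto-inference from the eval result, with a ValueError for ambiguous or unknown names and a "candidate_1" fallback when the run metadata recorded no candidate names. A plausible shape for that resolution logic, again inferred purely from the tests above (the real implementation may differ):

    from typing import Any, Optional

    def _resolve_loss_analysis_config(
        eval_result: Any,
        config: Any = None,
        metric: Any = None,
        candidate: Optional[str] = None,
    ) -> "common_types.LossAnalysisConfig":
        if not eval_result.eval_case_results:
            raise ValueError("eval_result contains no metric results.")

        # Names the evaluation actually produced.
        available_metrics = {
            name
            for case in eval_result.eval_case_results
            for cand in case.response_candidate_results
            for name in cand.metric_results
        }
        candidate_names = list(
            (eval_result.metadata and eval_result.metadata.candidate_names) or []
        )

        # Precedence: explicit argument > config field > auto-inference.
        metric_name = _resolve_metric_name(metric) or (config and config.metric)
        if metric_name is None:
            if len(available_metrics) > 1:
                raise ValueError("eval_result has multiple metrics; pass one explicitly.")
            metric_name = next(iter(available_metrics))
        elif metric_name not in available_metrics:
            raise ValueError(f"Metric {metric_name!r} not found in eval_result.")

        candidate_name = candidate or (config and config.candidate)
        if candidate_name is None:
            if len(candidate_names) > 1:
                raise ValueError("eval_result has multiple candidates; pass one explicitly.")
            # No recorded candidate names: fall back to the default label.
            candidate_name = candidate_names[0] if candidate_names else "candidate_1"
        elif candidate_names and candidate_name not in candidate_names:
            raise ValueError(f"Candidate {candidate_name!r} not found in eval_result.")

        return common_types.LossAnalysisConfig(
            metric=metric_name,
            candidate=candidate_name,
            predefined_taxonomy=config.predefined_taxonomy if config else None,
        )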