|
43 | 43 | from mellea.stdlib import functional as mfuncs |
44 | 44 | from mellea.stdlib.components import Intrinsic, Message |
45 | 45 | from mellea.stdlib.components.docs.document import Document |
46 | | -from mellea.stdlib.components.intrinsic import rag |
| 46 | +from mellea.stdlib.components.intrinsic import core as intrinsic_core, guardian, rag |
47 | 47 | from mellea.stdlib.context import ChatContext |
48 | 48 | from test.formatters.granite.test_intrinsics_formatters import ( |
49 | 49 | _YAML_JSON_COMBOS_WITH_MODEL, |
@@ -355,13 +355,21 @@ def test_call_intrinsic_answerability(call_intrinsic_backend): |
355 | 355 |
|
356 | 356 |
|
@pytest.mark.qualitative
def test_call_intrinsic_requirement_check(call_intrinsic_backend):
    """call_intrinsic path: requirement_check returns a score between 0 and 1."""
    with open(_RAG_TEST_DATA / "requirement_check.json", encoding="utf-8") as f:
        payload = json.load(f)

    # Replay the recorded conversation into a fresh context.
    ctx = ChatContext()
    for turn in payload["messages"]:
        ctx = ctx.add(Message(turn["role"], turn["content"]))

    score = intrinsic_core.requirement_check(
        ctx, call_intrinsic_backend, requirement=payload["requirement"]
    )
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
365 | 373 |
|
366 | 374 |
|
367 | 375 | # --------------------------------------------------------------------------- |
@@ -399,3 +407,186 @@ def get_temperature(location: str) -> int: |
399 | 407 | assert len(result.value) > 0 |
400 | 408 | parsed = json.loads(result.value) |
401 | 409 | assert isinstance(parsed, dict) |
| 410 | + |
| 411 | + |
| 412 | +# --------------------------------------------------------------------------- |
| 413 | +# Guardian intrinsic tests — exercise the high-level convenience wrappers |
| 414 | +# --------------------------------------------------------------------------- |
| 415 | + |
| 416 | +_GUARDIAN_TEST_DATA = ( |
| 417 | + pathlib.Path(__file__).parent.parent |
| 418 | + / "stdlib" |
| 419 | + / "components" |
| 420 | + / "intrinsic" |
| 421 | + / "testdata" |
| 422 | + / "input_json" |
| 423 | +) |
| 424 | + |
| 425 | + |
def _read_guardian_input(
    file_name: str, attach_documents: bool = False
) -> ChatContext:
    """Read guardian test input and convert it to a ChatContext.

    Args:
        file_name: Name of a JSON fixture under ``_GUARDIAN_TEST_DATA`` with a
            top-level ``messages`` list of ``{"role", "content"}`` dicts.
        attach_documents: When True, also parse ``extra_body.documents`` (if
            present) into ``Document`` objects and attach them to the final
            message — the shape RAG-style guardian intrinsics expect.
            Defaults to False, preserving the original behavior.

    Returns:
        A ChatContext containing the fixture's messages in order.
    """
    with open(_GUARDIAN_TEST_DATA / file_name, encoding="utf-8") as f:
        json_data = json.load(f)

    docs: list[Document] = []
    if attach_documents:
        docs = [
            Document(text=d["text"], doc_id=d.get("doc_id"))
            for d in json_data.get("extra_body", {}).get("documents", [])
        ]

    messages = json_data["messages"]
    context = ChatContext()
    for i, m in enumerate(messages):
        # Only the last message carries the grounding documents, when requested.
        if docs and i == len(messages) - 1:
            context = context.add(Message(m["role"], m["content"], documents=docs))
        else:
            context = context.add(Message(m["role"], m["content"]))

    return context
| 438 | + |
| 439 | + |
@pytest.mark.qualitative
def test_call_intrinsic_policy_guardrails(call_intrinsic_backend):
    """call_intrinsic path: policy_guardrails returns a compliance label."""
    ctx = _read_guardian_input("policy_guardrails.json")

    policy_text = (
        "hiring managers should steer away from any questions that directly seek "
        'information about protected classes\u2014such as "how old are you," "where are '
        'you from," "what year did you graduate" or "what are your plans for having kids."'
    )

    label = guardian.policy_guardrails(
        ctx, call_intrinsic_backend, policy_text=policy_text
    )
    assert label in {"Yes", "No", "Ambiguous"}
| 455 | + |
| 456 | + |
@pytest.mark.qualitative
def test_call_intrinsic_guardian_check_harm(call_intrinsic_backend):
    """call_intrinsic path: guardian_check detects harmful prompts."""
    ctx = _read_guardian_input("guardian_core.json")

    score = guardian.guardian_check(
        ctx, call_intrinsic_backend, criteria="harm", target_role="user"
    )
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
| 467 | + |
| 468 | + |
@pytest.mark.qualitative
def test_call_intrinsic_guardian_check_groundedness(call_intrinsic_backend):
    """call_intrinsic path: guardian_check detects ungrounded responses."""
    source_doc = Document(
        text=(
            "Eat (1964) is a 45-minute underground film created by Andy Warhol. "
            "The film was first shown by Jonas Mekas on July 16, 1964, at the "
            "Washington Square Gallery."
        ),
        doc_id="0",
    )

    # The assistant reply deliberately contradicts the document's date.
    ctx = ChatContext()
    ctx = ctx.add(Message("user", "When was the film Eat first shown?"))
    ctx = ctx.add(
        Message(
            "assistant",
            "The film Eat was first shown by Jonas Mekas on December 24, "
            "1922 at the Washington Square Gallery.",
            documents=[source_doc],
        )
    )

    score = guardian.guardian_check(
        ctx, call_intrinsic_backend, criteria="groundedness"
    )
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
| 499 | + |
| 500 | + |
@pytest.mark.qualitative
def test_call_intrinsic_guardian_check_function_call(call_intrinsic_backend):
    """call_intrinsic path: guardian_check detects function call hallucinations."""
    tool_specs = [
        {
            "name": "comment_list",
            "description": "Fetches a list of comments for a specified IBM video.",
            "parameters": {
                "aweme_id": {
                    "description": "The ID of the IBM video.",
                    "type": "int",
                    "default": "7178094165614464282",
                },
                "cursor": {
                    "description": "The cursor for pagination. Defaults to 0.",
                    "type": "int, optional",
                    "default": "0",
                },
                "count": {
                    "description": "The number of comments to fetch. Maximum is 30. Defaults to 20.",
                    "type": "int, optional",
                    "default": "20",
                },
            },
        }
    ]
    prompt = "\n\n".join(
        [
            "Available tools:\n" + json.dumps(tool_specs, indent=2),
            "Fetch the first 15 comments for the IBM video with ID 456789123.",
        ]
    )
    # Deliberately wrong: uses "video_id" instead of "aweme_id"
    hallucinated_call = str(
        [{"name": "comment_list", "arguments": {"video_id": 456789123, "count": 15}}]
    )

    ctx = ChatContext()
    ctx = ctx.add(Message("user", prompt))
    ctx = ctx.add(Message("assistant", hallucinated_call))

    score = guardian.guardian_check(
        ctx, call_intrinsic_backend, criteria="function_call"
    )
    assert isinstance(score, float)
    assert 0.0 <= score <= 1.0
| 545 | + |
| 546 | + |
@pytest.mark.qualitative
def test_call_intrinsic_factuality_detection(call_intrinsic_backend):
    """call_intrinsic path: factuality_detection returns a yes/no label."""
    with open(_GUARDIAN_TEST_DATA / "factuality_detection.json", encoding="utf-8") as f:
        payload = json.load(f)

    grounding_docs = [
        Document(text=entry["text"], doc_id=entry.get("doc_id"))
        for entry in payload.get("extra_body", {}).get("documents", [])
    ]

    turns = payload["messages"]
    ctx = ChatContext()
    # All but the final turn are plain messages; documents ride on the last one.
    for turn in turns[:-1]:
        ctx = ctx.add(Message(turn["role"], turn["content"]))
    if turns:
        final = turns[-1]
        if grounding_docs:
            ctx = ctx.add(
                Message(final["role"], final["content"], documents=grounding_docs)
            )
        else:
            ctx = ctx.add(Message(final["role"], final["content"]))

    label = guardian.factuality_detection(ctx, call_intrinsic_backend)
    assert label in ("yes", "no")
| 568 | + |
| 569 | + |
@pytest.mark.qualitative
def test_call_intrinsic_factuality_correction(call_intrinsic_backend):
    """call_intrinsic path: factuality_correction returns corrected text or 'none'."""
    with open(
        _GUARDIAN_TEST_DATA / "factuality_correction.json", encoding="utf-8"
    ) as f:
        payload = json.load(f)

    grounding_docs = [
        Document(text=entry["text"], doc_id=entry.get("doc_id"))
        for entry in payload.get("extra_body", {}).get("documents", [])
    ]

    turns = payload["messages"]
    ctx = ChatContext()
    # All but the final turn are plain messages; documents ride on the last one.
    for turn in turns[:-1]:
        ctx = ctx.add(Message(turn["role"], turn["content"]))
    if turns:
        final = turns[-1]
        if grounding_docs:
            ctx = ctx.add(
                Message(final["role"], final["content"], documents=grounding_docs)
            )
        else:
            ctx = ctx.add(Message(final["role"], final["content"]))

    corrected = guardian.factuality_correction(ctx, call_intrinsic_backend)
    assert isinstance(corrected, str)
0 commit comments