From 343f52ad888f60728bfe02cdb770013525464f07 Mon Sep 17 00:00:00 2001 From: nac7 Date: Mon, 8 Jun 2026 15:30:16 -0500 Subject: [PATCH 1/2] fix(output_parsers): use correct JSON key "Violated Categories" in nemoguard parsers Both nemoguard_parse_prompt_safety and nemoguard_parse_response_safety checked for key "Safety Categories" when extracting violation categories from NemoGuard ContentSafety model output, but the model actually returns key "Violated Categories" (as documented in each function's own docstring). This caused violation categories to be silently dropped on every unsafe response, breaking audit logging, granular guardrail policies, and compliance reporting that depend on knowing which policy categories were flagged. Fixes #2010 --- nemoguardrails/llm/output_parsers.py | 8 ++-- tests/test_content_safety_output_parsers.py | 50 +++++++++++++-------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/nemoguardrails/llm/output_parsers.py b/nemoguardrails/llm/output_parsers.py index b9c3b2c1eb..e428e4843a 100644 --- a/nemoguardrails/llm/output_parsers.py +++ b/nemoguardrails/llm/output_parsers.py @@ -160,8 +160,8 @@ def nemoguard_parse_prompt_safety(response: str) -> Sequence[Union[bool, str]]: parsed_json_result = json.loads(response) assert "User Safety" in parsed_json_result result = parsed_json_result["User Safety"].lower() - if "Safety Categories" in parsed_json_result: - safety_categories = [cat.strip() for cat in parsed_json_result["Safety Categories"].split(",")] + if "Violated Categories" in parsed_json_result: + safety_categories = [cat.strip() for cat in parsed_json_result["Violated Categories"].split(",")] else: safety_categories = [] except Exception: @@ -199,8 +199,8 @@ def nemoguard_parse_response_safety(response: str) -> Sequence[Union[bool, str]] parsed_json_result = json.loads(response) assert "Response Safety" in parsed_json_result result = parsed_json_result["Response Safety"].lower() - if "Safety Categories" in parsed_json_result: - safety_categories = [cat.strip() for cat in parsed_json_result["Safety Categories"].split(",")] + if "Violated Categories" in parsed_json_result: + safety_categories = [cat.strip() for cat in parsed_json_result["Violated Categories"].split(",")] else: safety_categories = [] except Exception: diff --git a/tests/test_content_safety_output_parsers.py b/tests/test_content_safety_output_parsers.py index 7b44b54fc9..a1de2d3319 100644 --- a/tests/test_content_safety_output_parsers.py +++ b/tests/test_content_safety_output_parsers.py @@ -133,15 +133,22 @@ def test_unsafe_user_content_json(self): assert is_safe is False assert violated_policies == [] - def test_unsafe_with_safety_categories(self): - """Test parsing unsafe content with safety categories.""" - response = '{"User Safety": "unsafe", "Safety Categories": "S1, S8, S10"}' + def test_unsafe_with_violated_categories(self): + """Test parsing unsafe content with violation categories (correct NemoGuard key).""" + response = '{"User Safety": "unsafe", "Violated Categories": "S1, S8, S10"}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert "S1" in violated_policies assert "S8" in violated_policies assert "S10" in violated_policies + def test_wrong_key_safety_categories_yields_no_categories(self): + """Regression: old wrong key 'Safety Categories' should not extract categories.""" + response = '{"User Safety": "unsafe", "Safety Categories": "S1, S8"}' + is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) + assert is_safe is False + assert violated_policies == [] + def test_case_insensitive_safety_status(self): """Test parsing is case insensitive for safety status.""" response = '{"User Safety": "SAFE"}' @@ -151,16 +158,16 @@ def test_case_insensitive_safety_status(self): def test_categories_with_whitespace_trimming(self): """Test parsing categories with extra whitespace gets trimmed.""" - response = '{"User Safety": "unsafe", "Safety Categories": " S1 , S8 , S10 "}' + response = '{"User Safety": "unsafe", "Violated Categories": " S1 , S8 , S10 "}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert "S1" in violated_policies assert "S8" in violated_policies assert "S10" in violated_policies - def test_empty_safety_categories(self): - """Test parsing with empty safety categories string.""" - response = '{"User Safety": "unsafe", "Safety Categories": ""}' + def test_empty_violated_categories(self): + """Test parsing with empty violated categories string.""" + response = '{"User Safety": "unsafe", "Violated Categories": ""}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert violated_policies == [""] @@ -187,15 +194,15 @@ def test_missing_user_safety_field(self): assert violated_policies == ["JSON parsing failed"] def test_single_category(self): - """Test parsing with single safety category.""" - response = '{"User Safety": "unsafe", "Safety Categories": "Violence"}' + """Test parsing with single violated category.""" + response = '{"User Safety": "unsafe", "Violated Categories": "Violence"}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert violated_policies == ["Violence"] def test_complex_category_names(self): """Test parsing with descriptive category names.""" - response = '{"User Safety": "unsafe", "Safety Categories": "Violence, Hate Speech, Sexual Content"}' + response = '{"User Safety": "unsafe", "Violated Categories": "Violence, Hate Speech, Sexual Content"}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert "Violence" in violated_policies @@ -220,15 +227,22 @@ def test_unsafe_response_content_json(self): assert is_safe is False assert violated_policies == [] - def test_unsafe_with_safety_categories(self): - """Test parsing unsafe response with safety categories.""" - response = '{"Response Safety": "unsafe", "Safety Categories": "S1, S8, S10"}' + def test_unsafe_with_violated_categories(self): + """Test parsing unsafe response with violated categories (correct NemoGuard key).""" + response = '{"Response Safety": "unsafe", "Violated Categories": "S1, S8, S10"}' is_safe, *violated_policies = nemoguard_parse_response_safety(response) assert is_safe is False assert "S1" in violated_policies assert "S8" in violated_policies assert "S10" in violated_policies + def test_wrong_key_safety_categories_yields_no_categories(self): + """Regression: old wrong key 'Safety Categories' should not extract categories.""" + response = '{"Response Safety": "unsafe", "Safety Categories": "S1, S8"}' + is_safe, *violated_policies = nemoguard_parse_response_safety(response) + assert is_safe is False + assert violated_policies == [] + def test_case_insensitive_safety_status(self): """Test parsing is case insensitive for safety status.""" response = '{"Response Safety": "SAFE"}' @@ -238,7 +252,7 @@ def test_case_insensitive_safety_status(self): def test_categories_with_whitespace_trimming(self): """Test parsing categories with extra whitespace gets trimmed.""" - response = '{"Response Safety": "unsafe", "Safety Categories": " S1 , S8 , S10 "}' + response = '{"Response Safety": "unsafe", "Violated Categories": " S1 , S8 , S10 "}' is_safe, *violated_policies = nemoguard_parse_response_safety(response) assert is_safe is False assert "S1" in violated_policies @@ -268,7 +282,7 @@ def test_missing_response_safety_field(self): def test_full_nemoguard_response(self): """Test parsing a full NemoGuard response with both user and response safety.""" - response = '{"User Safety": "safe", "Response Safety": "unsafe", "Safety Categories": "S1, S8"}' + response = '{"User Safety": "safe", "Response Safety": "unsafe", "Violated Categories": "S1, S8"}' is_safe, *violated_policies = nemoguard_parse_response_safety(response) assert is_safe is False assert "S1" in violated_policies @@ -298,13 +312,13 @@ def test_llama_guard_typical_responses(self): def test_nemoguard_content_safety_responses(self): """Test typical NemoGuard ContentSafety model responses.""" - response = '{"User Safety": "unsafe", "Safety Categories": "S1: Violence, S8: Hate/Identity Hate"}' + response = '{"User Safety": "unsafe", "Violated Categories": "S1: Violence, S8: Hate/Identity Hate"}' is_safe, *violated_policies = nemoguard_parse_prompt_safety(response) assert is_safe is False assert "S1: Violence" in violated_policies assert "S8: Hate/Identity Hate" in violated_policies - response = '{"User Safety": "safe", "Response Safety": "unsafe", "Safety Categories": "S11: Sexual Content"}' + response = '{"User Safety": "safe", "Response Safety": "unsafe", "Violated Categories": "S11: Sexual Content"}' is_safe, *violated_policies = nemoguard_parse_response_safety(response) assert is_safe is False assert violated_policies == ["S11: Sexual Content"] @@ -359,7 +373,7 @@ def test_starred_unpacking_compatibility(self): assert is_safe is True assert violated_policies == [] - response = '{"Response Safety": "unsafe", "Safety Categories": "S1, S8"}' + response = '{"Response Safety": "unsafe", "Violated Categories": "S1, S8"}' result = nemoguard_parse_response_safety(response) is_safe, *violated_policies = result assert is_safe is False From 2727e793eb738ae6dbbdc5f8eee7c44c07d16636 Mon Sep 17 00:00:00 2001 From: nac7 Date: Mon, 8 Jun 2026 16:37:10 -0500 Subject: [PATCH 2/2] test: update mock responses to use correct Violated Categories key Existing tests provided mock NemoGuard JSON responses with the wrong key Safety Categories. Now that the parser correctly reads Violated Categories, update all mock response fixtures to match what the real model emits. The intentional regression tests in test_content_safety_output_parsers.py that verify Safety Categories no longer extracts data are left unchanged. --- tests/guardrails/test_content_safety_iorails_actions.py | 4 ++-- tests/guardrails/test_data.py | 4 ++-- tests/guardrails/test_iorails_telemetry.py | 2 +- tests/guardrails/test_rails_manager.py | 4 ++-- tests/test_content_safety_integration.py | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/guardrails/test_content_safety_iorails_actions.py b/tests/guardrails/test_content_safety_iorails_actions.py index c5a41eb000..5df72c91ef 100644 --- a/tests/guardrails/test_content_safety_iorails_actions.py +++ b/tests/guardrails/test_content_safety_iorails_actions.py @@ -43,7 +43,7 @@ UNSAFE_JSON = json.dumps( { "User Safety": "unsafe", - "Safety Categories": "S1: Violence, S3: Criminal Planning/Confessions", + "Violated Categories": "S1: Violence, S3: Criminal Planning/Confessions", } ) SAFE_OUTPUT_JSON = json.dumps({"User Safety": "safe", "Response Safety": "safe"}) @@ -51,7 +51,7 @@ { "User Safety": "safe", "Response Safety": "unsafe", - "Safety Categories": "S17: Malware", + "Violated Categories": "S17: Malware", } ) diff --git a/tests/guardrails/test_data.py b/tests/guardrails/test_data.py index fbdb25d4bf..31ae9eb33d 100644 --- a/tests/guardrails/test_data.py +++ b/tests/guardrails/test_data.py @@ -53,7 +53,7 @@ Provide your safety assessment for the above conversation in the below output JSON format: -{"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} +{"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Violated Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} Do not include anything other than the output JSON in your response. Output JSON: @@ -97,7 +97,7 @@ Provide your safety assessment for the above conversation in the below output JSON format: -{"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Safety Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} +{"User Safety": safe or unsafe rating for the user message, "Response Safety": safe or unsafe rating for the agent response. Omit if no agent response present. "Violated Categories": a comma-separated list of applicable safety categories from the provided taxonomy. Omit if all safe.} Do not include anything other than the output JSON in your response. Output JSON: diff --git a/tests/guardrails/test_iorails_telemetry.py b/tests/guardrails/test_iorails_telemetry.py index bb76cdec36..015880213f 100644 --- a/tests/guardrails/test_iorails_telemetry.py +++ b/tests/guardrails/test_iorails_telemetry.py @@ -372,7 +372,7 @@ async def capture_then_pass(messages): SAFE_INPUT_JSON = json.dumps({"User Safety": "safe"}) SAFE_OUTPUT_JSON = json.dumps({"User Safety": "safe", "Response Safety": "safe"}) -UNSAFE_INPUT_JSON = json.dumps({"User Safety": "unsafe", "Safety Categories": "S1: Violence"}) +UNSAFE_INPUT_JSON = json.dumps({"User Safety": "unsafe", "Violated Categories": "S1: Violence"}) def _stub_deep_pipeline(iorails, main_llm_response="Hello", input_safe=True): diff --git a/tests/guardrails/test_rails_manager.py b/tests/guardrails/test_rails_manager.py index f558f6c6a9..edfdb7f94f 100644 --- a/tests/guardrails/test_rails_manager.py +++ b/tests/guardrails/test_rails_manager.py @@ -41,13 +41,13 @@ ) SAFE_INPUT_JSON = json.dumps({"User Safety": "safe"}) -UNSAFE_INPUT_JSON = json.dumps({"User Safety": "unsafe", "Safety Categories": "S1: Violence"}) +UNSAFE_INPUT_JSON = json.dumps({"User Safety": "unsafe", "Violated Categories": "S1: Violence"}) SAFE_OUTPUT_JSON = json.dumps({"User Safety": "safe", "Response Safety": "safe"}) UNSAFE_OUTPUT_JSON = json.dumps( { "User Safety": "safe", "Response Safety": "unsafe", - "Safety Categories": "S17: Malware", + "Violated Categories": "S17: Malware", } ) MESSAGES = [{"role": "user", "content": "hello"}] diff --git a/tests/test_content_safety_integration.py b/tests/test_content_safety_integration.py index 8812877f6b..b8b735c9b8 100644 --- a/tests/test_content_safety_integration.py +++ b/tests/test_content_safety_integration.py @@ -161,7 +161,7 @@ async def test_content_safety_input_with_nemoguard_parser_unsafe_with_categories self, ): """Test input action with real nemoguard_parse_prompt_safety parser - unsafe with categories.""" - json_response = '{"User Safety": "unsafe", "Safety Categories": "S1, S8, S10"}' + json_response = '{"User Safety": "unsafe", "Violated Categories": "S1, S8, S10"}' parsed_result = nemoguard_parse_prompt_safety(json_response) llms, mock_task_manager = _create_mock_setup([json_response], parsed_result) context = _create_input_context("Potentially harmful content") @@ -181,7 +181,7 @@ async def test_content_safety_input_with_nemoguard_parser_unsafe_with_categories [ ('{"Response Safety": "safe"}', True, []), ( - '{"Response Safety": "unsafe", "Safety Categories": "Violence, Hate Speech"}', + '{"Response Safety": "unsafe", "Violated Categories": "Violence, Hate Speech"}', False, ["Violence", "Hate Speech"], ), @@ -251,13 +251,13 @@ def test_iterable_unpacking_with_is_content_safe_outputs(self, response, expecte [ ('{"User Safety": "safe"}', True, []), ( - '{"User Safety": "unsafe", "Safety Categories": "S1, S8"}', + '{"User Safety": "unsafe", "Violated Categories": "S1, S8"}', False, ["S1", "S8"], ), ('{"Response Safety": "safe"}', True, []), ( - '{"Response Safety": "unsafe", "Safety Categories": "Violence, Hate"}', + '{"Response Safety": "unsafe", "Violated Categories": "Violence, Hate"}', False, ["Violence", "Hate"], ),