trustyai-explainability · m-misiura · Mar 25, 2026 · Mar 24, 2026
diff --git a/nemoguardrails/server/schemas/openai.py b/nemoguardrails/server/schemas/openai.py
@@ -138,7 +138,7 @@ def validate_config_ids(cls, data: Any) -> Any:
             config_fields = [data.get("config_id"), data.get("config_ids"), data.get("config")]
             non_none_count = sum(1 for field in config_fields if field is not None)
             if non_none_count > 1:
-                raise ValueError("Only one of config, config_id, or config_ids should be specified")
+                raise ValueError("Only one of config_id or config_ids should be specified")
         return data
 
     @field_validator("config_ids", mode="before")

diff --git a/scripts/discover_required_models.py b/scripts/discover_required_models.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,42 +77,22 @@ def get_active_guardrails(self) -> List[str]:
             logging.error(f"Missing directory: {library_path}")
             sys.exit(1)
 
-        available = [
-            item.name
-            for item in library_path.iterdir()
-            if item.is_dir() and not item.name.startswith("_")
-        ]
-        return (
-            available
-            if include_closed
-            else [gr for gr in available if gr not in closed_source]
-        )
+        available = [item.name for item in library_path.iterdir() if item.is_dir() and not item.name.startswith("_")]
+        return available if include_closed else [gr for gr in available if gr not in closed_source]
 
     @staticmethod
     def _extract_from_ast(tree: ast.AST) -> Dict[str, Set[str]]:
         models = {k: set() for k in ModelDiscoverer.MODEL_KEYS}
         for node in ast.walk(tree):
             if (
                 isinstance(node, ast.Call)
-                and getattr(getattr(node.func, "attr", None), "lower", lambda: "")()
-                == "load"
+                and getattr(getattr(node.func, "attr", None), "lower", lambda: "")() == "load"
                 and getattr(getattr(node.func, "value", None), "id", None) == "spacy"
             ):
-                if (
-                    node.args
-                    and isinstance(node.args[0], ast.Constant)
-                    and isinstance(node.args[0].value, str)
-                ):
+                if node.args and isinstance(node.args[0], ast.Constant) and isinstance(node.args[0].value, str):
                     models["spacy"].add(node.args[0].value)
-            if (
-                isinstance(node, ast.Call)
-                and getattr(node.func, "id", None) == "SentenceTransformer"
-            ):
-                if (
-                    node.args
-                    and isinstance(node.args[0], ast.Constant)
-                    and isinstance(node.args[0].value, str)
-                ):
+            if isinstance(node, ast.Call) and getattr(node.func, "id", None) == "SentenceTransformer":
+                if node.args and isinstance(node.args[0], ast.Constant) and isinstance(node.args[0].value, str):
                     name = node.args[0].value
                     if not name.startswith("sentence-transformers/"):
                         name = f"sentence-transformers/{name}"
@@ -130,11 +110,7 @@ def _extract_from_ast(tree: ast.AST) -> Dict[str, Set[str]]:
                 and getattr(node.func, "attr", None) == "download"
                 and getattr(getattr(node.func, "value", None), "id", None) == "nltk"
             ):
-                if (
-                    node.args
-                    and isinstance(node.args[0], ast.Constant)
-                    and isinstance(node.args[0].value, str)
-                ):
+                if node.args and isinstance(node.args[0], ast.Constant) and isinstance(node.args[0].value, str):
                     models["nltk"].add(node.args[0].value)
         return models
 
@@ -182,9 +158,7 @@ def discover(self) -> Dict[str, Set[str]]:
     def print_summary(self):
         active_guardrails = self.get_active_guardrails()
         print(f"Discovering models for profile: {self.profile}")
-        print(
-            f"Active guardrails ({len(active_guardrails)}): {', '.join(active_guardrails)}"
-        )
+        print(f"Active guardrails ({len(active_guardrails)}): {', '.join(active_guardrails)}")
         for category in self.MODEL_KEYS:
             models = self.models[category]
             if models:

diff --git a/scripts/filter_guardrails.py b/scripts/filter_guardrails.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,7 +15,6 @@
 # limitations under the License.
 
 import logging
-import os
 import shutil
 import sys
 from pathlib import Path
@@ -39,9 +38,7 @@ def main():
         config = yaml.safe_load(f)
 
     if profile not in config["profiles"]:
-        logger.error(
-            f"Profile '{profile}' not found. Available: {list(config['profiles'].keys())}"
-        )
+        logger.error(f"Profile '{profile}' not found. Available: {list(config['profiles'].keys())}")
         sys.exit(1)
 
     include_closed_source = config["profiles"][profile]["include_closed_source"]
@@ -59,11 +56,7 @@ def main():
     removed_dirs = []
 
     for guardrail_dir in library_path.iterdir():
-        if (
-            not guardrail_dir.is_dir()
-            or guardrail_dir.name.startswith(".")
-            or guardrail_dir.name.startswith("__")
-        ):
+        if not guardrail_dir.is_dir() or guardrail_dir.name.startswith(".") or guardrail_dir.name.startswith("__"):
             continue
 
         guardrail_name = guardrail_dir.name
@@ -78,9 +71,7 @@ def main():
             logger.info(f"Keeping {source_type}: {guardrail_name}")
             kept_dirs.append(guardrail_name)
 
-    logger.info(
-        f"\nSummary: kept {len(kept_dirs)}, removed {len(removed_dirs)} guardrails"
-    )
+    logger.info(f"\nSummary: kept {len(kept_dirs)}, removed {len(removed_dirs)} guardrails")
 
 
 if __name__ == "__main__":

diff --git a/scripts/pre_download_required_models.py b/scripts/pre_download_required_models.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -89,9 +89,7 @@ def download_sentence_transformers_models(models):
             sentence_transformers.SentenceTransformer(model_name)
             logging.info(f"Downloaded Sentence Transformers model: {model_name}")
         except Exception as e:
-            logging.warning(
-                f"Failed to download Sentence Transformers model {model_name}: {e}"
-            )
+            logging.warning(f"Failed to download Sentence Transformers model {model_name}: {e}")
 
 
 def download_fastembed_models(models):
@@ -143,9 +141,7 @@ def download_huggingface_models(models):
                 except Exception as e2:
                     logging.warning(f"Failed to download {model_name}: {e2}")
             else:
-                logging.warning(
-                    f"Failed to download HuggingFace model {model_name}: {e}"
-                )
+                logging.warning(f"Failed to download HuggingFace model {model_name}: {e}")
 
 
 def download_nltk_data():

diff --git a/tests/test_configs/simple_rails/actions.py b/tests/test_configs/simple_rails/actions.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemoguardrails.actions import action
+
+
+@action(is_system_action=True)
+async def check_forbidden_words(context: dict = None):
+    """Return a status dict saying whether the user message contains a forbidden substring."""
+    # Default is None rather than a shared mutable {}; a missing or None message is scanned as "".
+    user_message = ((context or {}).get("user_message") or "").lower()
+    forbidden_categories = {
+        "security": ["password", "hack", "exploit", "vulnerability"],
+        "inappropriate": ["violence", "illegal", "harmful"],
+        "competitors": ["chatgpt", "openai", "claude", "anthropic"],
+    }
+    # Plain substring matching; the first hit (in declaration order) decides category and word.
+    for category, words in forbidden_categories.items():
+        for word in words:
+            if word in user_message:
+                return {"status": "blocked", "category": category, "word": word}
+    # Nothing matched.
+    return {"status": "allowed"}
+
+
+@action(is_system_action=True)
+async def check_output_length(context: dict = None):
+    """Return "blocked" when the bot message has more than 100 whitespace-separated words, else "allowed"."""
+    bot_msg = (context or {}).get("bot_message") or ""  # None context/message counts as empty text
+    return "blocked" if len(bot_msg.split()) > 100 else "allowed"
diff --git a/tests/test_configs/simple_rails/config.yml b/tests/test_configs/simple_rails/config.yml
@@ -0,0 +1,17 @@
+models:
+  - type: main
+    engine: openai
+    model: test
+
+instructions:
+  - type: general
+    content: |
+      You are a helpful assistant.
+
+rails:
+  input:
+    flows:
+      - check forbidden words
+  output:
+    flows:
+      - check output length
diff --git a/tests/test_configs/simple_rails/rails.co b/tests/test_configs/simple_rails/rails.co
@@ -0,0 +1,19 @@
+define flow check forbidden words
+  $result = execute check_forbidden_words
+  # The action returns a dict; its `status` field is "blocked" when a forbidden word was found.
+  if $result.status == "blocked"
+    bot inform forbidden content
+    stop
+
+define bot inform forbidden content
+  "I can't answer questions about closed source AI models"
+
+define flow check output length
+  $result = execute check_output_length
+  # This action returns the plain string "blocked" or "allowed" rather than a dict.
+  if $result == "blocked"
+    bot inform output too long
+    stop
+
+define bot inform output too long
+  "The response is too long."
diff --git a/tests/test_configs/tool_rails_combined/actions.py b/tests/test_configs/tool_rails_combined/actions.py
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+from nemoguardrails.actions import action
+
+
+@action(is_system_action=True)
+async def check_forbidden_words(context: dict = None):
+    """Return a status dict saying whether the user message contains a forbidden substring."""
+    # Default is None rather than a shared mutable {}; a missing or None message is scanned as "".
+    user_message = ((context or {}).get("user_message") or "").lower()
+    forbidden_categories = {
+        "security": ["password", "hack", "exploit", "vulnerability"],
+        "inappropriate": ["violence", "illegal", "harmful"],
+        "competitors": ["chatgpt", "openai", "claude", "anthropic"],
+    }
+    # Plain substring matching; the first hit (in declaration order) decides category and word.
+    for category, words in forbidden_categories.items():
+        for word in words:
+            if word in user_message:
+                return {"status": "blocked", "category": category, "word": word}
+    # Nothing matched.
+    return {"status": "allowed"}
+
+
+@action(is_system_action=True)
+async def check_output_length(context: dict = None):
+    """Return "blocked" when the bot message has more than 100 whitespace-separated words, else "allowed"."""
+    bot_msg = (context or {}).get("bot_message") or ""  # None context/message counts as empty text
+    return "blocked" if len(bot_msg.split()) > 100 else "allowed"
+
+
+@action(is_system_action=True)
+async def check_tool_response_safety(tool_message: str = None, context: dict = None):
+    """Return "blocked" if the tool response appears to contain credentials, else "allowed".
+
+    The message comes from `tool_message`, or from `context["tool_message"]` when that argument is None.
+    """
+    if tool_message is None:
+        tool_message = context.get("tool_message", "") if context else ""
+    if not tool_message:
+        return "allowed"
+
+    credential_patterns = {
+        "password": r"password[:\s=]+\w+",
+        "api_key": r"(?:api[_\s-]?key|apikey)[:\s=]+[\w-]+",
+        "secret": r"secret[:\s=]+\w+",
+        "token": r"(?:access[_\s]?token|bearer)[:\s=]+[\w.-]+",
+        "private_key": r"-----BEGIN (?:RSA |EC )?PRIVATE KEY-----",
+    }
+
+    # Match case-insensitively on the original text: lowercasing the input while the private-key
+    # pattern stays uppercase meant PEM headers could never match.
+    if any(re.search(pattern, tool_message, re.IGNORECASE) for pattern in credential_patterns.values()):
+        return "blocked"
+    return "allowed"
+
+
+@action(is_system_action=True)
+async def check_tool_call_safety(tool_calls=None, context=None):
+    """Return "blocked" for any tool call outside the allow list or with a risky string argument.
+
+    Tool calls come from `tool_calls`, or from `context["tool_calls"]` when that argument is None.
+    Returns "allowed" when every call passes both checks (including when there are no calls).
+    """
+    if tool_calls is None:
+        tool_calls = context.get("tool_calls", []) if context else []
+
+    permitted = (
+        "get_weather",
+        "search_web",
+        "read_file",
+        "get_time",
+        "get_stock_price",
+        "calculate",
+    )
+    risky_patterns = {
+        "path_traversal": r"\.\./",
+        "command_injection": r"[;&|`$]",
+        "sql_injection": r"(?:DROP|DELETE|TRUNCATE)\s+(?:TABLE|DATABASE)",
+    }
+
+    def _is_risky(value):
+        """Tell whether a string argument matches any risky pattern; non-strings never match."""
+        return isinstance(value, str) and any(re.search(rx, value, re.IGNORECASE) for rx in risky_patterns.values())
+
+    for call in tool_calls:
+        # Allow-list check comes first, so arguments of unlisted tools are never inspected.
+        if call.get("name", "") not in permitted:
+            return "blocked"
+        if any(_is_risky(value) for value in call.get("args", {}).values()):
+            return "blocked"
+    return "allowed"
diff --git a/tests/test_configs/tool_rails_combined/config.yml b/tests/test_configs/tool_rails_combined/config.yml
@@ -0,0 +1,25 @@
+models:
+  - type: main
+    engine: openai
+    model: test
+
+instructions:
+  - type: general
+    content: |
+      You are a helpful assistant.
+
+passthrough: true
+
+rails:
+  input:
+    flows:
+      - check forbidden words
+  output:
+    flows:
+      - check output length
+  tool_input:
+    flows:
+      - check tool response safety
+  tool_output:
+    flows:
+      - check tool call safety