Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion nemoguardrails/library/jailbreak_detection/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y git gcc g++ python3-dev wget && apt-get

# Predownload embedding-based jailbreak detection models, set environment variable for path
WORKDIR /models
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
ENV EMBEDDING_CLASSIFIER_PATH=/models

# Set working directory
Expand Down
2 changes: 1 addition & 1 deletion nemoguardrails/library/jailbreak_detection/Dockerfile-GPU
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ ENV JAILBREAK_CHECK_DEVICE=cuda:0

# Predownload embedding-based jailbreak detection models, set environment variable for path
WORKDIR /models
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.pkl
RUN wget https://huggingface.co/nvidia/NemoGuard-JailbreakDetect/resolve/main/snowflake.onnx
ENV EMBEDDING_CLASSIFIER_PATH=/models

# Set working directory
Expand Down
16 changes: 12 additions & 4 deletions nemoguardrails/library/jailbreak_detection/model_based/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,19 @@ def initialize_model() -> Union[None, "JailbreakClassifier"]:
logger.warning("No embedding classifier path set. Server /model endpoint will not work.")
return None

from nemoguardrails.library.jailbreak_detection.model_based.models import (
JailbreakClassifier,
)
Path(classifier_path).mkdir(parents=True, exist_ok=True)

jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.pkl")))
# check if model is present. If not, download it.
if not Path(classifier_path).joinpath("snowflake.onnx").is_file():
from huggingface_hub import hf_hub_download

hf_hub_download(
repo_id="nvidia/NemoGuard-JailbreakDetect", filename="snowflake.onnx", local_dir=classifier_path
)
Comment on lines +48 to +50
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we pin the revision?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We certainly could. It won't hurt anything.


from .models import JailbreakClassifier

jailbreak_classifier = JailbreakClassifier(str(Path(classifier_path).joinpath("snowflake.onnx")))

return jailbreak_classifier

Expand Down
29 changes: 19 additions & 10 deletions nemoguardrails/library/jailbreak_detection/model_based/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Tuple

import numpy as np


class SnowflakeEmbed:
def __init__(self):
import torch
from transformers import AutoModel, AutoTokenizer

self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.tokenizer = AutoTokenizer.from_pretrained("Snowflake/snowflake-arctic-embed-m-long")
device = os.environ.get("JAILBREAK_CHECK_DEVICE")
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
self.device = device
self.tokenizer = AutoTokenizer.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Which part? All of that is contained in the model card instructions on how to use the model.

)
Comment thread
erickgalinkin marked this conversation as resolved.
self.model = AutoModel.from_pretrained(
"Snowflake/snowflake-arctic-embed-m-long",
trust_remote_code=True,
Expand All @@ -43,16 +49,19 @@ def __call__(self, text: str):

class JailbreakClassifier:
def __init__(self, random_forest_path: str):
import pickle
from onnxruntime import InferenceSession

self.embed = SnowflakeEmbed()
with open(random_forest_path, "rb") as fd:
self.classifier = pickle.load(fd)
# See https://onnx.ai/sklearn-onnx/auto_examples/plot_convert_decision_function.html
self.classifier = InferenceSession(random_forest_path, providers=["CPUExecutionProvider"])

def __call__(self, text: str) -> Tuple[bool, float]:
e = self.embed(text)
probs = self.classifier.predict_proba([e])
classification = np.argmax(probs)
prob = np.max(probs)
res = self.classifier.run(None, {"X": [e]})
# InferenceSession returns a result where the first item is equivalent to argmax over probabilities
classification = res[0].item()
# The second is a list of dicts of probabilities -- the slice res[1][:2] should have only one element.
# We access the dict entry for the class.
prob = res[1][0][classification]
score = -prob if classification == 0 else prob
return bool(classification), float(score)
13 changes: 7 additions & 6 deletions nemoguardrails/library/jailbreak_detection/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ fastapi>=0.103.1
starlette>=0.50.0
typer>=0.7.0
uvicorn>=0.23.2
transformers>=4.57.6
torch>=2.9.1
nemoguardrails>=0.14.0
numpy==1.23.5
scikit-learn==1.2.2
einops>=0.7.0
transformers>=5.3.0
torch>=2.9.0
torchvision>=0.25.0
Comment thread
erickgalinkin marked this conversation as resolved.
numpy==1.26.4
einops>=0.8.2
onnxruntime>=1.24.3
huggingface_hub>=1.0
71 changes: 61 additions & 10 deletions tests/test_jailbreak_model_based.py
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dead sklearn monkeypatch and stale test intent, as noted in the inline comment below.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No test covers the new hf_hub_download branch in initialize_model(). It would be great to add a patched test that asserts:

  • no download when the file exists
  • one call to hf_hub_download with the expected args when it does not.

Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@

def test_lazy_import_does_not_require_heavy_deps():
"""
Importing the checks module should not require torch, transformers, or sklearn unless model-based classifier is used.
Importing the checks module should not require torch, transformers, or onnxruntime unless model-based classifier is used.
"""
with mock.patch.dict(sys.modules, {"torch": None, "transformers": None, "sklearn": None}):
with mock.patch.dict(sys.modules, {"torch": None, "transformers": None, "onnxruntime": None}):
import nemoguardrails.library.jailbreak_detection.model_based.checks as checks

# Just importing and calling unrelated functions should not raise ImportError
Expand All @@ -38,20 +38,20 @@ def test_lazy_import_does_not_require_heavy_deps():

def test_model_based_classifier_imports(monkeypatch):
"""
Instantiating JailbreakClassifier should require sklearn and pickle, and use SnowflakeEmbed which requires torch/transformers.
Instantiating JailbreakClassifier should require onnxruntime, and use SnowflakeEmbed which requires torch/transformers.
"""
# Mock dependencies
fake_rf = mock.MagicMock()
fake_embed = mock.MagicMock(return_value=[0.0])
fake_pickle = types.SimpleNamespace(load=mock.MagicMock(return_value=fake_rf))
fake_onnx = types.SimpleNamespace(InferenceSession=mock.MagicMock(return_value=fake_rf))
fake_snowflake = mock.MagicMock(return_value=fake_embed)

monkeypatch.setitem(
sys.modules,
"sklearn.ensemble",
types.SimpleNamespace(RandomForestClassifier=mock.MagicMock()),
)
Comment on lines 49 to 53
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stale patch: this sklearn.ensemble entry is dead now that the classifier is loaded through onnxruntime.

monkeypatch.setitem(sys.modules, "pickle", fake_pickle)
monkeypatch.setitem(sys.modules, "onnxruntime", fake_onnx)
monkeypatch.setitem(sys.modules, "torch", mock.MagicMock())
monkeypatch.setitem(sys.modules, "transformers", mock.MagicMock())

Expand All @@ -64,7 +64,7 @@ def test_model_based_classifier_imports(monkeypatch):
mock_open = mock.mock_open()
with mock.patch("builtins.open", mock_open):
# Should not raise
classifier = models.JailbreakClassifier("fake_model_path.pkl")
classifier = models.JailbreakClassifier("fake_model_path.onnx")
assert classifier is not None
# Should be callable
result = classifier("test")
Expand All @@ -76,17 +76,17 @@ def test_model_based_classifier_imports(monkeypatch):

def test_model_based_classifier_missing_deps(monkeypatch):
"""
If sklearn is missing, instantiating JailbreakClassifier should raise ImportError.
If onnxruntime is missing, instantiating JailbreakClassifier should raise ImportError.
"""
monkeypatch.setitem(sys.modules, "sklearn.ensemble", None)
monkeypatch.setitem(sys.modules, "onnxruntime", None)

import nemoguardrails.library.jailbreak_detection.model_based.models as models

# to avoid Windows permission issues
mock_open = mock.mock_open()
with mock.patch("builtins.open", mock_open):
with pytest.raises(ImportError):
models.JailbreakClassifier("fake_model_path.pkl")
models.JailbreakClassifier("fake_model_path.onnx")


# Test 4: Return None when EMBEDDING_CLASSIFIER_PATH is not set
Expand Down Expand Up @@ -253,10 +253,61 @@ def test_initialize_model_with_valid_path(monkeypatch):

assert result == mock_classifier

expected_path = str(Path(test_path).joinpath("snowflake.pkl"))
expected_path = str(Path(test_path).joinpath("snowflake.onnx"))
mock_jailbreak_classifier_class.assert_called_once_with(expected_path)


def test_initialize_model_skips_hf_hub_download_when_snowflake_onnx_exists(monkeypatch, tmp_path):
    """
    A snowflake.onnx file already sitting in EMBEDDING_CLASSIFIER_PATH must not trigger hf_hub_download.
    """
    import nemoguardrails.library.jailbreak_detection.model_based.checks as checks_module

    # Drop any cached result so initialize_model() is evaluated under this test's environment.
    checks_module.initialize_model.cache_clear()

    # Replace the classifier class with a factory that hands back a known sentinel object.
    sentinel_classifier = mock.MagicMock()
    classifier_factory = mock.MagicMock(return_value=sentinel_classifier)
    monkeypatch.setattr(
        "nemoguardrails.library.jailbreak_detection.model_based.models.JailbreakClassifier",
        classifier_factory,
    )

    # An empty placeholder is enough: only the presence of the file is being exercised here.
    model_file = tmp_path / "snowflake.onnx"
    model_file.write_bytes(b"")
    monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", str(tmp_path))

    with mock.patch("huggingface_hub.hf_hub_download") as download_spy:
        loaded = checks_module.initialize_model()

    assert loaded is sentinel_classifier
    download_spy.assert_not_called()


def test_initialize_model_calls_hf_hub_download_when_snowflake_onnx_missing(monkeypatch, tmp_path):
    """
    With no snowflake.onnx on disk, hf_hub_download is called exactly once with the NemoGuard repo,
    the model filename, and the configured directory.
    """
    import nemoguardrails.library.jailbreak_detection.model_based.checks as checks_module

    # Drop any cached result so initialize_model() is evaluated under this test's environment.
    checks_module.initialize_model.cache_clear()

    # Replace the classifier class with a factory that hands back a known sentinel object.
    sentinel_classifier = mock.MagicMock()
    classifier_factory = mock.MagicMock(return_value=sentinel_classifier)
    monkeypatch.setattr(
        "nemoguardrails.library.jailbreak_detection.model_based.models.JailbreakClassifier",
        classifier_factory,
    )

    # tmp_path is left without a snowflake.onnx file, so the model is "missing".
    monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", str(tmp_path))

    with mock.patch("huggingface_hub.hf_hub_download") as download_spy:
        loaded = checks_module.initialize_model()

    assert loaded is sentinel_classifier

    expected_kwargs = {
        "repo_id": "nvidia/NemoGuard-JailbreakDetect",
        "filename": "snowflake.onnx",
        "local_dir": str(tmp_path),
    }
    download_spy.assert_called_once_with(**expected_kwargs)


# Test 10: Test that NvEmbedE5 class no longer exists


Expand Down
Loading