NVIDIA-NeMo
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 81 additions & 3 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 81 additions & 3 deletions
diff --git a/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md‎
Lines changed: 21 additions & 0 deletions b/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/README.md‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env‎
Lines changed: 19 additions & 0 deletions b/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/app-llm.env‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env‎
Lines changed: 19 additions & 0 deletions b/‎plugins/nemo-guardrails/benchmarks/configs/mock_llm/content-safety-llm.env‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md‎
Lines changed: 154 additions & 0 deletions b/‎plugins/nemo-guardrails/benchmarks/results/local-baseline-2026-06-16.md‎
Lines changed: 154 additions & 0 deletions
diff --git a/‎plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py‎
Lines changed: 10 additions & 2 deletions b/‎plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/aiperf_runner.py‎
Lines changed: 10 additions & 2 deletions
@@ -1072,10 +1072,21 @@ jobs:
           path: web/packages/studio/playwright-report/
 
   benchmark-guardrails:
-    name: Guardrails plugin benchmark
+    # Run the two benchmark variants as parallel matrix jobs, each with its
+    # own NMP instance. This isolates them from each other (no shared mocks,
+    # no cross-talk on :8080) and roughly halves wall-clock vs. the previous
+    # single-job sequential layout. The `benchmark-guardrails-analyze` job
+    # below merges both artifacts and prints the with-vs-without comparison.
+    name: Guardrails plugin benchmark (${{ matrix.variant }})
     if: github.event_name == 'workflow_dispatch'
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    strategy:
+      # Don't cancel the other variant if one fails; the partial artifact
+      # is still useful for diagnosing what went wrong.
+      fail-fast: false
+      matrix:
+        variant: [with-guardrails, without-guardrails]
     steps:
       - name: Checkout nemo-platform
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
@@ -1102,15 +1113,81 @@ jobs:
           PYTORCH_DEPS: cpu
       - name: Run benchmark sweep
         working-directory: nemo-platform
-        run: make benchmark-guardrails
+        # Pin both variants to the same `--run-id` so when the analyze job
+        # downloads both artifacts into one `runs/` parent, they merge into
+        # a single run directory the analyzer can read normally.
+        run: |
+          make benchmark-guardrails BENCHMARK_ARGS="\
+            --variant ${{ matrix.variant }} \
+            --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}"
         env:
           NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails
           _TYPER_FORCE_DISABLE_TERMINAL: "1"
       - name: Upload benchmark artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: benchmark-guardrails-results
+          # Per-variant artifact name; GHA disallows two artifacts with the
+          # same name in one workflow run.
+          name: benchmark-guardrails-results-${{ matrix.variant }}
+          retention-days: 30
+          path: |
+            nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+
+  benchmark-guardrails-analyze:
+    # Joins the two parallel matrix jobs above, downloads both artifacts into
+    # one merged `runs/<id>/` tree, and prints the with-vs-without
+    # comparison table. No regression gate yet: this exists to produce
+    # CI-collected numbers we can use to seed a baseline file later.
+    name: Guardrails benchmark analysis
+    needs: [benchmark-guardrails]
+    if: github.event_name == 'workflow_dispatch' && !cancelled()
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout nemo-platform
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          path: nemo-platform
+      - name: Download with-guardrails artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: benchmark-guardrails-results-with-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Download without-guardrails artifact
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: benchmark-guardrails-results-without-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Install uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
+        with:
+          working-directory: nemo-platform
+          python-version: "3.11"
+          enable-cache: true
+      - name: Bootstrap Python environment
+        working-directory: nemo-platform
+        run: make bootstrap-python
+        env:
+          PYTORCH_DEPS: cpu
+      - name: Print benchmark comparison
+        working-directory: nemo-platform
+        # Both matrix jobs above use the same `--run-id`, so there should be
+        # exactly one merged run directory under `runs/`. `ls -td ... | head -1`
+        # picks the most recently modified directory, as a defensive fallback in
+        # case that stops being true (e.g. an artifact leaks in from a previous workflow attempt).
+        run: |
+          RUN_DIR=$(ls -td plugins/nemo-guardrails/benchmarks/artifacts/runs/*/ | head -1)
+          echo "Analyzing run directory: $RUN_DIR"
+          uv run --frozen --package nemo-guardrails-plugin --extra bench \
+            python -m nemo_guardrails_plugin.benchmarks.analyze "$RUN_DIR"
+      - name: Upload merged benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          # Single merged artifact so collecting baseline samples is a matter
+          # of downloading one thing per workflow run rather than two.
+          name: benchmark-guardrails-results-merged
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
@@ -1262,6 +1339,7 @@ jobs:
       - web-studio-deps
       - web-studio-e2e
       - benchmark-guardrails
+      - benchmark-guardrails-analyze
       - opa-policy-test
     if: always()
     runs-on: ubuntu-latest
 
@@ -0,0 +1,21 @@
+# Mock LLM configurations
+
+These `.env` files configure the upstream `benchmark.mock_llm_server.run_server`
+(from the `NeMo-Guardrails` checkout) for the IGW guardrails benchmark.
+
+We keep our own copies (instead of pointing at the upstream checkout's
+`benchmark/mock_llm_server/configs/`) so:
+
+- We can change mock latency without touching the upstream repo. The original
+  motivation was tuning `E2E_LATENCY_*` to isolate NMP+middleware overhead
+  from mandatory NIM sleep (see the benchmark README for the full rationale).
+- The exact mock behavior we benchmarked against is versioned alongside the
+  results, so historical numbers stay reproducible even if upstream changes
+  its defaults.
+
+Initial contents are a verbatim copy of the upstream files:
+
+- `app-llm.env`            ← upstream `meta-llama-3.3-70b-instruct.env`
+- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
+
+Update either file to change mock behavior for the next benchmark run.
@@ -0,0 +1,19 @@
+MODEL="meta/llama-3.3-70b-instruct"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
+SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=4.0
+E2E_LATENCY_MAX_SECONDS=4.0
+E2E_LATENCY_MEAN_SECONDS=4.0
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.3
+TTFT_MAX_SECONDS=0.3
+TTFT_MEAN_SECONDS=0.3
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
@@ -0,0 +1,19 @@
+MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
+SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=0.5
+E2E_LATENCY_MAX_SECONDS=0.5
+E2E_LATENCY_MEAN_SECONDS=0.5
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.2
+TTFT_MAX_SECONDS=0.2
+TTFT_MEAN_SECONDS=0.2
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
@@ -0,0 +1,154 @@
+# Local baseline results — 2026-06-16
+
+Three back-to-back runs of `make benchmark-guardrails` on a local MacBook Pro
+(Apple Silicon), no other heavy workloads running. Goal: characterize the
+run-to-run variance of the new with-guardrails / without-guardrails harness so
+we can decide what's gateable in CI.
+
+## Hardware / setup
+
+- Host: MacBook Pro, Apple Silicon, on AC power
+- NMP, mocks, shim: all on localhost
+- Mock LLM config: in-repo defaults (`plugins/nemo-guardrails/benchmarks/configs/mock_llm/`)
+  - app LLM: 4.0s e2e latency, std 0
+  - content-safety LLM: 0.5s e2e latency, std 0
+- AIPerf sweep: concurrency `[1, 2, 4, 8, 16, 32, 64]`, `benchmark_duration: 60s`,
+  `warmup_request_count: 10`, non-streaming chat completions
+- Mock workers: 4 (default)
+- Three runs in the same afternoon, NMP data dir reused across runs
+
+## Run inventory
+
+| Run | Run dir | Notes |
+|---|---|---|
+| 1 | `20260616_123851` | first run after the with/without harness change |
+| 2 | `20260616_145058` | identical config |
+| 3 | `20260616_152834` | identical config |
+
+All three runs completed with 7/7 sweeps passing per variant, exit code 0.
+
+## Δp50 (with-guardrails − without-guardrails), milliseconds
+
+This is the headline metric: how much wall-clock time the guardrails middleware
+adds on top of the bare NMP+IGW path, including the two content-safety LLM
+round-trips that the rails trigger but that are served by the content-safety mock, not the middleware.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64    |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|--------:|
+| Run 1   | 1029 | 1071 | 1068 | 1104 | 1145 | 1260 |    778  |
+| Run 2   | 1027 | 1062 | 1096 | 1105 | 1226 | 1256 |  -2896  |
+| Run 3   | 1030 | 1062 | 1079 | 1070 | 1118 | 1201 |  -2077  |
+| **mean**| **1029** | **1065** | **1081** | **1093** | **1163** | **1239** | **−1398** |
+| range   |    3 |    9 |   28 |   35 |  108 |   59 |   3674  |
+| range % | 0.3% | 0.8% | 2.6% | 3.2% | 9.3% | 4.8% |   n/a   |
+
+## with-guardrails p50 (absolute), milliseconds
+
+Useful as a sanity check that nothing catastrophic shifted in the absolute
+numbers — even if Δp50 stays steady, both variants could slow down together.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Run 1   | 5049 | 5101 | 5114 | 5152 | 5201 | 5318 | 6164 |
+| Run 2   | 5048 | 5093 | 5125 | 5137 | 5255 | 5279 | 5614 |
+| Run 3   | 5050 | 5094 | 5123 | 5146 | 5163 | 5250 | 5486 |
+| **mean**| **5049** | **5096** | **5121** | **5145** | **5206** | **5282** | **5755** |
+| range   |    2 |    8 |   11 |   15 |   92 |   68 |  678 |
+| range % | 0.0% | 0.2% | 0.2% | 0.3% | 1.8% | 1.3% | 11.8%|
+
+## without-guardrails p50 (absolute), milliseconds
+
+For completeness. This is the variant that's wildly unstable at c=64.
+
+| Run     | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64 |
+|---------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Run 1   | 4020 | 4030 | 4045 | 4048 | 4056 | 4058 | 5386 |
+| Run 2   | 4020 | 4031 | 4029 | 4032 | 4029 | 4023 | 8510 |
+| Run 3   | 4020 | 4032 | 4044 | 4076 | 4045 | 4049 | 7563 |
+| **mean**| **4020** | **4031** | **4039** | **4052** | **4043** | **4043** | **7153** |
+| range   |    0 |    2 |   16 |   44 |   27 |   35 | 3124 |
+
+The app mock sleeps for exactly 4.0s. The ~20–80 ms above 4000 across c=1–c=32
+is pure NMP+IGW+shim overhead. At c=64 the 4-worker mock appears to saturate under
+64 in-flight requests and they queue. The ceiling is unmeasured: it is not 4 RPS, since c=32 sustains ~8 RPS (32 / ~4.04 s).
+
+## p90 — informational only
+
+p90 is much noisier than p50 across runs. Not gateable with three samples.
+
+### Δp90, milliseconds
+
+| Run   | c=1  | c=2  | c=4  | c=8  | c=16 | c=32 | c=64  |
+|-------|-----:|-----:|-----:|-----:|-----:|-----:|------:|
+| Run 1 | 1039 | 1099 | 1162 | 1025 |  911 |  604 | 3009  |
+| Run 2 | 1028 | 1115 | 1160 | 1262 |  783 |  641 | 1015  |
+| Run 3 | 1023 | 1076 | 1189 | 1085 | 1209 |   18 | 1998  |
+
+## Observations
+
+### What's stable enough to gate on
+
+**c=1, 2, 4, 8.** The Δp50 ranges are 3–35 ms, well under any tolerance we'd
+realistically write. The absolute with-guardrails p50 is even tighter (2–15 ms
+across three runs). This is the regime where the harness is genuinely measuring
+what we want: NMP+middleware overhead on top of fixed-latency mocks.
+
+### What's borderline
+
+**c=16.** Δp50 range is 9.3%. Gateable with a generous tolerance (~10%+) but
+adds limited signal beyond c=8.
+
+### What's not gateable
+
+**c=32.** ~5% Δp50 range. Still bounded, but the run-to-run range (59 ms) is
+roughly twice that at c=4–c=8 (28–35 ms) and the absolute numbers wobble too.
+
+**c=64.** Unusable. Δp50 swings from +778 to −2896 across three runs.
+The likely root cause (not yet verified) is the app mock's 4-worker saturation at this load level: the
+without-guardrails path fires app requests as fast as it can and the mock queues
+unpredictably. The with-guardrails path's CS-mock work paces requests enough to
+hide most of this. This is a test-rig artifact, not an NMP behavior.
+
+### Side observation: middleware overhead is small
+
+Of the ~1029 ms Δp50 at c=1:
+- ~1000 ms is the two content-safety mock round-trips (0.5s each, mandatory).
+- ~29 ms is the middleware's *own* work (rails orchestration, request/response
+  shaping, etc.) plus bare NMP+IGW overhead delta vs. without-guardrails.
+
+The without-guardrails baseline of ~4020 ms at c=1 against a 4000 ms mock means
+**bare NMP+IGW+shim overhead is ~20 ms** at idle.
+
+## Recommendation for the CI gate
+
+Based on the variance data above:
+
+| Concurrency | Gate Δp50? | Gate absolute with-guardrails p50? | Notes |
+|---|---|---|---|
+| 1  | yes | yes | tightest signal |
+| 2  | yes | yes | |
+| 4  | yes | yes | |
+| 8  | yes | yes | |
+| 16 | informational | informational | record but don't fail |
+| 32 | informational | informational | record but don't fail |
+| 64 | exclude | exclude | mock saturation, not gateable |
+
+Proposed tolerance bands (`max(absolute_ms, relative_%)`):
+- Δp50: `max(±100 ms, ±5%)`
+- with-guardrails p50: `max(±150 ms, ±3%)`
+
+The Δp50 band is ~3× the largest local run-to-run range in the gated c=1–c=8 regime (35 ms) and the
+with-guardrails band is ~10× (15 ms), leaving headroom for CI hardware being noisier than a quiet laptop.
+
+## Open questions / followups
+
+- **Local baselines won't transfer to CI hardware.** These numbers should seed
+  the baseline file but be replaced once we have N runs from the actual CI
+  runner class.
+- **Three samples is a small N.** Worth one more local run (Run 4) before we
+  treat the means above as canonical, but the c=1–c=8 numbers are unlikely
+  to budge meaningfully.
+- **c=64 instability is downstream of NMP.** Hypothesis: app mock's 4 workers
+  saturate at concurrency 64 (ceiling unmeasured, but above the ~8 RPS seen at c=32). Easy to test by
+  running with `--mock-workers 16`. Not blocking the gate work since c=64 is
+  excluded anyway.
@@ -60,12 +60,15 @@ def prepare_runtime_aiperf_config(
     template_path: Path,
     runtime_config_path: Path,
     aiperf_output_dir: Path,
+    model_ref: str | None = None,
 ) -> dict[str, Any]:
     """Materialize the AIPerf config this run will use.
 
     Reads the checked-in ``template_path`` config, overrides its
-    ``output_base_dir`` to point inside the current run's directory, and writes
-    the result to ``runtime_config_path``. AIPerf is later invoked with
+    ``output_base_dir`` to point inside the current run's directory, optionally
+    overrides ``base_config.model`` (so the same template can target multiple
+    VirtualModels in one harness invocation), and writes the result to
+    ``runtime_config_path``. AIPerf is later invoked with
     ``--config-file <runtime_config_path>`` so every artifact lands under a
     separate per-run directory.
 
@@ -82,6 +85,11 @@ def prepare_runtime_aiperf_config(
     # Point AIPerf's output_base_dir at this run's directory so its results
     # nest under our per-run artifacts tree.
     config["output_base_dir"] = str(aiperf_output_dir)
+    if model_ref is not None:
+        base_config = config.get("base_config")
+        if not isinstance(base_config, dict):
+            raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}")
+        base_config["model"] = model_ref
     runtime_config_path.parent.mkdir(parents=True, exist_ok=True)
     runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")