NVIDIA-NeMo · JashG · Jun 16, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -40,6 +40,9 @@ outputs:
   k8s-smoke:
     description: "'true' if Kubernetes smoke test support files changed"
     value: ${{ steps.filter.outputs.k8s-smoke }}
+  guardrails-benchmark:
+    description: "'true' if the nemo-guardrails plugin or guardrails service changed"
+    value: ${{ steps.filter.outputs.guardrails-benchmark }}
   cpu-smoke:
     description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed"
     value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }}
@@ -97,3 +100,6 @@ runs:
             - 'e2e/k8s/values/**'
             - 'e2e/test_jobs.py'
             - '.github/actions/free-disk-space/action.yaml'
+          guardrails-benchmark:
+            - 'plugins/nemo-guardrails/**'
+            - 'services/guardrails/**'
@@ -44,6 +44,7 @@ jobs:
       docker: ${{ steps.changes.outputs.docker }}
       helm: ${{ steps.changes.outputs.helm }}
       cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }}
+      guardrails-benchmark: ${{ steps.changes.outputs.guardrails-benchmark }}
     steps:
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
       - uses: ./.github/actions/changes
@@ -1071,21 +1072,36 @@ jobs:
           retention-days: 7
           path: web/packages/studio/playwright-report/
 
-  benchmark-guardrails:
-    name: Guardrails plugin benchmark
-    if: github.event_name == 'workflow_dispatch'
+  guardrails-benchmark:
+    # Parallel matrix jobs (one NMP per variant) so the two sweeps don't
+    # share mocks or contend on :8080. `guardrails-benchmark-analyze` merges
+    # the artifacts and prints the comparison.
+    name: nemo-guardrails plugin benchmark (${{ matrix.variant }})
+    needs: [changes]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
     runs-on: ubuntu-latest
     timeout-minutes: 30
+    strategy:
+      # Keep the partial artifact if one variant fails.
+      fail-fast: false
+      matrix:
+        variant: [with-guardrails, without-guardrails]
     steps:
       - name: Checkout nemo-platform
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           path: nemo-platform
+          persist-credentials: false
       - name: Checkout NeMo-Guardrails
         uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           repository: NVIDIA/NeMo-Guardrails
           path: NeMo-Guardrails
+          persist-credentials: false
       - name: Install uv
         uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0
         with:
@@ -1102,15 +1118,71 @@ jobs:
           PYTORCH_DEPS: cpu
       - name: Run benchmark sweep
         working-directory: nemo-platform
-        run: make benchmark-guardrails
+        # Pin both variants to the same `--run-id` so when the analyze job
+        # downloads both artifacts into one `runs/` parent, they merge into
+        # a single run directory the analyzer can read normally.
+        run: |
+          make benchmark-guardrails BENCHMARK_ARGS="\
+            --variant ${{ matrix.variant }} \
+            --run-id ci-${{ github.run_id }}-${{ github.run_attempt }}"
         env:
           NEMO_GUARDRAILS_REPO_ROOT: ${{ github.workspace }}/NeMo-Guardrails
           _TYPER_FORCE_DISABLE_TERMINAL: "1"
       - name: Upload benchmark artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
-          name: benchmark-guardrails-results
+          # Ensure we use a unique artifact name per benchmark variant.
+          name: guardrails-benchmark-results-${{ matrix.variant }}
+          retention-days: 30
+          path: |
+            nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+
+  guardrails-benchmark-analyze:
+    # Merge both variant artifacts and print the comparison table.
+    name: nemo-guardrails plugin benchmark analysis
+    needs: [changes, guardrails-benchmark]
+    if: >
+      !cancelled() && (
+        github.event_name == 'workflow_dispatch' ||
+        needs.changes.outputs.guardrails-benchmark == 'true'
+      )
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - name: Checkout nemo-platform
+        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          path: nemo-platform
+          persist-credentials: false
+      - name: Download with-guardrails artifact
+        # If a variant failed entirely it may have uploaded no artifact;
+        # the analyzer handles the single-variant case so don't fail here.
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: guardrails-benchmark-results-with-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Download without-guardrails artifact
+        continue-on-error: true
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: guardrails-benchmark-results-without-guardrails
+          path: nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
+      - name: Print benchmark comparison
+        working-directory: nemo-platform
+        # `analyze.py` doesn't rely on NMP or AIPerf, so we skip the uv bootstrap
+        # step and run it with the runner's `python3` CLI directly.
+        run: |
+          RUN_DIR=$(find plugins/nemo-guardrails/benchmarks/artifacts/runs -mindepth 1 -maxdepth 1 -type d -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-)
+          echo "Analyzing run directory: $RUN_DIR"
+          python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py "$RUN_DIR" --strict
+      - name: Upload merged benchmark artifacts
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          # Single artifact so baseline collection is one download per run.
+          name: guardrails-benchmark-results-merged
           retention-days: 30
           path: |
             nemo-platform/plugins/nemo-guardrails/benchmarks/artifacts/runs/
@@ -1261,7 +1333,6 @@ jobs:
       - web-sdk-gen
       - web-studio-deps
       - web-studio-e2e
-      - benchmark-guardrails
       - opa-policy-test
     if: always()
     runs-on: ubuntu-latest

@@ -15,9 +15,11 @@ benchmark modules with `PYTHONPATH` pointed at that checkout.
 plugins/nemo-guardrails/benchmarks/
   configs/
     nmp_igw_guardrails_sweep_concurrency.yaml   # AIPerf sweep template
+    mock_llm/                                   # in-repo mock LLM env files
   artifacts/                                    # per-run outputs (gitignored)
 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/
   run.py             # entrypoint: `python -m nemo_guardrails_plugin.benchmarks.run`
+  analyze.py         # post-run analysis; checks latencies against baseline values
   paths.py           # filesystem layout
   constants.py       # workspace / VM / provider names
   processes.py       # subprocess supervision (process groups + ExitStack)
@@ -181,15 +183,76 @@ plugins/nemo-guardrails/benchmarks/artifacts/runs/<timestamp>/
 
 ## CI
 
-A `benchmark-guardrails` job in `.github/workflows/ci.yaml` checks out both
-this repo and `NVIDIA/NeMo-Guardrails`, runs `make bootstrap-python` and
-`make benchmark-guardrails`, and uploads the per-run artifacts directory
-(`logs/`, `generated/`, `aiperf_results/`) on success or failure.
+Two jobs in `.github/workflows/ci.yaml`:
 
-Pass/fail is driven by the harness's exit code, which is non-zero if `aiperf`
-itself exits non-zero or any sweep returns a non-zero exit code. No latency
-thresholds are enforced — those can be layered on later by a separate
-analyzer that reads the per-sweep CSVs.
+- `guardrails-benchmark` — matrix of two parallel jobs, one per variant
+  (`with-guardrails`, `without-guardrails`), each on its own NMP instance.
+  Uploads per-variant artifacts (`logs/`, `generated/`, `aiperf_results/`).
+- `guardrails-benchmark-analyze` — joins the two matrix jobs, downloads both
+  artifacts, prints a side-by-side comparison via
+  `nemo_guardrails_plugin.benchmarks.analyze`, and runs the baseline check
+  (see below). Fails the build on a latency regression beyond tolerance. The
+  analyzer is stdlib-only by design, so this job runs on the runner's stock
+  `python3` without bootstrapping the uv workspace.
+
+### Baseline and gating
+
+CI compares the run's delta_p50 (with-guardrails minus without-guardrails
+p50, in ms) against a checked-in baseline. The baseline lives as
+module-level constants in:
+
+```text
+plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py
+```
+
+Why only delta_p50 (and not absolute with-guardrails p50)? delta_p50
+isolates the middleware's contribution — shared CI runner noise cancels
+across the two variants.
+
+#### Baseline constants
+
+- `CONCURRENCIES_TO_VALIDATE: list[int]` — concurrency levels to gate on.
+  Other levels still appear in the analyzer's output tables, but pass/fail
+  is decided only by these.
+- `DEFAULT_DELTA_P50_TOLERANCE_MS: int` — default tolerance (in ms) applied
+  to every validated concurrency. A check fails when
+  `|observed - baseline| > tolerance`.
+- `DELTA_P50_TOLERANCE_OVERRIDES_MS: dict[int, int]` — per-concurrency
+  tolerance overrides (in ms). Levels without an override fall back to the
+  default.
+- `DELTA_P50_BASELINE_BY_CONCURRENCY: dict[int, int]` — expected delta_p50
+  (in ms) per concurrency level. Edit by hand when a real change shifts
+  the numbers.
+
+Worked example: at c=16 the override is 200 ms, so a run with observed
+delta_p50 = 1589 (diff +199 from baseline 1390) passes; observed
+delta_p50 = 1591 (diff +201) fails.
+
+Notes on the current values:
+
+- c=16 and c=32 use wider tolerances than the default because their
+  absolute delta_p50 is larger. Over time, we can tighten these values
+  if latencies in CI produce less variance.
+- Any change to mock-LLM latencies, the guardrails config, or the runner
+  class invalidates the current baseline values. The benchmark should be
+  re-run in CI several times to establish updated baseline values.
+
+#### Running the analyzer locally
+
+```bash
+python3 plugins/nemo-guardrails/src/nemo_guardrails_plugin/benchmarks/analyze.py \
+    plugins/nemo-guardrails/benchmarks/artifacts/runs/<run-id>
+```
+
+Local runs print both tables and the baseline-check table.
+CI passes `--strict` to make any out-of-tolerance check fail the job.
+
+#### Updating the baseline
+
+When a real change shifts the numbers (e.g., a deliberate middleware change,
+a mock-LLM config change, or a runner-class change), edit the constants at
+the top of `analyze.py` by hand and reference the PR / CI run that
+justifies it in the commit.
 
 ## Cleanup
 

@@ -0,0 +1,15 @@
+# Mock LLM configurations
+
+These `.env` files configure the behavior of the mock LLMs, used by the upstream
+`nemo-guardrails` library's `benchmark.mock_llm_server.run_server`.
+
+The upstream library ships these files too, but we keep our own copies so that:
+
+- We can change mock latency without touching the upstream repo.
+- The exact mock behavior we benchmarked against is versioned alongside the
+  results, so historical numbers stay reproducible even if upstream changes
+  its defaults.
+
+Mapping to upstream files:
+- `app-llm.env`            ← upstream `meta-llama-3.3-70b-instruct.env`
+- `content-safety-llm.env` ← upstream `nvidia-llama-3.1-nemoguard-8b-content-safety.env`
@@ -0,0 +1,19 @@
+MODEL="meta/llama-3.3-70b-instruct"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="I can't help with that. Is there anything else I can assist you with?"
+SAFE_TEXT="I can provide information and help with a wide range of topics, from science and history to entertainment and culture. I can also help with language-related tasks, such as translation and text summarization. However, I can't assist with requests that involve harm or illegal activities."
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=4.0
+E2E_LATENCY_MAX_SECONDS=4.0
+E2E_LATENCY_MEAN_SECONDS=4.0
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.3
+TTFT_MAX_SECONDS=0.3
+TTFT_MEAN_SECONDS=0.3
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
@@ -0,0 +1,19 @@
+MODEL="nvidia/llama-3.1-nemoguard-8b-content-safety"
+UNSAFE_PROBABILITY=0.0
+UNSAFE_TEXT="{\"User Safety\": \"unsafe\", \"Response Safety\": \"unsafe\", \"Safety Categories\": \"Violence, Criminal Planning/Confessions\"}"
+SAFE_TEXT="{\"User Safety\": \"safe\", \"Response Safety\": \"safe\"}"
+# End-to-end latency
+E2E_LATENCY_MIN_SECONDS=0.5
+E2E_LATENCY_MAX_SECONDS=0.5
+E2E_LATENCY_MEAN_SECONDS=0.5
+E2E_LATENCY_STD_SECONDS=0.0
+# Streaming latency: Time to First Token (TTFT)
+TTFT_MIN_SECONDS=0.2
+TTFT_MAX_SECONDS=0.2
+TTFT_MEAN_SECONDS=0.2
+TTFT_STD_SECONDS=0.0
+# Streaming latency: Chunk Latency (ITL)
+CHUNK_LATENCY_MIN_SECONDS=0.015
+CHUNK_LATENCY_MAX_SECONDS=0.015
+CHUNK_LATENCY_MEAN_SECONDS=0.015
+CHUNK_LATENCY_STD_SECONDS=0.0
@@ -60,17 +60,15 @@ def prepare_runtime_aiperf_config(
     template_path: Path,
     runtime_config_path: Path,
     aiperf_output_dir: Path,
+    model_ref: str | None = None,
 ) -> dict[str, Any]:
     """Materialize the AIPerf config this run will use.
 
-    Reads the checked-in ``template_path`` config, overrides its
-    ``output_base_dir`` to point inside the current run's directory, and writes
-    the result to ``runtime_config_path``. AIPerf is later invoked with
-    ``--config-file <runtime_config_path>`` so every artifact lands under a
-    separate per-run directory.
-
-    Returns the parsed config dict so callers can log fields (sweep params,
-    benchmark_duration) without re-reading the file.
+    Reads ``template_path``, overrides ``output_base_dir`` (so AIPerf
+    artifacts nest under this run) and optionally ``base_config.model``
+    (so one template can target multiple VirtualModels), and writes the
+    result to ``runtime_config_path``. Returns the parsed config so
+    callers can log sweep params without re-reading the file.
     """
     if not template_path.is_file():
         raise FileNotFoundError(f"AIPerf template not found: {template_path}")
@@ -82,6 +80,11 @@ def prepare_runtime_aiperf_config(
     # Point AIPerf's output_base_dir at this run's directory so its results
     # nest under our per-run artifacts tree.
     config["output_base_dir"] = str(aiperf_output_dir)
+    if model_ref is not None:
+        base_config = config.get("base_config")
+        if not isinstance(base_config, dict):
+            raise ValueError(f"Expected `base_config` mapping in {template_path}, got {type(base_config).__name__}")
+        base_config["model"] = model_ref
     runtime_config_path.parent.mkdir(parents=True, exist_ok=True)
     runtime_config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")