diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml index 92ffe445b..873eccd6e 100644 --- a/.github/copy-pr-bot.yaml +++ b/.github/copy-pr-bot.yaml @@ -1,14 +1,13 @@ # copy-pr-bot configuration for NVIDIA-AI-Blueprints/rag # Docs: https://docs.gha-runners.nvidia.com/platform/apps/copy-pr-bot/ # -# All NVIDIA org members with write access are auto-trusted and auto-vetters. -# No individual names needed — scales to any number of contributors. -# Commit signing required for trusted-change classification. -# See: https://docs.github.com/en/authentication/managing-commit-signature-verification -# -# Only add to additional_vetters if someone needs vetting rights -# but has read-only repo access (rare for internal repos). +# additional_trustees: users explicitly trusted even if not org members. +# Needed when author_association is CONTRIBUTOR (not MEMBER of NVIDIA-AI-Blueprints org). enabled: true auto_sync_draft: false auto_sync_ready: true + +additional_trustees: + - vidushig-nv + - richa-nvidia diff --git a/.github/skill-eval/AGENTS.md b/.github/skill-eval/AGENTS.md index aa605fcc9..d8b0d6e64 100644 --- a/.github/skill-eval/AGENTS.md +++ b/.github/skill-eval/AGENTS.md @@ -22,6 +22,18 @@ find /tmp/skill-eval/results -mindepth 1 -maxdepth 1 -type d \ ! 
-name "${GITHUB_RUN_ID}" -exec rm -rf {} + 2>/dev/null || true mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results + +# Log exact image digests for traceability (resolve :latest to sha256) +echo "=== Image digests (for traceability) ===" +for img in \ + nvcr.io/nvstaging/blueprint/rag-server:${TAG:-latest} \ + nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-latest}; do + digest=$(docker inspect "$img" --format '{{index .RepoDigests 0}}' 2>/dev/null \ + || docker pull "$img" -q 2>/dev/null \ + && docker inspect "$img" --format '{{index .RepoDigests 0}}' 2>/dev/null \ + || echo "$img — not yet pulled") + echo " $img → $digest" +done ``` ## Your job, in order @@ -54,7 +66,7 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results Skills with no `eval/` dir are not yet migrated — skip them. -3. **Check the shared adapter.** All rag-* skills use a single adapter +3. **Check the shared adapter.** All rag-\* skills use a single adapter at `skill-eval/adapters/rag-blueprint/generate.py` with `--skill-name `. Verify it accepts `--skill-name`: @@ -68,7 +80,7 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results § 3c) with the fix and emit `BLOCKED: adapter missing --skill-name`. Unlike VSS, you do NOT create per-skill adapters — one shared - adapter serves all rag-* skills. If a skill genuinely needs custom + adapter serves all rag-\* skills. If a skill genuinely needs custom adapter logic (different PREAMBLE, non-standard platform), note it in the PR comment and raise a bot PR adding `skill-eval/adapters//generate.py`. @@ -78,8 +90,8 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results is the spec filename without `.json`. 
Resolve `SKILL_DIR` based on where the skill lives: - - Decomposed skills: `SKILL_DIR="$REPO_ROOT/skills/"` - - Monolithic skills: `SKILL_DIR="$REPO_ROOT/skill-source/.agents/skills/"` + - Decomposed skills: `SKILL_DIR="$REPO_ROOT/skills/"` + - Monolithic skills: `SKILL_DIR="$REPO_ROOT/skill-source/.agents/skills/"` ```bash cd "$REPO_ROOT/skill-eval" @@ -95,14 +107,13 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results generation fails, read the traceback, fix the adapter, rerun. 5. **Run Harbor trials.** Platform routing: - - **`cpu` platform** (`nvidia_hosted.json` specs) → `LocalEnvironment`. Docker runs directly on the `rag-skill-validator` runner — no Brev VM needed. The runner IS the deploy host. - **`H100_x2` platform** (`h100.json` specs) → `BrevEnvironment`. - Pre-provision an ephemeral Brev VM, run Harbor against it, - delete it after. See § GPU provisioning below. + Pre-provision ONE ephemeral Brev VM for all H100 specs in this run + (see § GPU provisioning). Run all H100 trials against that single VM. For **cpu skills**, clean any leftover Docker state first: @@ -119,6 +130,30 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results skills set — it deploys the RAG stack that all other skills test against. Then run remaining cpu skills in any order. 
+ **GPU pre-flight (automatic, no action required from skill authors):** + Before running ANY H100 spec for any skill, first sync the Brev VM's repo + to the PR base branch so compose files, env files, and skill docs all match + the branch under test (Harbor clones the default branch — main — not the PR): + + ```bash + brev exec "$BREV_INSTANCE" -- \ + "cd /home/nvidia/rag && git fetch origin ${PR_BASE} && git checkout ${PR_BASE} && git pull origin ${PR_BASE}" \ + 2>/dev/null || true + ``` + + Then check if the RAG stack is already running on the Brev VM: + + ```bash + brev exec "$BREV_INSTANCE" "curl -sf http://localhost:8081/v1/health" \ + 2>/dev/null && RAG_RUNNING=true || RAG_RUNNING=false + ``` + + If `RAG_RUNNING=false` and `rag-blueprint/eval/h100.json` exists in + the repo, run it first to deploy the self-hosted RAG stack. This + happens automatically regardless of which skills are in the PR diff — + skill authors do NOT need to declare this dependency in their specs. + Once deployed, all subsequent H100 specs reuse the running stack. + Use the canonical Harbor invocation from § Harbor invocation below. One step at a time, in order. Skip remaining steps if a step's reward < 1.0 (skip-on-prior-fail). @@ -153,15 +188,29 @@ mkdir -p /tmp/skill-eval/datasets /tmp/skill-eval/results ## GPU provisioning (H100_x2 specs only) -For specs with `platforms: ["H100_x2"]`: +**One VM per platform per run.** If multiple skills have `H100_x2` specs +(e.g. rag-eval/h100.json + rag-perf/h100.json), provision ONE Brev VM at +the start and run ALL H100 trials against it sequentially. Do NOT provision +a new VM per spec — that wastes 13+ min provisioning time and doubles cost. + +**Before processing specs**, collect all unique platforms needed: ```bash -BREV_TYPE="dmz.h100x2.pcie" +# Scan all changed skill specs for their platform requirements +GPU_PLATFORMS_NEEDED=$(...) # e.g. 
"H100_x2" +``` + +Then provision once per platform, store the instance name, reuse it for +all specs of that platform: + +```bash +# Provision ONCE for all H100_x2 specs in this run +BREV_TYPE="dmz.h100x2,scaleway_H100x2,gpu-h100-sxm.1gpu-16vcpu-200gb" BREV_INSTANCE="rag-eval-gpu-$(date +%s | tail -c 8)" -# Create with retry +# Create with retry + fallback types for attempt in $(seq 1 5); do - echo "$BREV_TYPE" | brev create "$BREV_INSTANCE" --detached 2>&1 | tail -5 + brev create "$BREV_INSTANCE" --type "$BREV_TYPE" --detached 2>&1 | tail -5 brev ls 2>/dev/null | awk -v n="$BREV_INSTANCE" '$1==n {found=1} END{exit !found}' \ && break sleep 15 @@ -169,18 +218,20 @@ done # Wait for RUNNING+READY (up to 30 min) DEADLINE=$(( $(date +%s) + 1800 )) +last_state="" while [ "$(date +%s)" -lt "$DEADLINE" ]; do STATE=$(brev ls 2>/dev/null | awk -v n="$BREV_INSTANCE" '$1==n {print $2"+"$4}') + [ -n "$STATE" ] && [ "$STATE" != "$last_state" ] && echo " $(date -u +%H:%M:%SZ) $BREV_INSTANCE: $STATE" && last_state="$STATE" [ "$STATE" = "RUNNING+READY" ] && break sleep 15 done -[ "$STATE" = "RUNNING+READY" ] || { echo "BLOCKED: H100 VM never reached RUNNING+READY"; exit 1; } +[ "$last_state" = "RUNNING+READY" ] || { echo "BLOCKED: H100 VM never reached RUNNING+READY"; exit 1; } -# Record for cleanup +# Record for cleanup — workflow step deletes after 5-min cooldown mkdir -p /tmp/brev echo "$BREV_INSTANCE" >> "/tmp/brev/started-by-${GITHUB_RUN_ID}.txt" -export BREV_INSTANCE +export BREV_INSTANCE # reuse this for ALL H100_x2 specs below ``` --- @@ -248,15 +299,23 @@ done ``` **Never background harbor and poll.** Use foreground blocking calls only. +`harbor run` MUST be called directly in a Bash tool call and allowed to block +until it exits. Do NOT use TaskCreate, background processes (`&`), `nohup`, +`Monitor`, or any other mechanism to run harbor asynchronously — not even +wrapped in a shell script. The Bash tool call itself must block until harbor +exits. 
The call will block for up to 90 minutes on GPU specs — that is +expected and correct. Do NOT check on it with sleep loops, Read, or Monitor. +Just wait. Violating this rule causes the agent to exit without DONE:/BLOCKED: +(exit 4). This has happened multiple times — do not repeat the mistake. --- ## Platform topology -| Platform | `spec.platforms` value | Environment | Instance | After run | -|---|---|---|---|---| -| CPU / cloud NIMs | `cpu` | LocalEnvironment | `rag-skill-validator` runner | docker down + volume cleanup | -| 2× H100 80GB | `H100_x2` | BrevEnvironment | `rag-eval-gpu-` (`dmz.h100x2.pcie`) | workflow step deletes after 5-min cooldown | +| Platform | `spec.platforms` value | Environment | Instance | After run | +| ---------------- | ---------------------- | ---------------- | --------------------------------------- | ------------------------------------------ | +| CPU / cloud NIMs | `cpu` | LocalEnvironment | `rag-skill-validator` runner | docker down + volume cleanup | +| 2× H100 80GB | `H100_x2` | BrevEnvironment | `rag-eval-gpu-` (`dmz.h100x2`, or a fallback type listed in `BREV_TYPE`) | workflow step deletes after 5-min cooldown | `rag-skill-validator` is the CI runner host — **never** provision Brev against it. @@ -270,10 +329,10 @@ done Head: `` · spec `` First started: `` · Last finished: `` · Total: `` -| Platform | Step | Query | Result | Reward | Duration | Turns | -|---|---|---|---|---|---|---| -| cpu | step-1 | Deploy via Docker Compose... | ✅ 1.0 (6/6) | 1.0 | 4m 29s | 18 | -| cpu | step-2 | Get RAG Blueprint running... | ✅ 1.0 (5/5) | 1.0 | 1m 23s | 9 | +| Platform | Step | Query | Result | Reward | Duration | Turns | +| -------- | ------ | ---------------------------- | ------------ | ------ | -------- | ----- | +| cpu | step-1 | Deploy via Docker Compose... | ✅ 1.0 (6/6) | 1.0 | 4m 29s | 18 | +| cpu | step-2 | Get RAG Blueprint running... 
| ✅ 1.0 (5/5) | 1.0 | 1m 23s | 9 | ### Failing checks @@ -320,6 +379,7 @@ END=$(jq -r '.trial_finished_at' "$RESULTS"/*/*/step-${STEP}__*/result.json 2>/ ## Manual full-sweep mode When `MANUAL_FULL_SWEEP=1` (workflow_dispatch): + - **Step 1 override:** skip diff. Enumerate `skills/*/eval/*.json`; filter by `MANUAL_SKILLS_FILTER` (`*` = all skills). - **Step 3 override:** no bot-PR flow. Record missing adapter as diff --git a/.github/workflows/ci-pipeline.yml b/.github/workflows/ci-pipeline.yml index 20095e4c1..d617961e9 100644 --- a/.github/workflows/ci-pipeline.yml +++ b/.github/workflows/ci-pipeline.yml @@ -38,10 +38,10 @@ jobs: if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || github.event.pull_request.head.repo.full_name == github.repository steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@v5 with: version: 'latest' @@ -67,7 +67,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repository code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - uses: actions/setup-python@v3 - uses: pre-commit/action@v3.0.1 @@ -79,7 +79,7 @@ jobs: image: python:3.12-slim steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install system dependencies run: | @@ -112,7 +112,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Node.js uses: actions/setup-node@v4 @@ -152,7 +152,7 @@ jobs: pnpm test:coverage - name: Upload coverage artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 if: always() with: name: frontend-coverage-${{ steps.sanitize.outputs.ref_name }}-${{ github.sha }} @@ -166,7 +166,7 @@ jobs: image: python:3.12-slim steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install required packages run: | 
@@ -192,7 +192,7 @@ jobs: ENABLE_NRL_INTEGRATION_TESTS: "false" steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install NGC CLI env: @@ -1264,7 +1264,7 @@ jobs: echo "ref_name=$SANITIZED_REF" >> $GITHUB_OUTPUT - name: Upload all integration test logs - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 if: always() with: name: integration-tests-logs-${{ steps.sanitize.outputs.ref_name }}-${{ github.sha }} diff --git a/.github/workflows/publish-artifacts.yml b/.github/workflows/publish-artifacts.yml index 8baf59f9e..001852b88 100644 --- a/.github/workflows/publish-artifacts.yml +++ b/.github/workflows/publish-artifacts.yml @@ -73,7 +73,7 @@ jobs: image: python:3.10 steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set artifactory version run: | @@ -101,7 +101,7 @@ jobs: ls -la dist/ - name: Upload wheel artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: wheel-${{ env.ARTIFACTORY_VERSION }} path: dist/*.whl @@ -156,10 +156,10 @@ jobs: if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_RAG_SERVER != 'false') steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Determine TAG id: tag @@ -177,7 +177,7 @@ jobs: echo "Final TAG value: $TAG" - name: Login to NGC Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: nvcr.io username: '$oauthtoken' @@ -216,10 +216,10 @@ jobs: if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_INGESTOR_SERVER != 'false') steps: - name: Checkout code - uses: 
actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Determine TAG id: tag @@ -237,7 +237,7 @@ jobs: echo "Final TAG value: $TAG" - name: Login to NGC Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: nvcr.io username: '$oauthtoken' @@ -276,10 +276,10 @@ jobs: if: github.event_name != 'workflow_dispatch' || ((github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'containers-only') && github.event.inputs.PUBLISH_RAG_FRONTEND != 'false') steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@v4 - name: Determine TAG id: tag @@ -297,7 +297,7 @@ jobs: echo "Final TAG value: $TAG" - name: Login to NGC Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@v4 with: registry: nvcr.io username: '$oauthtoken' @@ -336,10 +336,10 @@ jobs: if: github.event_name != 'workflow_dispatch' || github.event.inputs.JOBS_TO_RUN == 'all' || github.event.inputs.JOBS_TO_RUN == 'helm-chart-only' steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@v5 with: version: 'v3.17.0' diff --git a/.github/workflows/run-branch-script.yml b/.github/workflows/run-branch-script.yml index 6100e30a2..c140f16a6 100644 --- a/.github/workflows/run-branch-script.yml +++ b/.github/workflows/run-branch-script.yml @@ -42,7 +42,7 @@ jobs: steps: - name: Checkout target ref - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: ref: ${{ inputs.ref }} @@ -64,7 +64,7 @@ jobs: - name: Upload artifacts if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: branch-script-${{ github.run_id }} path: | diff --git 
a/.github/workflows/skills-eval.yml b/.github/workflows/skills-eval.yml index 84c71b68e..4f28379d7 100644 --- a/.github/workflows/skills-eval.yml +++ b/.github/workflows/skills-eval.yml @@ -22,7 +22,7 @@ on: branches: - "pull-request/[0-9]+" schedule: - - cron: '0 2 * * *' # 2am UTC nightly — all rag-* cpu skills + - cron: "0 2 * * *" # 2am UTC nightly — all rag-* cpu skills workflow_dispatch: inputs: skills: @@ -58,7 +58,7 @@ defaults: jobs: eval: name: Eval changed skills against PR - runs-on: [self-hosted, rag-skill-validator] + runs-on: [self-hosted, rag-eval] # 4-hour cap: 8 cpu skills × ~15 min each = 2h max with 1.5x timeouts. # Nightly runs all skills; PR runs only changed ones so usually much faster. @@ -83,7 +83,7 @@ jobs: done - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 @@ -114,17 +114,6 @@ jobs: - '.github/workflows/skills-eval.yml' - 'ci/run_skill_eval.sh' - - name: Load coordinator env - if: github.event_name != 'push' || steps.changes.outputs.relevant == 'true' - run: | - # Secrets stored in /home/ubuntu/eval-coordinator/.env on the runner. 
- # Contains: NVIDIA_INFERENCE_KEY, NGC_API_KEY - set -a - source /home/ubuntu/eval-coordinator/.env - set +a - printf "COORDINATOR_ENV=loaded\n" >> "$GITHUB_ENV" - printf "CLAUDE_CODE_DISABLE_THINKING=1\n" >> "$GITHUB_ENV" - - name: Run skills eval agent id: agent if: github.event_name != 'push' || steps.changes.outputs.relevant == 'true' @@ -132,11 +121,16 @@ jobs: GH_TOKEN: ${{ github.token }} GH_CONFIG_DIR: ${{ runner.temp }}/gh-skill-eval-${{ github.run_id }} INPUT_SKILLS: ${{ inputs.skills }} + ANTHROPIC_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }} + ANTHROPIC_BASE_URL: https://inference-api.nvidia.com + ANTHROPIC_MODEL: aws/anthropic/bedrock-claude-sonnet-4-6 + NVIDIA_INFERENCE_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }} + NVIDIA_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }} + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + CLAUDE_CODE_DISABLE_THINKING: "1" + TAG: latest run: | mkdir -p "$GH_CONFIG_DIR" /tmp/brev /tmp/skill-eval - set -a - source /home/ubuntu/eval-coordinator/.env - set +a export PR_NUMBER="${{ steps.pr.outputs.number }}" export PR_BASE="${{ steps.pr.outputs.base }}" export PR_HEAD_SHA="${{ github.sha }}" @@ -150,9 +144,24 @@ jobs: fi python3 .github/skill-eval/skills_eval_agent.py + - name: Collect results for artifact + if: always() && (github.event_name != 'push' || steps.changes.outputs.relevant == 'true') + run: | + if [ ! 
-d /tmp/skill-eval/results ]; then + echo "no results dir — agent blocked before running trials" + exit 0 + fi + RESULTS=$(find /tmp/skill-eval/results -maxdepth 3 -name "result.json" 2>/dev/null | head -50 || true) + if [ -n "$RESULTS" ]; then + tar czf /tmp/skills-eval-results.tar.gz -C /tmp/skill-eval results + echo "archived $(echo "$RESULTS" | wc -l) result.json files" + else + echo "results dir exists but empty — nothing to archive" + fi + - name: Upload results artifact if: always() && (github.event_name != 'push' || steps.changes.outputs.relevant == 'true') - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: >- ${{ github.event_name == 'schedule' @@ -160,7 +169,7 @@ jobs: || github.event_name == 'workflow_dispatch' && format('skills-eval-manual-{0}', github.run_id) || format('skills-eval-pr-{0}-{1}', steps.pr.outputs.number, github.run_id) }} - path: eval-results/ + path: /tmp/skills-eval-results.tar.gz if-no-files-found: ignore retention-days: 7 diff --git a/.github/workflows/skills-nv-base.yml b/.github/workflows/skills-nv-base.yml index 39a9c5053..2752f56b9 100644 --- a/.github/workflows/skills-nv-base.yml +++ b/.github/workflows/skills-nv-base.yml @@ -2,17 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 # Tier 1 skills validation: schema, security, PII, naming, frontmatter. -# Runs on every PR mirror branch push that touches skills/. -# Uses dorny/paths-filter for cumulative diff (PR base ↔ HEAD) — GitHub's -# top-level paths: filter evaluates per-push diff and misses copy-pr-bot -# merge commits that don't themselves touch skills/. +# push: trigger removed — job is if: false (nv-base requires NVIDIA internal +# network, unavailable on external runners). Keeping only workflow_dispatch +# so the "Run workflow" button appears in the Actions UI for future use. 
name: Skills NV-BASE on: - push: - branches: - - "pull-request/[0-9]+" workflow_dispatch: inputs: skills: @@ -48,7 +44,7 @@ defaults: jobs: skills-check: name: skills-check - runs-on: [self-hosted, rag-skill-validator] + runs-on: [self-hosted, rag-eval] timeout-minutes: 20 # Tier 1 disabled — nv-base requires pre-installation on the runner @@ -58,7 +54,7 @@ jobs: steps: - name: Checkout mirror head - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 diff --git a/README.md b/README.md index 8d8b5d11c..09d1e12ce 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ The following is a step-by-step explanation of the workflow from the end-user pe ## AI Agent Skill -An agent skill is included that enables AI coding assistants (Claude Code, Cursor, etc.) to deploy, configure, troubleshoot, and manage the RAG Blueprint autonomously. +Agent skills in [`skill-source/`](skill-source/) let coding assistants (Claude Code, Cursor, Codex, etc.) operate this blueprint from natural language. ### Install @@ -172,17 +172,19 @@ An agent skill is included that enables AI coding assistants (Claude Code, Curso npx skills add . ``` -This installs the `rag-blueprint` skill from `skill-source/`. After installation, the agent handles requests like: +Installs all skills below from `skill-source/.agents/skills/`. 
-- *"Deploy RAG on Docker with NVIDIA-hosted models"* -- *"Enable VLM image captioning and restart the ingestor"* -- *"Ingestion failed for 3 files, can you check why?"* -- *"Switch from Docker to library mode"* -- *"Shut down all RAG services"* +| Skill | Use for | Example prompts | +|-------|---------|-----------------| +| **`rag-blueprint`** | Deploy, configure, troubleshoot, shutdown; REST API usage (`/v1/generate`, ingestor upload) | *"Deploy RAG with self-hosted NIMs"*, *"Enable guardrails"*, *"Wide-net search then high-precision on my collection"* | +| **`rag-eval`** | RAGAS quality benchmarks with `corpus/` + `train.json` and `scripts/eval/evaluate_rag.py` | *"Run RAGAS eval on my dataset"*, *"Compare reranker on vs off"* | +| **`rag-perf`** | Latency/throughput benchmarks via `scripts/rag-perf` (profiling + aiperf) | *"Profile retrieval bottlenecks"*, *"Run a concurrency sweep"* | -> **Note:** If the agent doesn't pick up the skill automatically (e.g., for short or ambiguous queries), invoke it explicitly with `/rag-blueprint `. +Pick the skill that matches the task: operations → **rag-blueprint**; answer quality → **rag-eval**; performance → **rag-perf**. -For skill architecture details, see [`skill-source/README.md`](skill-source/README.md). +> **Note:** If routing is unclear, invoke explicitly: `/rag-blueprint`, `/rag-eval`, or `/rag-perf` plus your request. + +More detail: [`skill-source/README.md`](skill-source/README.md). OpenClaw plugin: [`.openclaw/README.md`](.openclaw/README.md). ## Get Started With NVIDIA RAG Blueprint diff --git a/docs/agentic-rag.md b/docs/agentic-rag.md index 225c3e1bf..c118c7d57 100644 --- a/docs/agentic-rag.md +++ b/docs/agentic-rag.md @@ -33,6 +33,7 @@ The pipeline defaults to off because Agentic RAG trades latency and extra LLM ca - The agentic path does not use NeMo Guardrails, Self-Reflection, Query Decomposition, or VLM Inference. 
Query rewriting, multi-turn history, multi-collection retrieval, citations, filter generation, and reranking are supported. - Verification runs once; there's no nested verification loop. - Tasks in a plan run at one parallel level; there's no DAG or depends-on construct. +- Per-response retrieval metrics are not emitted. The agentic pipeline issues multiple retrieval calls across initial retrieval, per-task execution, and verification re-retrieval, so the single `metrics` block returned by the standard chain is not populated for agentic requests. ## Architecture Overview diff --git a/docs/release-notes.md b/docs/release-notes.md index 8683edd74..fae923eef 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -10,7 +10,7 @@ This documentation contains the release notes for [NVIDIA RAG Blueprint](readme. ## Release 2.6.0 (TBD) -This release adds [Agentic RAG](./agentic-rag.md) support with plan-and-execute pipelines, streaming responses, and UI integration; changes the default vector database to Elasticsearch and the default object store to SeaweedFS; and introduces new [agent skills](../skill-source/README.md) for deployment, evaluation, and performance tooling. +This release adds [Agentic RAG](./agentic-rag.md) support with plan-and-execute pipelines, streaming responses, and UI integration; changes the default vector database to Elasticsearch and the default object store to SeaweedFS; adds [Red Hat OpenShift](./deploy-helm-openshift.md) support for Helm-based deployment; and introduces new [agent skills](../skill-source/README.md) for deployment, evaluation, and performance tooling. ### Highlights @@ -30,6 +30,7 @@ This release includes the following key updates: - Updated NV-Ingest to version 26.3.0. - Updated OCR NIM naming from `nemoretriever-ocr-v1` to `nemotron-ocr-v1`. - Added OpenClaw plugin for agent-driven deploy/configure/eval workflows. +- Added [Red Hat OpenShift and OKD support](./deploy-helm-openshift.md) for Helm deployments. 
### Fixed Known Issues diff --git a/notebooks/rag_library_lite_usage.ipynb b/notebooks/rag_library_lite_usage.ipynb index 90900e788..b3d36928f 100644 --- a/notebooks/rag_library_lite_usage.ipynb +++ b/notebooks/rag_library_lite_usage.ipynb @@ -243,8 +243,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Set logging level\n", - "First let's set the required logging level. Set to INFO for displaying basic important logs. Set to DEBUG for full verbosity." + "## Configure notebook logging\n", + "Set the notebook log level and hide known benign Milvus Lite compatibility noise. Unrelated errors remain visible." ] }, { @@ -255,6 +255,8 @@ "source": [ "import logging\n", "import os\n", + "import traceback\n", + "import warnings\n", "\n", "# Set the log level via environment variable before importing nvidia_rag\n", "# This ensures the package respects our log level setting\n", @@ -264,12 +266,46 @@ "# Configure logging\n", "logging.basicConfig(level=LOGLEVEL, force=True)\n", "\n", - "# Set log levels for specific loggers after package import\n", - "for name in logging.root.manager.loggerDict:\n", - " if name == \"nvidia_rag\" or name.startswith(\"nvidia_rag.\"):\n", + "# Keep known benign Milvus Lite/PyMilvus compatibility noise out of notebook output.\n", + "# These messages can appear during collection creation even when the operation succeeds.\n", + "warnings.filterwarnings(\n", + " \"ignore\",\n", + " message=r\"`connections\\.(has_connection|connect)` is an ORM-style PyMilvus API.*\",\n", + ")\n", + "\n", + "\n", + "class _GrpcAllocTimestampFilter(logging.Filter):\n", + " def filter(self, record: logging.LogRecord) -> bool:\n", + " if record.name != \"grpc._server\":\n", + " return True\n", + " benign_message = \"Exception calling application: Method not implemented!\"\n", + " if benign_message not in record.getMessage():\n", + " return True\n", + " if not record.exc_info:\n", + " return True\n", + " exc_text = 
\"\".join(traceback.format_exception(*record.exc_info))\n", + " return \"AllocTimestamp\" not in exc_text\n", + "\n", + "\n", + "grpc_logger = logging.getLogger(\"grpc._server\")\n", + "if not any(\n", + " item.__class__.__name__ == \"_GrpcAllocTimestampFilter\"\n", + " for item in grpc_logger.filters\n", + "):\n", + " grpc_logger.addFilter(_GrpcAllocTimestampFilter())\n", + "\n", + "# Set log levels for specific loggers used by the notebook.\n", + "for name in (\"nvidia_rag\", \"nv_ingest_client\"):\n", + " logging.getLogger(name).setLevel(LOGLEVEL)\n", + "\n", + "for name in list(logging.root.manager.loggerDict):\n", + " if name.startswith(\"nvidia_rag.\") or name.startswith(\"nv_ingest_client.\"):\n", " logging.getLogger(name).setLevel(LOGLEVEL)\n", - " if name == \"nv_ingest_client\" or name.startswith(\"nv_ingest_client.\"):\n", - " logging.getLogger(name).setLevel(LOGLEVEL)" + "\n", + "print(\n", + " f\"Notebook logging set to {logging.getLevelName(LOGLEVEL)}. \"\n", + " \"Known benign Milvus Lite compatibility messages will be hidden.\"\n", + ")" ] }, { diff --git a/src/nvidia_rag/ingestor_server/main.py b/src/nvidia_rag/ingestor_server/main.py index ff50af757..bffbec8cb 100644 --- a/src/nvidia_rag/ingestor_server/main.py +++ b/src/nvidia_rag/ingestor_server/main.py @@ -275,7 +275,10 @@ def __prepare_vdb_op_and_collection_name( config=self.config, vdb_auth_token=vdb_auth_token, ) - return vdb_op, collection_name + # Return the backend-canonicalized name (e.g. Elasticsearch + # lowercases index names) so downstream summary keys in Redis + # and object storage align with what GET /collections reports. 
+ return vdb_op, vdb_op.collection_name if not bypass_validation and (collection_name or custom_metadata): raise ValueError( diff --git a/src/nvidia_rag/rag_server/agentic_rag/agentic_rag.py b/src/nvidia_rag/rag_server/agentic_rag/agentic_rag.py index de56ac6c2..652d95582 100644 --- a/src/nvidia_rag/rag_server/agentic_rag/agentic_rag.py +++ b/src/nvidia_rag/rag_server/agentic_rag/agentic_rag.py @@ -54,6 +54,8 @@ from opentelemetry import trace as otel_trace from pydantic import BaseModel, Field +from nvidia_rag.rag_server.agentic_rag.response_parser import parse_json_response + logger = logging.getLogger(__name__) _P = "[AGENTIC_RAG]" @@ -453,44 +455,6 @@ def _filter_think_tokens(content: str) -> str: logger.warning("%s Truncated block (no closing tag), discarding", _P) return "" - @staticmethod - def _sanitize_json_string(raw: str) -> str: - """Escape unescaped control characters inside JSON string values.""" - out: list[str] = [] - in_string = False - i = 0 - length = len(raw) - while i < length: - ch = raw[i] - if ch == "\\" and in_string: - out.append(ch) - if i + 1 < length: - i += 1 - out.append(raw[i]) - i += 1 - continue - if ch == '"': - in_string = not in_string - out.append(ch) - i += 1 - continue - if in_string: - if ch == "\n": - out.append("\\n") - i += 1 - continue - if ch == "\r": - out.append("\\r") - i += 1 - continue - if ch == "\t": - out.append("\\t") - i += 1 - continue - out.append(ch) - i += 1 - return "".join(out) - @staticmethod async def _accumulate_astream( chain: Any, @@ -745,37 +709,6 @@ async def _call_llm( return response_content - # ========================================================================= - # JSON PARSING - # ========================================================================= - - def _parse_json_response(self, response: str) -> dict[str, Any]: - """Parse a JSON object from an LLM response, with fallback sanitization.""" - try: - return json.loads(response) - except json.JSONDecodeError: - pass - - start = 
response.find("{") - end = response.rfind("}") + 1 - if start == -1 or end <= start: - logger.warning("%s No JSON object found in response: %.200s", _P, response) - return {"error": "Failed to parse JSON", "raw_response": response} - - json_str = response[start:end] - try: - return json.loads(json_str) - except json.JSONDecodeError: - pass - - try: - return json.loads(self._sanitize_json_string(json_str)) - except json.JSONDecodeError: - pass - - logger.warning("%s JSON parse failed: %.200s", _P, response) - return {"error": "Failed to parse JSON", "raw_response": response} - # ========================================================================= # CONTENT HELPERS # ========================================================================= @@ -913,7 +846,7 @@ def _parse_task_answer(self, raw_answer: str) -> dict: if not raw_answer: return {"completeness": "none", "answer": "[NO DATA]", "missing": ""} - parsed = self._parse_json_response(raw_answer) + parsed = parse_json_response(raw_answer) if parsed and "completeness" in parsed: return { "completeness": parsed.get("completeness", "complete"), @@ -999,7 +932,7 @@ def _finish_task(res: dict) -> dict: json_mode=True, # config intentionally omitted — see method docstring. ) - seed_result = self._parse_json_response(seed_response) + seed_result = parse_json_response(seed_response) if seed_result.get("stop", False): logger.debug( @@ -1094,7 +1027,7 @@ def _finish_task(res: dict) -> dict: {"question": task_question, "documents": docs_str}, step_name=f"Task {tid} answer (attempt {attempt + 1})", # config intentionally omitted — see method docstring. 
- json_mode=True, + json_mode=False, ) parsed = self._parse_task_answer(raw_answer) @@ -1326,7 +1259,7 @@ async def plan_node( json_mode=True, config=config, ) - plan = self._parse_json_response(response) + plan = parse_json_response(response) if "error" in plan: logger.warning( @@ -1875,7 +1808,7 @@ async def verify_node( json_mode=True, config=config, ) - result = self._parse_json_response(response) + result = parse_json_response(response) passed = result.get("status") == "pass" issues = result.get("issues", []) diff --git a/src/nvidia_rag/rag_server/agentic_rag/response_parser.py b/src/nvidia_rag/rag_server/agentic_rag/response_parser.py new file mode 100644 index 000000000..bff23bc05 --- /dev/null +++ b/src/nvidia_rag/rag_server/agentic_rag/response_parser.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LLM response parsing with recovery for common malformed-output patterns. + +Handles common LLM output pathologies: + * Preamble / postscript text around the JSON object. + * "False start + restart" patterns from reasoning models — the model + emits a draft, then re-emits the full object. We pick the last + balanced top-level ``{...}`` candidate. + * Missing-colon typos like ``"tasks[`` instead of ``"tasks": [``. 
+ * Unescaped control characters (newline / tab / carriage return) + inside JSON string values. + +Public surface +-------------- +* ``parse_json_response`` — the only function callers need; returns a dict + on success or ``{"error": ..., "raw_response": ...}`` on failure. +""" + +import json +import logging +import re +from typing import Any + +logger = logging.getLogger(__name__) + +_P = "[AGENTIC_RAG]" + + +def parse_json_response(response: str) -> dict[str, Any]: + """Parse a JSON object from an LLM response, with fallback sanitization. + + Handles "false start + restart" patterns from reasoning models by + extracting all top-level balanced ``{...}`` candidates and trying + them from last to first. The last complete candidate is typically + the model's final revised output. + """ + try: + return json.loads(response) + except json.JSONDecodeError: + pass + + # Try balanced top-level candidates (handles "restart" patterns) + for cand in reversed(_extract_top_level_objects(response)): + try: + return json.loads(cand) + except json.JSONDecodeError: + pass + try: + return json.loads(_sanitize_json_string(cand)) + except json.JSONDecodeError: + pass + + # Fallback: broadest span (handles unterminated-string cases that + # confuse brace-counting, e.g. '"tasks[' missing-colon typos). 
+ start = response.find("{") + end = response.rfind("}") + 1 + if start == -1 or end <= start: + logger.warning("%s No JSON object found in response: %.200s", _P, response) + return {"error": "Failed to parse JSON", "raw_response": response} + + broad = response[start:end] + try: + return json.loads(broad) + except json.JSONDecodeError: + pass + try: + return json.loads(_sanitize_json_string(broad)) + except json.JSONDecodeError: + pass + + logger.warning("%s JSON parse failed: %s", _P, response) + return {"error": "Failed to parse JSON", "raw_response": response} + + +def _extract_top_level_objects(text: str) -> list[str]: + """Return all balanced top-level ``{...}`` substrings (string-aware).""" + candidates: list[str] = [] + depth = 0 + start_idx = -1 + in_string = False + escape = False + for i, ch in enumerate(text): + if in_string: + if escape: + escape = False + elif ch == "\\": + escape = True + elif ch == '"': + in_string = False + continue + if ch == '"': + in_string = True + elif ch == "{": + if depth == 0: + start_idx = i + depth += 1 + elif ch == "}": + if depth > 0: + depth -= 1 + if depth == 0 and start_idx != -1: + candidates.append(text[start_idx : i + 1]) + start_idx = -1 + return candidates + + +def _sanitize_json_string(raw: str) -> str: + """Escape unescaped control chars and repair common LLM JSON typos.""" + # Repair missing colon between key and array/object value: + # e.g. 
'"tasks[' → '"tasks": [' and '"task{' → '"task": {' + raw = re.sub(r'"(\w+)"\s*(\[|\{)', r'"\1": \2', raw) + raw = re.sub(r'"(\w+)(\[|\{)', r'"\1": \2', raw) + + out: list[str] = [] + in_string = False + i = 0 + length = len(raw) + while i < length: + ch = raw[i] + if ch == "\\" and in_string: + out.append(ch) + if i + 1 < length: + i += 1 + out.append(raw[i]) + i += 1 + continue + if ch == '"': + in_string = not in_string + out.append(ch) + i += 1 + continue + if in_string: + if ch == "\n": + out.append("\\n") + i += 1 + continue + if ch == "\r": + out.append("\\r") + i += 1 + continue + if ch == "\t": + out.append("\\t") + i += 1 + continue + out.append(ch) + i += 1 + return "".join(out) diff --git a/tests/unit/test_ingestor_server/test_ingestor_main_core_components.py b/tests/unit/test_ingestor_server/test_ingestor_main_core_components.py index 48ca314e9..b85773ffc 100644 --- a/tests/unit/test_ingestor_server/test_ingestor_main_core_components.py +++ b/tests/unit/test_ingestor_server/test_ingestor_main_core_components.py @@ -303,6 +303,7 @@ def test_prepare_vdb_op_without_vdb_op_missing_collection_name(self): def test_prepare_vdb_op_without_vdb_op_with_collection_name(self, mock_get_vdb): """Test __prepare_vdb_op without vdb_op but with collection_name.""" mock_vdb_op = Mock(spec=VDBRag) + mock_vdb_op.collection_name = "test_collection" mock_get_vdb.return_value = mock_vdb_op ingestor = NvidiaRAGIngestor() @@ -314,10 +315,30 @@ def test_prepare_vdb_op_without_vdb_op_with_collection_name(self, mock_get_vdb): assert result == (mock_vdb_op, "test_collection") mock_get_vdb.assert_called_once() + @patch("nvidia_rag.ingestor_server.main._get_vdb_op") + def test_prepare_vdb_op_returns_backend_canonicalized_name(self, mock_get_vdb): + """Backends that normalize (e.g. Elasticsearch lowercases index names) must + have their canonical name flow back to the caller so downstream summary + keys in Redis and object storage align with what GET /collections reports. 
+ Regression guard for bug 6206269. + """ + mock_vdb_op = Mock(spec=VDBRag) + mock_vdb_op.collection_name = "mycollection" + mock_get_vdb.return_value = mock_vdb_op + + ingestor = NvidiaRAGIngestor() + + result = ingestor._NvidiaRAGIngestor__prepare_vdb_op_and_collection_name( + collection_name="MyCollection" + ) + + assert result == (mock_vdb_op, "mycollection") + @patch("nvidia_rag.ingestor_server.main._get_vdb_op") def test_prepare_vdb_op_bypass_validation(self, mock_get_vdb): """Test __prepare_vdb_op with bypass_validation=True.""" mock_vdb_op = Mock(spec=VDBRag) + mock_vdb_op.collection_name = None mock_get_vdb.return_value = mock_vdb_op ingestor = NvidiaRAGIngestor() diff --git a/tests/unit/test_ingestor_server/test_ingestor_main_document_operations.py b/tests/unit/test_ingestor_server/test_ingestor_main_document_operations.py index 5dd394f01..222e485eb 100644 --- a/tests/unit/test_ingestor_server/test_ingestor_main_document_operations.py +++ b/tests/unit/test_ingestor_server/test_ingestor_main_document_operations.py @@ -550,6 +550,7 @@ def test_private_methods_coverage(self): # Test __prepare_vdb_op_and_collection_name with patch("nvidia_rag.ingestor_server.main._get_vdb_op") as mock_get_vdb: mock_vdb_instance = Mock(spec=VDBRag) + mock_vdb_instance.collection_name = "test_collection" mock_get_vdb.return_value = mock_vdb_instance vdb_op, collection_name = ( diff --git a/tests/unit/test_rag_server/test_agentic_rag.py b/tests/unit/test_rag_server/test_agentic_rag.py index 301307b29..daf7f11a7 100644 --- a/tests/unit/test_rag_server/test_agentic_rag.py +++ b/tests/unit/test_rag_server/test_agentic_rag.py @@ -117,12 +117,6 @@ def test_filter_think_tokens_strips_closed_block(self) -> None: def test_filter_think_tokens_truncated_block(self) -> None: assert AgenticRag._filter_think_tokens("no close") == "" - def test_sanitize_json_string_escapes_newlines_in_strings(self) -> None: - dirty = '{"x": "line1\nline2"}' - clean = AgenticRag._sanitize_json_string(dirty) - 
assert "\n" not in clean.split('"x":')[1] - assert json.loads(clean)["x"] == "line1\nline2" - def test_rebuild_result_text_vs_chart(self) -> None: text_chunk = { "doc_name": "a.pdf", @@ -163,17 +157,6 @@ def test_clean_answer_strips_markdown_headers(self) -> None: class TestAgenticRagInstanceHelpers: - def test_parse_json_response_direct_and_embedded(self) -> None: - agent = _minimal_agent() - assert agent._parse_json_response('{"k": 1}') == {"k": 1} - wrapped = 'prefix {"k": 2} suffix' - assert agent._parse_json_response(wrapped) == {"k": 2} - - def test_parse_json_response_invalid_returns_error_dict(self) -> None: - agent = _minimal_agent() - out = agent._parse_json_response("not json at all") - assert out.get("error") == "Failed to parse JSON" - def test_extract_chunks_from_model_dump_shape(self) -> None: agent = _minimal_agent() dumped = {