TechNavii
diff --git a/‎.artifact-hygiene-allowlist.txt‎
Lines changed: 13 additions & 0 deletions b/‎.artifact-hygiene-allowlist.txt‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎.bandit.yml‎
Lines changed: 15 additions & 0 deletions b/‎.bandit.yml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎.devcontainer/Dockerfile‎
Lines changed: 65 additions & 0 deletions b/‎.devcontainer/Dockerfile‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎.devcontainer/devcontainer.json‎
Lines changed: 61 additions & 0 deletions b/‎.devcontainer/devcontainer.json‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎.env.example‎
Lines changed: 14 additions & 0 deletions b/‎.env.example‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎.github/dependabot.yml‎
Lines changed: 109 additions & 0 deletions b/‎.github/dependabot.yml‎
Lines changed: 109 additions & 0 deletions
diff --git a/‎.github/workflows/codeql.yml‎
Lines changed: 95 additions & 0 deletions b/‎.github/workflows/codeql.yml‎
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Artifact Hygiene Allowlist
+# ==========================
+# This file lists files that are intentionally tracked despite being in
+# artifact directories or matching artifact patterns.
+#
+# Format: file_path # justification
+#
+# Guidelines:
+# - Each entry MUST have a justification explaining why it's tracked
+# - Prefer excluding files from tracking over adding allowlist entries
+# - Review this file periodically to remove stale entries
+#
+# Currently no files are allowlisted.
@@ -0,0 +1,15 @@
+# Bandit SAST configuration for benchmark harness
+# Scoped to server/ and harness/ (excludes tasks/* workspaces and test directories)
+
+exclude_dirs: ["/tests", "harness/tests"]
+
+# Checks disabled because they are low-severity or noisy for this codebase
+skips:
+  - B101  # assert_used: asserts acceptable for invariants and type narrowing
+  - B404  # import subprocess: subprocess necessary for harness execution
+  - B603  # subprocess_without_shell_equals_true: we use shell=False (secure)
@@ -0,0 +1,65 @@
+# Devcontainer for Benchmark Harness
+# Provides a reproducible development environment matching CI configuration
+#
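+# Python versions: 3.11 (primary, provided by the base image).
+# NOTE(review): 3.12 is not installed by this Dockerfile; install it separately
+# (e.g. via pyenv) if it is needed - confirm against the CI matrix.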
+# Uses locked dependencies from requirements*.txt files
+
+FROM mcr.microsoft.com/devcontainers/python:1-3.11-bookworm
+
+# System packages needed by the project's tooling:
+#   curl, git, make - general development tools
+#   jq              - JSON processing (used by scripts)
+#   shellcheck      - script linting
+# The apt caches are cleaned in the same layer so they never reach the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        curl \
+        git \
+        jq \
+        make \
+        shellcheck \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install actionlint for workflow validation.
+# The archive is downloaded to a temporary file instead of being piped into tar:
+# curl -f exits non-zero on HTTP errors, so a failed download fails the build
+# without depending on the shell's pipefail option.
+# Only the `actionlint` binary is extracted; the archive's other files do not
+# belong in /usr/local/bin.
+# The release asset is selected from dpkg's architecture name, so the image
+# builds natively on both amd64 and arm64 hosts.
+ARG ACTIONLINT_VERSION=1.7.4
+RUN arch="$(dpkg --print-architecture)" \
+    && curl -fsSL -o /tmp/actionlint.tar.gz \
+        "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_${arch}.tar.gz" \
+    && tar -xzf /tmp/actionlint.tar.gz -C /usr/local/bin actionlint \
+    && rm -f /tmp/actionlint.tar.gz
+
+# All following relative paths resolve against /workspace
+WORKDIR /workspace
+
+# Only the dependency manifests are copied at this point, so the dependency
+# install layer stays cached until one of these files changes
+COPY requirements-dev.in requirements-dev.txt ./
+COPY server/requirements.in server/requirements.txt ./server/
+COPY harness/requirements.in harness/requirements.txt ./harness/
+
+# "Activate" the virtual environment for later instructions and for the running
+# container by putting its bin directory first on PATH
+ENV VIRTUAL_ENV=/workspace/.venv
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# Build the virtual environment and install the locked dependency sets.
+# Every requirements file is installed with --require-hashes for security
+# (matches CI behavior).
+# hadolint ignore=DL3013,DL3042
+RUN python -m venv "${VIRTUAL_ENV}" \
+    && pip install --no-cache-dir --upgrade pip \
+    && pip install --no-cache-dir --require-hashes -r requirements-dev.txt \
+    && pip install --no-cache-dir --require-hashes -r server/requirements.txt \
+    && pip install --no-cache-dir --require-hashes -r harness/requirements.txt
+
+# Install the pre-commit tool (optional but recommended).
+# This installs only the CLI; the git hooks themselves are set up per clone
+# with `pre-commit install`.
+# hadolint ignore=DL3013
+RUN pip install --no-cache-dir pre-commit
+
+# Install Playwright for GUI tests (optional, can be slow)
+# Uncomment if GUI tests are needed in the container
+# RUN playwright install chromium && playwright install-deps chromium
+
+# Python runtime settings: skip .pyc files, unbuffered stdio, and make
+# /workspace importable
+ENV PYTHONPATH=/workspace \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1
+
+# Pre-create the runs/tasks directories and hand the whole workspace
+# (virtual environment included) to the non-root vscode user
+RUN mkdir -p \
+        /workspace/runs \
+        /workspace/tasks \
+    && chown -R vscode:vscode /workspace
+
+# Start an interactive shell when no command is given
+CMD ["bash"]
@@ -0,0 +1,61 @@
+{
+  "name": "Benchmark Harness",
+  "build": {
+    "dockerfile": "Dockerfile",
+    "context": ".."
+  },
+  "features": {
+    "ghcr.io/devcontainers/features/git:1": {},
+    "ghcr.io/devcontainers/features/github-cli:1": {}
+  },
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.python",
+        "ms-python.vscode-pylance",
+        "ms-python.mypy-type-checker",
+        "charliermarsh.ruff",
+        "tamasfe.even-better-toml",
+        "redhat.vscode-yaml"
+      ],
+      "settings": {
+        "python.defaultInterpreterPath": "/workspace/.venv/bin/python",
+        "python.terminal.activateEnvironment": true,
+        "[python]": {
+          "editor.formatOnSave": true,
+          "editor.codeActionsOnSave": {
+            "source.fixAll": "explicit",
+            "source.organizeImports": "explicit"
+          },
+          "editor.defaultFormatter": "charliermarsh.ruff"
+        },
+        "ruff.path": ["/workspace/.venv/bin/ruff"],
+        "mypy-type-checker.path": ["/workspace/.venv/bin/mypy"],
+        "files.exclude": {
+          "**/__pycache__": true,
+          "**/.pytest_cache": true,
+          "**/htmlcov": true,
+          "**/.coverage": true
+        },
+        "terminal.integrated.defaultProfile.linux": "bash"
+      }
+    }
+  },
+  "postCreateCommand": "./scripts/bootstrap.sh",
+  "remoteUser": "vscode",
+  "containerEnv": {
+    "PYTHONDONTWRITEBYTECODE": "1",
+    "PYTHONUNBUFFERED": "1"
+  },
+  "mounts": [
+    "source=${localWorkspaceFolder}/runs,target=/workspace/runs,type=bind,consistency=cached",
+    "source=${localWorkspaceFolder}/tasks,target=/workspace/tasks,type=bind,consistency=cached"
+  ],
+  "forwardPorts": [8000],
+  "portsAttributes": {
+    "8000": {
+      "label": "FastAPI Server",
+      "onAutoForward": "notify"
+    }
+  },
+  "workspaceFolder": "/workspace"
+}
@@ -1,6 +1,20 @@
 # Copy this file to .env and fill in the values to run the harness against OpenRouter.
 OPENROUTER_API_KEY=replace-with-your-key
+
+# Optional: OpenRouter base URL (defaults to https://openrouter.ai/api/v1)
+# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
+
 # Optional: tweak model defaults
 DEFAULT_MODEL=openrouter/google/gemini-pro
 # Optional: default temperature for harness runs
 DEFAULT_TEMPERATURE=0.0
+
+# Optional: default max tokens for harness runs
+DEFAULT_MAX_TOKENS=200000
+
+# Optional: expert QA judge model (OpenRouter model id). Leave blank to disable judge.
+EXPERT_QA_JUDGE_MODEL=openai/gpt-4o-mini
+
+# Optional: LM Studio base URL (OpenAI-compatible endpoint)
+# Defaults to http://127.0.0.1:1234/v1
+# LMSTUDIO_BASE_URL=http://127.0.0.1:1234/v1
@@ -0,0 +1,109 @@
+# Dependabot configuration for automated dependency updates
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+  # GitHub Actions version updates
+  # Actions are pinned to commit SHAs for supply-chain security.
+  # Dependabot automatically updates SHA-pinned actions to new SHAs
+  # when new versions are released, maintaining both security and freshness.
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+      day: "monday"
+      time: "06:00"
+      timezone: "UTC"
+    commit-message:
+      prefix: "ci(deps)"
+    labels:
+      - "dependencies"
+      - "github-actions"
+    reviewers:
+      - "tooru" # Repository maintainer
+    # Group minor/patch updates to reduce PR noise
+    groups:
+      github-actions:
+        patterns:
+          - "*"
+        update-types:
+          - "minor"
+          - "patch"
+
+  # Python server dependencies (pip-tools lockfiles)
+  - package-ecosystem: "pip"
+    directory: "/server"
+    schedule:
+      interval: "weekly"
+      day: "tuesday"
+      time: "06:00"
+      timezone: "UTC"
+    commit-message:
+      prefix: "deps(server)"
+    labels:
+      - "dependencies"
+      - "python"
+      - "server"
+    reviewers:
+      - "tooru"
+    # Dependabot will update requirements.txt from requirements.in
+    # Note: After merge, run ./scripts/compile-deps.sh to regenerate lock
+    groups:
+      python-server-minor:
+        patterns:
+          - "*"
+        update-types:
+          - "minor"
+          - "patch"
+
+  # Python harness dependencies (pip-tools lockfiles)
+  - package-ecosystem: "pip"
+    directory: "/harness"
+    schedule:
+      interval: "weekly"
+      day: "tuesday"
+      time: "06:00"
+      timezone: "UTC"
+    commit-message:
+      prefix: "deps(harness)"
+    labels:
+      - "dependencies"
+      - "python"
+      - "harness"
+    reviewers:
+      - "tooru"
+    groups:
+      python-harness-minor:
+        patterns:
+          - "*"
+        update-types:
+          - "minor"
+          - "patch"
+
+  # Python dev dependencies (pip-tools lockfiles)
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+      day: "tuesday"
+      time: "06:00"
+      timezone: "UTC"
+    commit-message:
+      prefix: "deps(dev)"
+    labels:
+      - "dependencies"
+      - "python"
+      - "dev"
+    reviewers:
+      - "tooru"
+    groups:
+      python-dev-minor:
+        patterns:
+          - "*"
+        update-types:
+          - "minor"
+          - "patch"
+
+# Note: tasks/* workspace dependencies are NOT managed by Dependabot
+# They are task-specific and intentionally not part of the repository's
+# core dependency management. Each task manages its own dependencies.
@@ -0,0 +1,95 @@
+name: CodeQL Analysis
+
+on:
+  push:
+    branches: [main, master]
+    paths:
+      # Trigger on changes to source code only
+      - 'server/**'
+      - 'harness/**'
+      - 'gui/**'
+      - 'conftest.py'
+  pull_request:
+    branches: [main, master]
+    paths:
+      - 'server/**'
+      - 'harness/**'
+      - 'gui/**'
+      - 'conftest.py'
+  schedule:
+    # Run weekly on Sundays at 3:00 AM UTC (after nightly scans)
+    - cron: '0 3 * * 0'
+  workflow_dispatch:
+    # Allow manual trigger for debugging
+  # NOTE: Do NOT use pull_request_target here - it runs with elevated permissions
+  # from the base branch, which can be exploited by malicious fork PRs
+
+# Only one CodeQL run per branch at a time
+concurrency:
+  group: codeql-${{ github.ref }}
+  cancel-in-progress: true
+
+# Permissions required for CodeQL and security tab
+# Fork PRs: security-events: write is required for SARIF upload but GitHub
+# restricts this for fork PRs - CodeQL handles this gracefully
+permissions:
+  security-events: write  # Required: upload SARIF results
+  contents: read          # Required: checkout code
+  actions: read           # Required: detect workflow changes
+
+jobs:
+  analyze:
+    name: Analyze (${{ matrix.language }})
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        # CodeQL supports: cpp, csharp, go, java, javascript, python, ruby, swift
+        language: ['python', 'javascript']
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+
+      # Initialize CodeQL tools for scanning
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@45c373516f557556c15d420e3f5e0aa3d64366bc  # v3
+        with:
+          languages: ${{ matrix.language }}
+          # Configure paths to scan (consistent with meta.lintScope)
+          # and paths to exclude (consistent with meta.excludePatterns)
+          config: |
+            paths:
+              - server
+              - harness
+              - gui
+              - conftest.py
+            paths-ignore:
+              - runs
+              - .venv
+              - .pytest_cache
+              - '**/node_modules'
+              - '**/__pycache__'
+              - 'tasks/**/workspace'
+              - '**/*.pyc'
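+          # Run the security-and-quality suite for comprehensive analysis; the leading
+          # "+" adds it to any queries set in the config instead of overriding them.
+          # Brownfield note: this is broader than the default suite - remove the
+          # line below to fall back to the default queries if findings are too noisy.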
+          queries: +security-and-quality
+
+      # For Python, we don't need to build - CodeQL can analyze source directly
+      # For JavaScript, CodeQL also analyzes source directly (no build needed)
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@45c373516f557556c15d420e3f5e0aa3d64366bc  # v3
+
+      # Perform CodeQL Analysis
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@45c373516f557556c15d420e3f5e0aa3d64366bc  # v3
+        with:
+          category: "/language:${{ matrix.language }}"
+          # Upload SARIF results to GitHub Security tab
+          upload: true
+          # Brownfield-safe: warn-only initially
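+          # NOTE(review): the analyze action does not appear to accept a `fail-on`
+          # input. To block merges on high/critical alerts, use code scanning
+          # merge protection (repository rulesets) instead - confirm before enabling.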
+          # For now, we report all findings without failing the job