Skip to content

Commit 49eda95

Browse files
authored
Merge pull request #4 from TechNavii/feature/lmstudio-support
Feature/lmstudio support
2 parents 089ecd2 + 317986a commit 49eda95

154 files changed

Lines changed: 32263 additions & 1603 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.artifact-hygiene-allowlist.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Artifact Hygiene Allowlist
2+
# ==========================
3+
# This file lists files that are intentionally tracked despite being in
4+
# artifact directories or matching artifact patterns.
5+
#
6+
# Format: file_path # justification
7+
#
8+
# Guidelines:
9+
# - Each entry MUST have a justification explaining why it's tracked
10+
# - Prefer excluding files from tracking over adding allowlist entries
11+
# - Review this file periodically to remove stale entries
12+
#
13+
# Currently no files are allowlisted.

.bandit.yml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Bandit SAST configuration for benchmark harness
2+
# Scoped to server/ and harness/ (excludes tasks/* workspaces and test directories)
3+
4+
exclude_dirs:
5+
- '/tests'
6+
- 'harness/tests'
7+
8+
# Skip low-severity or noisy checks:
9+
# B101: assert_used - asserts acceptable for invariants and type narrowing
10+
# B404: import subprocess - subprocess necessary for harness execution
11+
# B603: subprocess_without_shell_equals_true - we use shell=False (secure)
12+
skips:
13+
- B101
14+
- B404
15+
- B603

.devcontainer/Dockerfile

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Devcontainer for Benchmark Harness
2+
# Provides a reproducible development environment matching CI configuration
3+
#
4+
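# Python versions: 3.11 (primary, from the base image); 3.12 via pyenv - NOTE(review): pyenv is not installed by this Dockerfile, confirm it is provided elsewhere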
5+
# Uses locked dependencies from requirements*.txt files
6+
7+
FROM mcr.microsoft.com/devcontainers/python:1-3.11-bookworm
8+
9+
# Install system dependencies required by the project
10+
RUN apt-get update && apt-get install -y --no-install-recommends \
11+
# ShellCheck for script linting
12+
shellcheck \
13+
# jq for JSON processing (used by scripts)
14+
jq \
15+
# Additional development tools
16+
curl \
17+
git \
18+
make \
19+
&& apt-get clean \
20+
&& rm -rf /var/lib/apt/lists/*
21+
22+
# Install actionlint for workflow validation
23+
# hadolint ignore=DL4006
24+
RUN curl -fsSL "https://github.com/rhysd/actionlint/releases/download/v1.7.4/actionlint_1.7.4_linux_$(dpkg --print-architecture).tar.gz" | tar xz -C /usr/local/bin actionlint
25+
26+
# Set working directory
27+
WORKDIR /workspace
28+
29+
# Copy dependency files first for better layer caching
30+
COPY requirements-dev.txt requirements-dev.in ./
31+
COPY server/requirements.txt server/requirements.in ./server/
32+
COPY harness/requirements.txt harness/requirements.in ./harness/
33+
34+
# Create and activate virtual environment
35+
ENV VIRTUAL_ENV=/workspace/.venv
36+
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
37+
38+
# Create virtualenv and install dependencies
39+
# Uses --require-hashes for security (matches CI behavior)
40+
# hadolint ignore=DL3013,DL3042
41+
RUN python -m venv ${VIRTUAL_ENV} && \
42+
pip install --no-cache-dir --upgrade pip && \
43+
pip install --no-cache-dir --require-hashes -r requirements-dev.txt && \
44+
pip install --no-cache-dir --require-hashes -r server/requirements.txt && \
45+
pip install --no-cache-dir --require-hashes -r harness/requirements.txt
46+
47+
# Install pre-commit hooks (optional but recommended)
48+
# hadolint ignore=DL3013
49+
RUN pip install --no-cache-dir pre-commit
50+
51+
# Install Playwright for GUI tests (optional, can be slow)
52+
# Uncomment if GUI tests are needed in the container
53+
# RUN playwright install chromium && playwright install-deps chromium
54+
55+
# Set Python environment variables
56+
ENV PYTHONDONTWRITEBYTECODE=1 \
57+
PYTHONUNBUFFERED=1 \
58+
PYTHONPATH=/workspace
59+
60+
# Create the runs/ and tasks/ directories and give the non-root vscode user ownership of /workspace
61+
RUN mkdir -p /workspace/runs /workspace/tasks && \
62+
chown -R vscode:vscode /workspace
63+
64+
# Default command
65+
CMD ["bash"]

.devcontainer/devcontainer.json

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
{
2+
"name": "Benchmark Harness",
3+
"build": {
4+
"dockerfile": "Dockerfile",
5+
"context": ".."
6+
},
7+
"features": {
8+
"ghcr.io/devcontainers/features/git:1": {},
9+
"ghcr.io/devcontainers/features/github-cli:1": {}
10+
},
11+
"customizations": {
12+
"vscode": {
13+
"extensions": [
14+
"ms-python.python",
15+
"ms-python.vscode-pylance",
16+
"charliermarsh.ruff",
17+
"tamasfe.even-better-toml",
18+
"redhat.vscode-yaml"
19+
],
20+
"settings": {
21+
"python.defaultInterpreterPath": "/workspace/.venv/bin/python",
22+
"python.terminal.activateEnvironment": true,
23+
"[python]": {
24+
"editor.formatOnSave": true,
25+
"editor.codeActionsOnSave": {
26+
"source.fixAll": "explicit",
27+
"source.organizeImports": "explicit"
28+
},
29+
"editor.defaultFormatter": "charliermarsh.ruff"
30+
},
31+
"ruff.path": ["/workspace/.venv/bin/ruff"],
32+
"mypy-type-checker.path": ["/workspace/.venv/bin/mypy"],
33+
"files.exclude": {
34+
"**/__pycache__": true,
35+
"**/.pytest_cache": true,
36+
"**/htmlcov": true,
37+
"**/.coverage": true
38+
},
39+
"terminal.integrated.defaultProfile.linux": "bash"
40+
}
41+
}
42+
},
43+
"postCreateCommand": "./scripts/bootstrap.sh",
44+
"remoteUser": "vscode",
45+
"containerEnv": {
46+
"PYTHONDONTWRITEBYTECODE": "1",
47+
"PYTHONUNBUFFERED": "1"
48+
},
49+
"mounts": [
50+
"source=${localWorkspaceFolder}/runs,target=/workspace/runs,type=bind,consistency=cached",
51+
"source=${localWorkspaceFolder}/tasks,target=/workspace/tasks,type=bind,consistency=cached"
52+
],
53+
"forwardPorts": [8000],
54+
"portsAttributes": {
55+
"8000": {
56+
"label": "FastAPI Server",
57+
"onAutoForward": "notify"
58+
}
59+
},
60+
"workspaceFolder": "/workspace"
61+
}

.env.example

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
# Copy this file to .env and fill in the values to run the harness against OpenRouter.
22
OPENROUTER_API_KEY=replace-with-your-key
3+
4+
# Optional: OpenRouter base URL (defaults to https://openrouter.ai/api/v1)
5+
# OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
6+
37
# Optional: tweak model defaults
48
DEFAULT_MODEL=openrouter/google/gemini-pro
59
# Optional: default temperature for harness runs
610
DEFAULT_TEMPERATURE=0.0
11+
12+
# Optional: default max tokens for harness runs
13+
DEFAULT_MAX_TOKENS=200000
14+
15+
# Optional: expert QA judge model (OpenRouter model id). Leave blank to disable judge.
16+
EXPERT_QA_JUDGE_MODEL=openai/gpt-4o-mini
17+
18+
# Optional: LM Studio base URL (OpenAI-compatible endpoint)
19+
# Defaults to http://127.0.0.1:1234/v1
20+
# LMSTUDIO_BASE_URL=http://127.0.0.1:1234/v1

.github/dependabot.yml

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Dependabot configuration for automated dependency updates
2+
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
3+
4+
version: 2
5+
updates:
6+
# GitHub Actions version updates
7+
# Actions are pinned to commit SHAs for supply-chain security.
8+
# Dependabot automatically updates SHA-pinned actions to new SHAs
9+
# when new versions are released, maintaining both security and freshness.
10+
- package-ecosystem: "github-actions"
11+
directory: "/"
12+
schedule:
13+
interval: "weekly"
14+
day: "monday"
15+
time: "06:00"
16+
timezone: "UTC"
17+
commit-message:
18+
prefix: "ci(deps)"
19+
labels:
20+
- "dependencies"
21+
- "github-actions"
22+
reviewers:
23+
- "tooru" # Repository maintainer
24+
# Group minor/patch updates to reduce PR noise
25+
groups:
26+
github-actions:
27+
patterns:
28+
- "*"
29+
update-types:
30+
- "minor"
31+
- "patch"
32+
33+
# Python server dependencies (pip-tools lockfiles)
34+
- package-ecosystem: "pip"
35+
directory: "/server"
36+
schedule:
37+
interval: "weekly"
38+
day: "tuesday"
39+
time: "06:00"
40+
timezone: "UTC"
41+
commit-message:
42+
prefix: "deps(server)"
43+
labels:
44+
- "dependencies"
45+
- "python"
46+
- "server"
47+
reviewers:
48+
- "tooru"
49+
# Dependabot will update requirements.txt from requirements.in
50+
# Note: After merge, run ./scripts/compile-deps.sh to regenerate lock
51+
groups:
52+
python-server-minor:
53+
patterns:
54+
- "*"
55+
update-types:
56+
- "minor"
57+
- "patch"
58+
59+
# Python harness dependencies (pip-tools lockfiles)
60+
- package-ecosystem: "pip"
61+
directory: "/harness"
62+
schedule:
63+
interval: "weekly"
64+
day: "tuesday"
65+
time: "06:00"
66+
timezone: "UTC"
67+
commit-message:
68+
prefix: "deps(harness)"
69+
labels:
70+
- "dependencies"
71+
- "python"
72+
- "harness"
73+
reviewers:
74+
- "tooru"
75+
groups:
76+
python-harness-minor:
77+
patterns:
78+
- "*"
79+
update-types:
80+
- "minor"
81+
- "patch"
82+
83+
# Python dev dependencies (pip-tools lockfiles)
84+
- package-ecosystem: "pip"
85+
directory: "/"
86+
schedule:
87+
interval: "weekly"
88+
day: "tuesday"
89+
time: "06:00"
90+
timezone: "UTC"
91+
commit-message:
92+
prefix: "deps(dev)"
93+
labels:
94+
- "dependencies"
95+
- "python"
96+
- "dev"
97+
reviewers:
98+
- "tooru"
99+
groups:
100+
python-dev-minor:
101+
patterns:
102+
- "*"
103+
update-types:
104+
- "minor"
105+
- "patch"
106+
107+
# Note: tasks/* workspace dependencies are NOT managed by Dependabot
108+
# They are task-specific and intentionally not part of the repository's
109+
# core dependency management. Each task manages its own dependencies.

.github/workflows/codeql.yml

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
name: CodeQL Analysis
2+
3+
on:
4+
push:
5+
branches: [main, master]
6+
paths:
7+
# Trigger on changes to source code only
8+
- 'server/**'
9+
- 'harness/**'
10+
- 'gui/**'
11+
- 'conftest.py'
12+
pull_request:
13+
branches: [main, master]
14+
paths:
15+
- 'server/**'
16+
- 'harness/**'
17+
- 'gui/**'
18+
- 'conftest.py'
19+
schedule:
20+
# Run weekly on Sundays at 3:00 AM UTC (after nightly scans)
21+
- cron: '0 3 * * 0'
22+
workflow_dispatch:
23+
# Allow manual trigger for debugging
24+
# NOTE: Do NOT use pull_request_target here - it runs with elevated permissions
25+
# from the base branch, which can be exploited by malicious fork PRs
26+
27+
# Only one CodeQL run per branch at a time
28+
concurrency:
29+
group: codeql-${{ github.ref }}
30+
cancel-in-progress: true
31+
32+
# Permissions required for CodeQL and security tab
33+
# Fork PRs: security-events: write is required for SARIF upload but GitHub
34+
# restricts this for fork PRs - CodeQL handles this gracefully
35+
permissions:
36+
security-events: write # Required: upload SARIF results
37+
contents: read # Required: checkout code
38+
actions: read # Required: detect workflow changes
39+
40+
jobs:
41+
analyze:
42+
name: Analyze (${{ matrix.language }})
43+
runs-on: ubuntu-latest
44+
45+
strategy:
46+
fail-fast: false
47+
matrix:
48+
# CodeQL supports: cpp, csharp, go, java, javascript, python, ruby, swift
49+
language: ['python', 'javascript']
50+
51+
steps:
52+
- name: Checkout repository
53+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
54+
55+
# Initialize CodeQL tools for scanning
56+
- name: Initialize CodeQL
57+
uses: github/codeql-action/init@45c373516f557556c15d420e3f5e0aa3d64366bc # v3
58+
with:
59+
languages: ${{ matrix.language }}
60+
# Configure paths to scan (consistent with meta.lintScope)
61+
# and paths to exclude (consistent with meta.excludePatterns)
62+
config: |
63+
paths:
64+
- server
65+
- harness
66+
- gui
67+
- conftest.py
68+
paths-ignore:
69+
- runs
70+
- .venv
71+
- .pytest_cache
72+
- '**/node_modules'
73+
- '**/__pycache__'
74+
- 'tasks/**/workspace'
75+
- '**/*.pyc'
76+
# Use security-and-quality queries for comprehensive analysis
77+
# Brownfield-safe: start with default queries, expand later
78+
queries: +security-and-quality
79+
80+
# For Python, we don't need to build - CodeQL can analyze source directly
81+
# For JavaScript, CodeQL also analyzes source directly (no build needed)
82+
- name: Autobuild
83+
uses: github/codeql-action/autobuild@45c373516f557556c15d420e3f5e0aa3d64366bc # v3
84+
85+
# Perform CodeQL Analysis
86+
- name: Perform CodeQL Analysis
87+
uses: github/codeql-action/analyze@45c373516f557556c15d420e3f5e0aa3d64366bc # v3
88+
with:
89+
category: "/language:${{ matrix.language }}"
90+
# Upload SARIF results to GitHub Security tab
91+
upload: true
92+
# Brownfield-safe: warn-only initially
93+
# To fail on high/critical alerts, add:
94+
# fail-on: error
95+
# For now, we report all findings without failing the job

0 commit comments

Comments
 (0)