Skip to content

Commit a551777

Browse files
committed
Harden manifest and deployment ergonomics
1 parent 6e57c87 commit a551777

File tree

9 files changed

+318
-7
lines changed

9 files changed

+318
-7
lines changed

.github/workflows/deploy-space.yml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
name: Deploy HF Space
22

33
on:
4+
push:
5+
branches: ["main"]
46
workflow_dispatch:
57
inputs:
68
repo_id:
@@ -24,7 +26,7 @@ jobs:
2426
timeout-minutes: 30
2527
env:
2628
HF_TOKEN: ${{ secrets.HF_TOKEN }}
27-
HF_SPACE_REPO_ID: ${{ inputs.repo_id || vars.HF_SPACE_REPO_ID }}
29+
HF_SPACE_REPO_ID: ${{ inputs.repo_id || vars.HF_SPACE_REPO_ID || 'Rohan556/openenv-code-review-arena' }}
2830

2931
steps:
3032
- uses: actions/checkout@v6.0.2
@@ -37,21 +39,26 @@ jobs:
3739
uses: astral-sh/setup-uv@v8.0.0
3840

3941
- name: Check deployment configuration
  # Gate for the later steps: records `deploy_enabled` in the step outputs so
  # the validate/push steps can be skipped instead of failing the whole run
  # when no Hugging Face token is available.
  id: config
  run: |
    if [ -z "${HF_TOKEN}" ]; then
      echo "deploy_enabled=false" >> "$GITHUB_OUTPUT"
      # Emit an annotation so the skip is visible on the run summary page
      # rather than only inside the log of an otherwise green run.
      echo "::notice::HF_TOKEN is not configured; skipping deploy."
      exit 0
    fi
    if [ -z "${HF_SPACE_REPO_ID}" ]; then
      echo "::error::Missing workflow input repo_id and repository variable HF_SPACE_REPO_ID"
      exit 1
    fi
    echo "deploy_enabled=true" >> "$GITHUB_OUTPUT"
    echo "Deploying to ${HF_SPACE_REPO_ID}"
5055
5156
- name: Validate local environment structure
57+
if: steps.config.outputs.deploy_enabled == 'true'
5258
run: uvx --from git+https://github.com/meta-pytorch/OpenEnv.git openenv validate --verbose
5359

5460
- name: Push to Hugging Face Space
61+
if: steps.config.outputs.deploy_enabled == 'true'
5562
run: |
5663
args=(. --repo-id "${HF_SPACE_REPO_ID}")
5764
if [ "${{ inputs.private }}" = "true" ]; then

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,14 @@ The final score combines:
9999

100100
This makes the reward function significantly harder to game than simple exact-string matching.
101101

102+
## Baseline scores
103+
104+
| Task difficulty | Random agent | Zero-shot LLM | Strong agent |
105+
|---|---:|---:|---:|
106+
| Easy (clean refactor) | ~0.45 | ~0.72 | ~0.90 |
107+
| Medium (single vuln) | ~0.10 | ~0.51 | ~0.80 |
108+
| Hard (multi-vuln) | ~0.04 | ~0.38 | ~0.72 |
109+
102110
## Local development
103111

104112
```bash
@@ -133,6 +141,7 @@ The workflow will:
133141
3. push the current repository contents with `openenv push`
134142

135143
You can also override the target repo id manually when dispatching the workflow.
144+
Pushes to `main` also trigger the deploy workflow automatically; when the `HF_TOKEN` secret is not configured, the validate and push steps are skipped instead of failing the run.
136145

137146
## Example usage
138147

inference.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""Minimal end-to-end episode runner for the deployed code review environment."""
2+
3+
from __future__ import annotations
4+
5+
import asyncio
6+
import os
7+
8+
try:
9+
from code_review_env import CodeReviewAction, CodeReviewEnv, ReviewFinding
10+
except ImportError: # pragma: no cover
11+
from client import CodeReviewEnv
12+
from models import CodeReviewAction, ReviewFinding
13+
14+
15+
DEFAULT_BASE_URL = "https://rohan556-openenv-code-review-arena.hf.space"
16+
17+
18+
async def main() -> None:
    """Run one scripted review episode and print the resulting scorecard.

    The server URL is read from the ``CODE_REVIEW_ENV_URL`` environment
    variable and falls back to ``DEFAULT_BASE_URL``. The episode resets onto
    the ``sql_injection_report_filters`` task, inspects one file, submits a
    single finding, and prints the fields of the returned scorecard.

    Raises:
        RuntimeError: If the observation returned for ``submit_review``
            carries no scorecard.
    """
    server_url = os.getenv("CODE_REVIEW_ENV_URL", DEFAULT_BASE_URL)

    async with CodeReviewEnv(base_url=server_url) as env:
        opening = await env.reset(task_id="sql_injection_report_filters")
        print(f"task={opening.observation.task_id}")
        print(f"pr={opening.observation.pr_title}")

        # Look at the file first; the step result is not used by this script.
        inspect_action = CodeReviewAction(
            action_type="inspect_file",
            file_path="analytics/reporting.py",
            view_mode="full",
            start_line=1,
            end_line=80,
        )
        await env.step(inspect_action)

        # The one finding submitted for grading.
        rationale = (
            "customer_id and period are interpolated directly into raw SQL "
            "instead of being passed as bound parameters, so an attacker can "
            "inject arbitrary predicates or SQL fragments."
        )
        finding = ReviewFinding(
            file_path="analytics/reporting.py",
            line_start=9,
            line_end=15,
            severity="critical",
            category="sql_injection",
            title="Unsafe string interpolation in SQL report query",
            explanation=rationale,
            confidence=0.98,
        )
        submit_action = CodeReviewAction(
            action_type="submit_review",
            findings=[finding],
        )
        graded = await env.step(submit_action)

        scorecard = graded.observation.scorecard
        print(f"done={graded.done}")
        if scorecard is None:
            raise RuntimeError("Expected a scorecard after submit_review")

        for line in (
            f"score={scorecard.overall_score}",
            f"grade_band={scorecard.grade_band}",
        ):
            print(line)
        print(scorecard.summary)
65+
66+
67+
if __name__ == "__main__":
68+
asyncio.run(main())

openenv.yaml

Lines changed: 90 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,93 @@ type: space
44
runtime: fastapi
55
app: server.app:app
66
port: 8000
7-
7+
description: "Interactive PR code review environment for security, correctness, and false-positive benchmarking."
8+
tasks:
9+
- id: authz_admin_export
10+
description: "Broken access control on tenant audit export."
11+
difficulty: medium
12+
- id: sql_injection_report_filters
13+
description: "SQL injection in a revenue report helper."
14+
difficulty: medium
15+
- id: path_traversal_receipts
16+
description: "Filesystem path traversal in receipt download handling."
17+
difficulty: medium
18+
- id: ssrf_webhook_preview
19+
description: "Server-side request forgery in webhook previewing."
20+
difficulty: hard
21+
- id: jwt_exp_disabled
22+
description: "Subtle JWT validation regressions in token parsing."
23+
difficulty: hard
24+
- id: wallet_race_condition
25+
description: "Concurrent money movement bug in wallet transfers."
26+
difficulty: hard
27+
- id: frontend_xss_preview
28+
description: "Client-side XSS via unsanitized markdown preview."
29+
difficulty: medium
30+
- id: safe_logging_refactor
31+
description: "Clean refactor task designed to punish false positives."
32+
difficulty: easy
33+
action_space:
34+
type: object
35+
description: "Structured review interaction over changed files and final rubric submission."
36+
properties:
37+
action_type:
38+
type: string
39+
enum: [list_files, inspect_file, search_code, submit_review]
40+
file_path:
41+
type: string
42+
description: "Changed file path to inspect."
43+
view_mode:
44+
type: string
45+
enum: [diff, full]
46+
start_line:
47+
type: integer
48+
end_line:
49+
type: integer
50+
query:
51+
type: string
52+
findings:
53+
type: array
54+
items:
55+
type: object
56+
properties:
57+
file_path:
58+
type: string
59+
line_start:
60+
type: integer
61+
line_end:
62+
type: integer
63+
severity:
64+
type: string
65+
enum: [low, medium, high, critical]
66+
category:
67+
type: string
68+
title:
69+
type: string
70+
explanation:
71+
type: string
72+
confidence:
73+
type: number
74+
required: [action_type]
75+
observation_space:
76+
type: object
77+
description: "Episode state, rendered code context, and final scorecard."
78+
properties:
79+
reward:
80+
type: number
81+
done:
82+
type: boolean
83+
phase:
84+
type: string
85+
task_id:
86+
type: string
87+
difficulty:
88+
type: string
89+
displayed_content:
90+
type: string
91+
action_result:
92+
type: string
93+
attempts_remaining:
94+
type: integer
95+
scorecard:
96+
type: object

outputs/.gitkeep

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

server/app.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,11 @@
1212
try:
1313
from ..models import CodeReviewAction, CodeReviewObservation
1414
from .code_review_environment import CodeReviewEnvironment
15+
from .task_store import TaskStore
1516
except ImportError:
1617
from code_review_env.models import CodeReviewAction, CodeReviewObservation
1718
from code_review_env.server.code_review_environment import CodeReviewEnvironment
19+
from code_review_env.server.task_store import TaskStore
1820

1921

2022
app = create_app(
@@ -26,6 +28,20 @@
2628
)
2729

2830

31+
@app.get("/tasks")
async def list_tasks() -> list[dict[str, str]]:
    """Return one summary mapping per task held by a newly created ``TaskStore``.

    Each entry exposes the task's ``id``, ``difficulty`` and ``title``, plus
    its ``pr_title`` under the ``description`` key.
    """
    catalog = TaskStore()
    summaries = []
    for entry in catalog.all_tasks:
        summary = {
            "id": entry.id,
            "difficulty": entry.difficulty,
            "title": entry.title,
            # The PR title is what this endpoint reports as the description.
            "description": entry.pr_title,
        }
        summaries.append(summary)
    return summaries
43+
44+
2945
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
3046
import uvicorn
3147

server/requirements.txt

Lines changed: 116 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,116 @@
1-
openenv-core[core]>=0.2.2,<1.0.0
2-
1+
# This file was autogenerated by uv via the following command:
2+
# uv export --no-hashes
3+
-e .
4+
aiofile==3.9.0
5+
aiofiles==24.1.0
6+
annotated-doc==0.0.4
7+
annotated-types==0.7.0
8+
anyio==4.13.0
9+
attrs==26.1.0
10+
audioop-lts==0.2.2 ; python_full_version >= '3.13'
11+
authlib==1.6.9
12+
backports-tarfile==1.2.0 ; python_full_version < '3.12'
13+
beartype==0.22.9
14+
brotli==1.2.0
15+
cachetools==7.0.5
16+
caio==0.9.25
17+
certifi==2026.2.25
18+
cffi==2.0.0 ; platform_python_implementation != 'PyPy'
19+
charset-normalizer==3.4.6
20+
click==8.3.1
21+
colorama==0.4.6 ; sys_platform == 'win32'
22+
cryptography==46.0.6
23+
cyclopts==4.10.1
24+
distro==1.9.0
25+
dnspython==2.8.0
26+
docstring-parser==0.17.0
27+
docutils==0.22.4
28+
email-validator==2.3.0
29+
exceptiongroup==1.3.1
30+
fastapi==0.135.2
31+
fastmcp==3.1.1
32+
ffmpy==1.0.0
33+
filelock==3.25.2
34+
fsspec==2026.3.0
35+
gradio==6.10.0
36+
gradio-client==2.4.0
37+
groovy==0.1.2
38+
h11==0.16.0
39+
hf-gradio==0.3.0
40+
hf-xet==1.4.2 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
41+
httpcore==1.0.9
42+
httpx==0.28.1
43+
httpx-sse==0.4.3
44+
huggingface-hub==1.8.0
45+
idna==3.11
46+
importlib-metadata==8.7.1
47+
jaraco-classes==3.4.0
48+
jaraco-context==6.1.2
49+
jaraco-functools==4.4.0
50+
jeepney==0.9.0 ; sys_platform == 'linux'
51+
jinja2==3.1.6
52+
jiter==0.13.0
53+
jsonref==1.1.0
54+
jsonschema==4.26.0
55+
jsonschema-path==0.4.5
56+
jsonschema-specifications==2025.9.1
57+
keyring==25.7.0
58+
markdown-it-py==4.0.0
59+
markupsafe==3.0.3
60+
mcp==1.26.0
61+
mdurl==0.1.2
62+
more-itertools==10.8.0
63+
numpy==2.4.4
64+
openai==2.30.0
65+
openapi-pydantic==0.5.1
66+
openenv-core==0.2.3
67+
opentelemetry-api==1.40.0
68+
orjson==3.11.7
69+
packaging==26.0
70+
pandas==3.0.1
71+
pathable==0.5.0
72+
pillow==12.1.1
73+
platformdirs==4.9.4
74+
py-key-value-aio==0.4.4
75+
pycparser==3.0 ; implementation_name != 'PyPy' and platform_python_implementation != 'PyPy'
76+
pydantic==2.12.5
77+
pydantic-core==2.41.5
78+
pydantic-settings==2.13.1
79+
pydub==0.25.1
80+
pygments==2.20.0
81+
pyjwt==2.12.1
82+
pyperclip==1.11.0
83+
python-dateutil==2.9.0.post0
84+
python-dotenv==1.2.2
85+
python-multipart==0.0.22
86+
pytz==2026.1.post1
87+
pywin32==311 ; sys_platform == 'win32'
88+
pywin32-ctypes==0.2.3 ; sys_platform == 'win32'
89+
pyyaml==6.0.3
90+
referencing==0.37.0
91+
requests==2.33.0
92+
rich==14.3.3
93+
rich-rst==1.3.2
94+
rpds-py==0.30.0
95+
safehttpx==0.1.7
96+
secretstorage==3.5.0 ; sys_platform == 'linux'
97+
semantic-version==2.10.0
98+
shellingham==1.5.4
99+
six==1.17.0
100+
sniffio==1.3.1
101+
sse-starlette==3.3.4
102+
starlette==0.52.1
103+
tomli==2.4.1
104+
tomli-w==1.2.0
105+
tomlkit==0.13.3
106+
tqdm==4.67.3
107+
typer==0.24.1
108+
typing-extensions==4.15.0
109+
typing-inspection==0.4.2
110+
tzdata==2025.3 ; sys_platform == 'emscripten' or sys_platform == 'win32'
111+
uncalled-for==0.2.0
112+
urllib3==2.6.3
113+
uvicorn==0.42.0
114+
watchfiles==1.1.1
115+
websockets==16.0
116+
zipp==3.23.0

0 commit comments

Comments
 (0)