# Code Review Arena for OpenEnv

`code_review_env` is a production-style OpenEnv benchmark for pull-request review.
Instead of toy gameplay, the agent reviews realistic code changes for security,
correctness, reliability, and quality regressions, then submits a structured review.

This repository is designed to score well on the four hackathon judging axes:

1. Runtime correctness
2. OpenEnv interface compliance
3. Task design quality
4. Grading logic sophistication

## Why this environment is competitive

- Real-world task: PR review is a daily engineering workflow with direct product value.
- Deterministic benchmark: no external APIs, no flaky third-party services, no hidden randomness.
- Rich interaction loop: the agent can list changed files, inspect diffs or full files, search code, and submit a final review.
- Sophisticated grading: optimal finding-to-rubric matching, severity weighting, line tolerance, semantic keyword checks, duplicate detection, and false-positive penalties.
- Judge-friendly packaging: standalone OpenEnv environment with Docker, tests, client, and CI validation.

## Benchmark design

The built-in corpus contains realistic PR tasks across:

- Broken access control
- SQL injection
- Path traversal
- SSRF
- JWT validation mistakes
- Concurrency and race conditions
- Client-side XSS
- False-positive control via a clean refactor task

Each task includes:

- PR title and description
- Changed file summaries
- Unified diff snippets
- Full changed-file contents for inspection
- CI summary
- Hidden reference findings used by the grader
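
As a rough sketch, a single task record carrying those fields might look like the Python dict below. The field names mirror the list above, but the exact schema, key names, and values here are illustrative assumptions, not the repository's actual bundle format:

```python
# Hypothetical task record; the real bundle schema may differ.
example_task = {
    "task_id": "sql_injection_report_filters",
    "pr_title": "Add customer report filters",
    "pr_description": "Adds period and customer filters to reporting queries.",
    "changed_files": ["analytics/reporting.py"],
    "diff_snippets": {"analytics/reporting.py": "@@ -20,6 +20,12 @@ (unified diff)"},
    "file_contents": {"analytics/reporting.py": "(full file text)"},
    "ci_summary": "all checks passed",
    # Hidden from the agent; consumed only by the grader.
    "reference_findings": [
        {
            "file_path": "analytics/reporting.py",
            "line_start": 24,
            "line_end": 31,
            "severity": "critical",
            "category": "sql_injection",
        }
    ],
}
```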

## Action space

The environment accepts one typed `CodeReviewAction` with these `action_type` values:

- `list_files`
- `inspect_file`
- `search_code`
- `submit_review`

Final review submissions are a list of structured findings:

- `file_path`
- `line_start`
- `line_end`
- `severity`
- `category`
- `title`
- `explanation`
- `confidence`

## Scoring model

The grader uses optimal one-to-one matching between submitted findings and reference findings.
Each candidate match blends:

- file/path agreement
- line alignment with tolerance
- category normalization and alias matching
- severity agreement
- title/explanation semantic coverage

The final score combines:

- coverage of true issues
- precision of submitted issues
- efficiency from staying within the review budget
- penalties for false positives
- penalties for duplicates
- penalties for missing high-severity findings

This makes the reward function significantly harder to game than simple exact-string matching.
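
The matching idea can be sketched with a small stdlib-only example: score every (submitted, reference) pair, then pick the one-to-one assignment that maximizes total score. The pair-scoring weights below are illustrative toy values, not the grader's actual coefficients:

```python
from itertools import permutations


def pair_score(sub: dict, ref: dict, line_tol: int = 5) -> float:
    """Blend file, line, category, and severity agreement (toy weights)."""
    if sub["file_path"] != ref["file_path"]:
        return 0.0
    score = 0.4  # file agreement
    if abs(sub["line_start"] - ref["line_start"]) <= line_tol:
        score += 0.3  # line alignment within tolerance
    if sub["category"] == ref["category"]:
        score += 0.2  # category agreement (real grader also normalizes aliases)
    if sub["severity"] == ref["severity"]:
        score += 0.1  # severity agreement
    return score


def best_assignment(subs: list[dict], refs: list[dict]) -> float:
    """Optimal one-to-one matching by exhaustive search.

    Exhaustive permutation search is fine at review scale (a handful of
    findings per PR); a production grader might use the Hungarian algorithm.
    """
    if not subs or not refs:
        return 0.0
    if len(subs) > len(refs):
        subs, refs = refs, subs  # pair_score is symmetric in the fields it reads
    best = 0.0
    for perm in permutations(range(len(refs)), len(subs)):
        total = sum(pair_score(subs[i], refs[j]) for i, j in enumerate(perm))
        best = max(best, total)
    return best
```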

## Local development

```bash
uv sync --extra dev
uv run pytest
uv run server --port 8000
```

Validate structure with OpenEnv:

```bash
openenv validate --verbose
openenv validate http://127.0.0.1:8000
```

## Example usage

```python
import asyncio

from code_review_env import CodeReviewAction, CodeReviewEnv, ReviewFinding


async def main() -> None:
    async with CodeReviewEnv(base_url="http://127.0.0.1:8000") as env:
        result = await env.reset(task_id="sql_injection_report_filters")
        print(result.observation.pr_title)

        await env.step(
            CodeReviewAction(
                action_type="inspect_file",
                file_path="analytics/reporting.py",
                view_mode="full",
                start_line=1,
                end_line=120,
            )
        )

        graded = await env.step(
            CodeReviewAction(
                action_type="submit_review",
                findings=[
                    ReviewFinding(
                        file_path="analytics/reporting.py",
                        line_start=24,
                        line_end=31,
                        severity="critical",
                        category="sql_injection",
                        title="Unsafe string interpolation in SQL query",
                        explanation=(
                            "customer_id and period are inserted directly into SQL, "
                            "so an attacker can change the query instead of using "
                            "parameter binding."
                        ),
                        confidence=0.95,
                    )
                ],
            )
        )
        print(graded.observation.scorecard.overall_score)


asyncio.run(main())
```

## Hugging Face / Space deployment

The environment is ready for:

```bash
openenv push --repo-id <your-hf-space>
```

No API keys are required for the benchmark itself. If you later want a private rubric
bundle for leaderboard use, you can point `CODE_REVIEW_TASK_BUNDLE_PATH` at a private
JSON file without changing the environment interface.
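
For example, a private bundle could be supplied at launch like this (the file path is a placeholder; only the variable name comes from above):

```shell
CODE_REVIEW_TASK_BUNDLE_PATH=/secrets/private_tasks.json uv run server --port 8000
```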