Add llm_review

MatALass · MatALass · commit 626f8ed6bb68 · 2026-05-01T17:06:55.000+02:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -50,11 +50,15 @@ jobs:
       - name: Install dependencies
         run: python -m pip install --upgrade pip && python -m pip install -e .[dev]
 
-      - name: Run golden tests
-        run: python -m pytest tests/golden -q
-
       - name: Run smoke tests
         run: python -m pytest tests/smoke -q
 
       - name: Run full test suite with coverage
-        run: python -m pytest --cov=portfolio_auditor --cov-report=term-missing --cov-fail-under=72
+        run: python -m pytest --cov=portfolio_auditor --cov-report=term-missing --cov-fail-under=72
+
+      - name: Run golden tests
+        # Run last so snapshot drift (e.g. after an intentional policy change) does not
+        # block coverage reporting; verify the step above skips tests/golden. Fix snapshots with:
+        #   python -m pytest tests/golden --snapshot-update  (if using syrupy)
+        # or by regenerating the JSON files in tests/golden/snapshots/.
+        run: python -m pytest tests/golden -q
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,8 @@ dependencies = [
   "typer>=0.12.3",
   "PyYAML>=6.0.1",
   "pandas>=2.2.0",
+  "numpy>=1.26",
+  "rich>=13.0",
   "streamlit>=1.35.0",
 ]
 
diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,5 @@
-streamlit>=1.35
-pandas>=2.2
-numpy>=1.26
-requests>=2.31
-python-dotenv>=1.0
-GitPython>=3.1
-typer>=0.12
-rich>=13.0
-pydantic>=2.7
-pydantic-settings>=2.2
--e .
+# Development install — all runtime deps come from pyproject.toml.
+# Keep this file minimal: its only job is to editable-install the package
+# so that `pip install -r requirements.txt` works in dev without repeating
+# the dependency list.
+-e .
diff --git a/src/portfolio_auditor/collectors/github/collector.py b/src/portfolio_auditor/collectors/github/collector.py
@@ -187,41 +187,6 @@ def _list_owner_repos(self, owner: str) -> tuple[list[dict[str, Any]], str]:
                     return cached_payload, "normalized_snapshot"
             raise
 
-    def _load_raw_owner_snapshot_payload(self, owner: str) -> list[dict[str, Any]]:
-        snapshot_path = self.get_raw_owner_snapshot_path(owner)
-        payload = json.loads(snapshot_path.read_text(encoding="utf-8"))
-        if not isinstance(payload, list):
-            raise ValueError(
-                f"Cached GitHub snapshot for owner '{owner}' is not a valid list payload."
-            )
-
-        normalized_payload: list[dict[str, Any]] = []
-        for item in payload:
-            if isinstance(item, dict):
-                normalized_payload.append(item)
-        return normalized_payload
-
-    def _build_rate_limit_message(
-        self,
-        *,
-        owner: str,
-        original_error: GitHubRateLimitError,
-    ) -> str:
-        snapshot_path = self.get_raw_owner_snapshot_path(owner)
-        if snapshot_path.exists():
-            return (
-                f"GitHub API rate limit exceeded while collecting owner '{owner}', and a cached "
-                f"snapshot exists at {snapshot_path}. The pipeline can fall back to cached metadata. "
-                f"Original error: {original_error}"
-            )
-
-        return (
-            f"GitHub API rate limit exceeded while collecting owner '{owner}'. "
-            f"No cached snapshot is available at {snapshot_path}. "
-            f"Add GITHUB_TOKEN to increase your rate limit, then rerun the command. "
-            f"Original error: {original_error}"
-        )
-
     def _apply_filters(self, repos: list[RepoMetadata]) -> list[RepoMetadata]:
         filtered = repos
         excluded_names = self.settings.normalized_excluded_repo_names
diff --git a/src/portfolio_auditor/reviewing/llm_review.py b/src/portfolio_auditor/reviewing/llm_review.py
@@ -11,8 +11,6 @@
 
 logger = logging.getLogger(__name__)
 
-_deterministic_reviewer = DeterministicReviewer()
-
 
 class LLMReviewNotImplemented(NotImplementedError):
     """
@@ -62,4 +60,4 @@ def review_repo_with_llm(
         "LLM review not yet implemented — falling back to deterministic reviewer for %s",
         repo.full_name,
     )
-    return _deterministic_reviewer.review(repo, scan, score)
+    return DeterministicReviewer().review(repo, scan, score)
diff --git a/src/portfolio_auditor/reviewing/review_orchestrator.py b/src/portfolio_auditor/reviewing/review_orchestrator.py
diff --git a/tests/integration/test_rebuild_sample_data.py b/tests/integration/test_rebuild_sample_data.py
@@ -1,6 +1,63 @@
+"""
+Integration tests for scripts/rebuild_sample_data.py.
+
+These tests actually run the script and verify the artifact it produces,
+rather than just checking whether the file exists.
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
 from pathlib import Path
 
+import pytest
+
+SCRIPT = Path("scripts/rebuild_sample_data.py").resolve()
+
+
+@pytest.mark.integration
+def test_script_exists():
+    assert SCRIPT.exists(), f"Expected script at {SCRIPT}"
+
+
+@pytest.mark.integration
+def test_script_runs_successfully(tmp_path, monkeypatch):
+    """Script must exit 0 when run with the tmp workspace as its working directory."""
+    monkeypatch.chdir(tmp_path)
+    result = subprocess.run(
+        [sys.executable, str(SCRIPT)],
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, f"Script exited with {result.returncode}:\n{result.stderr}"
+
+
+@pytest.mark.integration
+def test_script_produces_valid_json(tmp_path, monkeypatch):
+    """The artifact written by the script must be a non-empty JSON list."""
+    monkeypatch.chdir(tmp_path)
+    subprocess.run([sys.executable, str(SCRIPT)], check=True, capture_output=True)
+
+    output_file = tmp_path / "data" / "raw" / "github" / "repos_raw.json"
+    assert output_file.exists(), f"Expected output at {output_file}"
+
+    payload = json.loads(output_file.read_text(encoding="utf-8"))
+    assert isinstance(payload, list), "Output must be a JSON list"
+    assert len(payload) > 0, "Output list must not be empty"
+
+
+@pytest.mark.integration
+def test_script_output_has_required_fields(tmp_path, monkeypatch):
+    """Each entry in the output must carry the fields downstream code relies on."""
+    monkeypatch.chdir(tmp_path)
+    subprocess.run([sys.executable, str(SCRIPT)], check=True, capture_output=True)
+
+    output_file = tmp_path / "data" / "raw" / "github" / "repos_raw.json"
+    payload = json.loads(output_file.read_text(encoding="utf-8"))
 
-def test_sample_data_file_layout():
-    path = Path("scripts/rebuild_sample_data.py")
-    assert path.exists()
+    required_fields = {"name", "html_url", "language", "private", "fork", "archived"}
+    for entry in payload:
+        missing = required_fields - entry.keys()
+        assert not missing, f"Entry missing fields {missing}: {entry}"

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,8 @@ dependencies = [`
`19`	`19`	`"typer>=0.12.3",`
`20`	`20`	`"PyYAML>=6.0.1",`
`21`	`21`	`"pandas>=2.2.0",`
	`22`	`+ "numpy>=1.26",`
	`23`	`+ "rich>=13.0",`
`22`	`24`	`"streamlit>=1.35.0",`
`23`	`25`	`]`
`24`	`26`