fix: address gemini-code-assist 11th review on PR #1

kanywst · kanywst · commit 1eaebb63a2be · 2026-05-17T22:57:15.000+09:00
All 3 findings legitimate, all applied:

- collect_github.collect: add `isinstance(entry, dict)` guard before
  `entry.get("repo")`. A null or scalar YAML entry (e.g. `- null` or
  `- foo`) used to raise AttributeError and abort the whole track; now
  it's logged and skipped. Existing malformed-entry test extended with
  null and string cases. [MEDIUM]

- collect_rss.collect: same isinstance guard for `feed`. Existing
  malformed-feed test extended with a `null` entry. [MEDIUM]

- score._compiled_patterns: wrap the per-keyword regex compile in
  `functools.lru_cache` keyed on the keyword tuple. score_item is
  called once per item (hundreds per track per run); without caching
  we'd re-escape, re-format and re-look-up the same primary/secondary
  patterns on every call (the `re` module's internal cache avoids a
  full recompile, but the per-keyword string building and cache lookup
  still repeat). Switched score_item's keyword extraction to tuples so
  they're hashable for the cache. No behaviour change — same
  word-bounded semantics, just compiled once per distinct keyword set.
  [MEDIUM]
diff --git a/scripts/awsdd/collect_github.py b/scripts/awsdd/collect_github.py
@@ -117,7 +117,12 @@ def collect(track: str) -> None:
     now = datetime.now(UTC)
     items: list[dict] = []
     for entry in repos:
-        # defensive: skip malformed config rather than crashing the whole track
+        # defensive: skip malformed config rather than crashing the whole track.
+        # The isinstance guard covers null and scalar entries; .get covers
+        # dict-shaped but incomplete entries.
+        if not isinstance(entry, dict):
+            print(f"[collect_github] skipping non-dict entry: {entry!r}")
+            continue
         repo = entry.get("repo")
         if not repo:
             print(f"[collect_github] skipping malformed entry: {entry!r}")
diff --git a/scripts/awsdd/collect_rss.py b/scripts/awsdd/collect_rss.py
@@ -149,6 +149,10 @@ def collect(track: str) -> None:
     for feed in feeds:
         # defensive: skip incomplete config entries instead of KeyError-ing the
         # whole pipeline if one feed in sources.yaml is missing id or url.
+        # isinstance guard covers null / scalar entries (e.g. `- foo` in YAML).
+        if not isinstance(feed, dict):
+            print(f"[collect_rss] skipping non-dict entry: {feed!r}")
+            continue
         sid, url = feed.get("id"), feed.get("url")
         if not sid or not url:
             print(f"[collect_rss] skipping malformed entry: {feed!r}")
diff --git a/scripts/awsdd/score.py b/scripts/awsdd/score.py
@@ -5,17 +5,26 @@
 import math
 import re
 from datetime import UTC, datetime
+from functools import lru_cache
 
 from ._dates import parse_iso
 from .config import load_sources, track_dir
 
 SEVERITY_WEIGHT = {"critical": 3.0, "high": 2.0, "medium": 1.0, "low": 0.5}
 
 
-def _keyword_hits(keywords: list[str], text: str) -> int:
-    """Count word-bounded matches. Substring matching would let `iam` hit
+@lru_cache(maxsize=64)
+def _compiled_patterns(keywords: tuple[str, ...]) -> tuple[re.Pattern, ...]:
+    """Compile word-bounded patterns once per distinct keyword set.
+    score_item is called per item; without caching we'd recompile the same
+    patterns hundreds of times per track."""
+    return tuple(re.compile(rf"\b{re.escape(k)}\b") for k in keywords)
+
+
+def _keyword_hits(patterns: tuple[re.Pattern, ...], text: str) -> int:
+    """Word-bounded match count. Substring matching would let `iam` hit
     `diagram` or `sts` hit `tests`, which dilutes the topic signal."""
-    return sum(1 for k in keywords if re.search(rf"\b{re.escape(k)}\b", text))
+    return sum(1 for p in patterns if p.search(text))
 
 
 def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
@@ -25,10 +34,10 @@ def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
 
     text = f"{item.get('title', '')} {item.get('summary', '')}".lower()
     kws = sources.get("keywords") or {}
-    primary = [k.lower() for k in (kws.get("primary") or [])]
-    secondary = [k.lower() for k in (kws.get("secondary") or [])]
-    p_hits = _keyword_hits(primary, text)
-    s_hits = _keyword_hits(secondary, text)
+    primary = tuple(k.lower() for k in (kws.get("primary") or []))
+    secondary = tuple(k.lower() for k in (kws.get("secondary") or []))
+    p_hits = _keyword_hits(_compiled_patterns(primary), text)
+    s_hits = _keyword_hits(_compiled_patterns(secondary), text)
     keyword = p_hits * 2.0 + s_hits * 0.5
 
     weights = sources.get("source_weights") or {}
diff --git a/tests/test_collect_github.py b/tests/test_collect_github.py
@@ -70,14 +70,16 @@ def test_collect_skips_malformed_repo_entry(make_track, monkeypatch, capsys):
         sources_yaml="""
 github:
   - repo: aws/aws-cli
-  - per_page: 5  # no repo key
+  - per_page: 5  # no repo key (dict shape but missing repo)
+  - null  # not a dict at all
+  - "just a string"  # also not a dict
 """,
     )
-    # stub _get_all so we don't actually hit the API
     calls: list[str] = []
     monkeypatch.setattr(collect_github, "_get_all", lambda path: calls.append(path) or [])
     collect_github.collect("test")
     # only the well-formed entry is fetched
     assert calls == ["/repos/aws/aws-cli/releases?per_page=50"]
     out = capsys.readouterr().out
-    assert "skipping malformed entry" in out
+    assert out.count("skipping non-dict entry") == 2
+    assert out.count("skipping malformed entry") == 1
diff --git a/tests/test_collect_rss.py b/tests/test_collect_rss.py
@@ -103,13 +103,15 @@ def test_collect_skips_malformed_feed_entry(make_track, monkeypatch, capsys):
 rss:
   - id: good
     url: https://example.com/good
-  - url: https://example.com/orphan  # no id
-  - id: noisy  # no url
+  - url: https://example.com/orphan  # dict shape, no id
+  - id: noisy  # dict shape, no url
+  - null  # not a dict at all
 """,
     )
     fetches: list[str] = []
     monkeypatch.setattr(collect_rss, "_fetch", lambda url, **k: fetches.append(url) or None)
     collect_rss.collect("test")
     assert fetches == ["https://example.com/good"]
     out = capsys.readouterr().out
-    assert out.count("skipping malformed entry") == 2
+    assert out.count("skipping malformed entry") == 2  # missing-key entries
+    assert out.count("skipping non-dict entry") == 1  # null entry