Skip to content

Commit 1eaebb6

Browse files
committed
fix: address gemini-code-assist 11th review on PR #1
All 3 findings legitimate, all applied:
- collect_github.collect: add `isinstance(entry, dict)` guard before `entry.get("repo")`. A null or scalar YAML entry (`- foo`) used to raise AttributeError and abort the whole track; now it's logged and skipped. Existing malformed-entry test extended with null and string cases. [MEDIUM]
- collect_rss.collect: same isinstance guard for `feed`. Existing malformed-feed test extended with a `null` entry. [MEDIUM]
- score._compiled_patterns: wrap the per-keyword regex compile in `functools.lru_cache` keyed on the keyword tuple. score_item is called once per item (hundreds per track per run); without caching we'd recompile the same primary/secondary patterns every call. Switched score_item's keyword extraction to tuples so they're hashable for the cache. No behaviour change — same word-bounded semantics, just compiled once per distinct keyword set. [MEDIUM]
1 parent 04fc60a commit 1eaebb6

5 files changed

Lines changed: 36 additions & 14 deletions

File tree

scripts/awsdd/collect_github.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,12 @@ def collect(track: str) -> None:
117117
now = datetime.now(UTC)
118118
items: list[dict] = []
119119
for entry in repos:
120-
# defensive: skip malformed config rather than crashing the whole track
120+
# defensive: skip malformed config rather than crashing the whole track.
121+
# The isinstance guard covers null and scalar entries; .get covers
122+
# dict-shaped but incomplete entries.
123+
if not isinstance(entry, dict):
124+
print(f"[collect_github] skipping non-dict entry: {entry!r}")
125+
continue
121126
repo = entry.get("repo")
122127
if not repo:
123128
print(f"[collect_github] skipping malformed entry: {entry!r}")

scripts/awsdd/collect_rss.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ def collect(track: str) -> None:
149149
for feed in feeds:
150150
# defensive: skip incomplete config entries instead of KeyError-ing the
151151
# whole pipeline if one feed in sources.yaml is missing id or url.
152+
# isinstance guard covers null / scalar entries (e.g. `- foo` in YAML).
153+
if not isinstance(feed, dict):
154+
print(f"[collect_rss] skipping non-dict entry: {feed!r}")
155+
continue
152156
sid, url = feed.get("id"), feed.get("url")
153157
if not sid or not url:
154158
print(f"[collect_rss] skipping malformed entry: {feed!r}")

scripts/awsdd/score.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,26 @@
55
import math
66
import re
77
from datetime import UTC, datetime
8+
from functools import lru_cache
89

910
from ._dates import parse_iso
1011
from .config import load_sources, track_dir
1112

1213
SEVERITY_WEIGHT = {"critical": 3.0, "high": 2.0, "medium": 1.0, "low": 0.5}
1314

1415

15-
def _keyword_hits(keywords: list[str], text: str) -> int:
16-
"""Count word-bounded matches. Substring matching would let `iam` hit
16+
@lru_cache(maxsize=64)
17+
def _compiled_patterns(keywords: tuple[str, ...]) -> tuple[re.Pattern, ...]:
18+
"""Compile word-bounded patterns once per distinct keyword set.
19+
score_item is called per item; without caching we'd recompile the same
20+
patterns hundreds of times per track."""
21+
return tuple(re.compile(rf"\b{re.escape(k)}\b") for k in keywords)
22+
23+
24+
def _keyword_hits(patterns: tuple[re.Pattern, ...], text: str) -> int:
25+
"""Word-bounded match count. Substring matching would let `iam` hit
1726
`diagram` or `sts` hit `tests`, which dilutes the topic signal."""
18-
return sum(1 for k in keywords if re.search(rf"\b{re.escape(k)}\b", text))
27+
return sum(1 for p in patterns if p.search(text))
1928

2029

2130
def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
@@ -25,10 +34,10 @@ def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
2534

2635
text = f"{item.get('title', '')} {item.get('summary', '')}".lower()
2736
kws = sources.get("keywords") or {}
28-
primary = [k.lower() for k in (kws.get("primary") or [])]
29-
secondary = [k.lower() for k in (kws.get("secondary") or [])]
30-
p_hits = _keyword_hits(primary, text)
31-
s_hits = _keyword_hits(secondary, text)
37+
primary = tuple(k.lower() for k in (kws.get("primary") or []))
38+
secondary = tuple(k.lower() for k in (kws.get("secondary") or []))
39+
p_hits = _keyword_hits(_compiled_patterns(primary), text)
40+
s_hits = _keyword_hits(_compiled_patterns(secondary), text)
3241
keyword = p_hits * 2.0 + s_hits * 0.5
3342

3443
weights = sources.get("source_weights") or {}

tests/test_collect_github.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,14 +70,16 @@ def test_collect_skips_malformed_repo_entry(make_track, monkeypatch, capsys):
7070
sources_yaml="""
7171
github:
7272
- repo: aws/aws-cli
73-
- per_page: 5 # no repo key
73+
- per_page: 5 # no repo key (dict shape but missing repo)
74+
- null # not a dict at all
75+
- "just a string" # also not a dict
7476
""",
7577
)
76-
# stub _get_all so we don't actually hit the API
7778
calls: list[str] = []
7879
monkeypatch.setattr(collect_github, "_get_all", lambda path: calls.append(path) or [])
7980
collect_github.collect("test")
8081
# only the well-formed entry is fetched
8182
assert calls == ["/repos/aws/aws-cli/releases?per_page=50"]
8283
out = capsys.readouterr().out
83-
assert "skipping malformed entry" in out
84+
assert out.count("skipping non-dict entry") == 2
85+
assert out.count("skipping malformed entry") == 1

tests/test_collect_rss.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,15 @@ def test_collect_skips_malformed_feed_entry(make_track, monkeypatch, capsys):
103103
rss:
104104
- id: good
105105
url: https://example.com/good
106-
- url: https://example.com/orphan # no id
107-
- id: noisy # no url
106+
- url: https://example.com/orphan # dict shape, no id
107+
- id: noisy # dict shape, no url
108+
- null # not a dict at all
108109
""",
109110
)
110111
fetches: list[str] = []
111112
monkeypatch.setattr(collect_rss, "_fetch", lambda url, **k: fetches.append(url) or None)
112113
collect_rss.collect("test")
113114
assert fetches == ["https://example.com/good"]
114115
out = capsys.readouterr().out
115-
assert out.count("skipping malformed entry") == 2
116+
assert out.count("skipping malformed entry") == 2 # missing-key entries
117+
assert out.count("skipping non-dict entry") == 1 # null entry

0 commit comments

Comments
 (0)