Skip to content

Commit dbd065a

Browse files
committed
fix: address gemini-code-assist 6th review on PR #1
All 6 findings legitimate, all applied (with regression tests): - collect_github._get_page caps r.read at 10 MiB (MAX_RESPONSE_BYTES), same DoS guard as collect_rss. [MEDIUM] - collect_github now follows Link.rel="next" up to MAX_PAGES=5 pages via _get_all + _next_url, and default per_page bumped 20→50. A busy repo (aws-cli, aws-cdk) could previously lose releases beyond page 1; with daily CI the new bound covers all realistic catch-up windows. Three tests pin the Link-header parser. [MEDIUM] - collect_rss._strip_html now uses stdlib html.parser instead of a regex. A real parser handles `<`/`>` inside text content (the old regex chomped through "1 < 2 and 4 > 3"), and a _TextExtractor that tracks <script>/<style> depth drops their contents entirely. Two regression tests cover both cases. [MEDIUM] - collect_rss._severity now matches with `\b(critical|high|medium| low)\b`. The old substring check turned "Slow performance" into a low-severity item and "Highlighted feature" into a high one. [MEDIUM] - score._keyword_hits uses `\b{re.escape(keyword)}\b`. The old substring check let `iam` hit "diagram" and `sts` hit "tests". Two regression tests lock in that "updated diagram and test results" scores zero and "iam supports sts session tags" scores correctly. [MEDIUM] - web/src/lib/data.ts keeps the process.cwd()-based ROOT (we validated that import.meta.url breaks under Astro's bundler in a prior attempt), but now asserts at module load that ROOT/tracks exists and throws a clear error if invoked from the wrong cwd, so the previous silent "no items yet" failure mode becomes loud. [MEDIUM] Side effects of the word-bounded scoring change: all four tracks were re-scored against the existing normalized.json snapshots and seed reports regenerated. Top items shift slightly (substring-only hits no longer count) but the change is purely a precision improvement.
1 parent 52a8e41 commit dbd065a

15 files changed

Lines changed: 4712 additions & 4564 deletions

File tree

scripts/awsdd/collect_github.py

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hashlib
55
import json
66
import os
7+
import re
78
from datetime import UTC, datetime
89
from urllib.error import HTTPError, URLError
910
from urllib.request import Request, urlopen
@@ -13,34 +14,68 @@
1314

1415
API = "https://api.github.com"
1516
USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
17+
FETCH_TIMEOUT = 30
18+
MAX_RESPONSE_BYTES = 10 * 1024 * 1024 # 10 MiB safety cap per page
19+
MAX_PAGES = 5 # follow Link.rel="next" up to this many pages per repo
1620

1721

1822
def _id(url: str) -> str:
1923
return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
2024

2125

22-
def _get(path: str) -> list[dict]:
23-
headers = {
26+
def _headers() -> dict[str, str]:
27+
h = {
2428
"Accept": "application/vnd.github+json",
2529
"X-GitHub-Api-Version": "2022-11-28",
2630
"User-Agent": USER_AGENT,
2731
}
2832
token = os.environ.get("GITHUB_TOKEN")
2933
if token:
30-
headers["Authorization"] = f"Bearer {token}"
31-
req = Request(f"{API}{path}", headers=headers)
34+
h["Authorization"] = f"Bearer {token}"
35+
return h
36+
37+
38+
_NEXT_LINK_RE = re.compile(r'<([^>]+)>;\s*rel="next"')
39+
40+
41+
def _next_url(link_header: str | None) -> str | None:
42+
if not link_header:
43+
return None
44+
m = _NEXT_LINK_RE.search(link_header)
45+
return m.group(1) if m else None
46+
47+
48+
def _get_page(url: str) -> tuple[list[dict], str | None]:
49+
"""Fetch one page. Returns (items, next_url). Empty list + None on error."""
50+
req = Request(url, headers=_headers())
3251
try:
33-
with urlopen(req, timeout=30) as r:
34-
res = json.loads(r.read().decode("utf-8"))
35-
# GitHub returns a JSON object (not a list) on error envelopes
36-
# (rate-limit, 404, etc.); guard so callers can iterate safely.
37-
return res if isinstance(res, list) else []
52+
with urlopen(req, timeout=FETCH_TIMEOUT) as r:
53+
body = r.read(MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
54+
link = r.headers.get("Link")
55+
res = json.loads(body)
3856
except HTTPError as e:
39-
print(f"[collect_github] {path}: HTTP {e.code}")
40-
return []
57+
print(f"[collect_github] {url}: HTTP {e.code}")
58+
return [], None
4159
except (URLError, TimeoutError, json.JSONDecodeError) as e:
42-
print(f"[collect_github] {path}: error {e}")
43-
return []
60+
print(f"[collect_github] {url}: error {e}")
61+
return [], None
62+
# GitHub returns a JSON object (not a list) on error envelopes (rate-limit etc.);
63+
# guard so callers can iterate safely.
64+
items = res if isinstance(res, list) else []
65+
return items, _next_url(link)
66+
67+
68+
def _get_all(path: str) -> list[dict]:
69+
"""Follow Link.rel="next" up to MAX_PAGES pages."""
70+
url = f"{API}{path}"
71+
out: list[dict] = []
72+
for _ in range(MAX_PAGES):
73+
items, nxt = _get_page(url)
74+
out.extend(items)
75+
if not nxt:
76+
break
77+
url = nxt
78+
return out
4479

4580

4681
def release_to_item(rel: dict, repo: str, track: str, now: datetime) -> dict | None:
@@ -72,8 +107,8 @@ def collect(track: str) -> None:
72107
items: list[dict] = []
73108
for entry in repos:
74109
repo = entry["repo"]
75-
per_page = entry.get("per_page", 20)
76-
releases = _get(f"/repos/{repo}/releases?per_page={per_page}")
110+
per_page = entry.get("per_page", 50)
111+
releases = _get_all(f"/repos/{repo}/releases?per_page={per_page}")
77112
for rel in releases:
78113
item = release_to_item(rel, repo, track, now)
79114
if item is not None:

scripts/awsdd/collect_rss.py

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77
import re
88
from datetime import UTC, datetime
9+
from html.parser import HTMLParser
910
from urllib.error import URLError
1011
from urllib.request import Request, urlopen
1112

@@ -43,14 +44,47 @@ def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
4344
return None
4445

4546

47+
class _TextExtractor(HTMLParser):
48+
"""Pull text content out of HTML, dropping <script>/<style> bodies entirely.
49+
50+
Uses stdlib html.parser so it correctly handles entities and `<` / `>`
51+
inside text content (e.g. "1 < 2 and 4 > 3"), which a regex strip would
52+
chomp.
53+
"""
54+
55+
_DROP_CONTENT = frozenset({"script", "style"})
56+
57+
def __init__(self) -> None:
58+
super().__init__(convert_charrefs=True)
59+
self.parts: list[str] = []
60+
self._drop_depth = 0
61+
62+
def handle_starttag(self, tag: str, attrs: list) -> None:
63+
if tag in self._DROP_CONTENT:
64+
self._drop_depth += 1
65+
66+
def handle_endtag(self, tag: str) -> None:
67+
if tag in self._DROP_CONTENT and self._drop_depth > 0:
68+
self._drop_depth -= 1
69+
70+
def handle_data(self, data: str) -> None:
71+
if self._drop_depth == 0:
72+
self.parts.append(data)
73+
74+
4675
def _strip_html(text: str) -> str:
47-
# Unescape FIRST so entity-encoded tags like `&lt;script&gt;` are turned
48-
# into their bracketed form before the strip pass; otherwise they bypass
49-
# the regex and leak through into the report as raw HTML.
76+
# Unescape FIRST so entity-encoded tags like `&lt;script&gt;` become
77+
# real tags the parser can recognise (otherwise they'd survive as text).
5078
text = html.unescape(text)
51-
text = re.sub(r"<[^>]*>", " ", text)
52-
text = re.sub(r"\s+", " ", text)
53-
return text.strip()
79+
parser = _TextExtractor()
80+
try:
81+
parser.feed(text)
82+
parser.close()
83+
except Exception:
84+
# html.parser is fairly tolerant; if it bails on truly malformed input,
85+
# fall back to the raw text rather than dropping the whole summary.
86+
return re.sub(r"\s+", " ", text).strip()
87+
return re.sub(r"\s+", " ", "".join(parser.parts)).strip()
5488

5589

5690
def _summary(entry) -> str:
@@ -62,12 +96,15 @@ def _title(entry) -> str:
6296
return html.unescape((entry.get("title") or "").strip())
6397

6498

99+
_SEVERITY_RE = re.compile(r"\b(critical|high|medium|low)\b", re.IGNORECASE)
100+
101+
65102
def _severity(entry) -> str | None:
66-
title = (entry.get("title") or "").lower()
67-
for s in ("critical", "high", "medium", "low"):
68-
if s in title:
69-
return s
70-
return None
103+
# Word-boundary match: avoids "Slow performance" hitting "low" and
104+
# "Highlighted" hitting "high".
105+
title = entry.get("title") or ""
106+
m = _SEVERITY_RE.search(title)
107+
return m.group(1).lower() if m else None
71108

72109

73110
def entry_to_item(entry, sid: str, track: str, now: datetime) -> dict | None:

scripts/awsdd/score.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import argparse
44
import json
55
import math
6+
import re
67
from datetime import UTC, datetime
78

89
from ._dates import parse_iso
@@ -11,6 +12,12 @@
1112
SEVERITY_WEIGHT = {"critical": 3.0, "high": 2.0, "medium": 1.0, "low": 0.5}
1213

1314

15+
def _keyword_hits(keywords: list[str], text: str) -> int:
16+
"""Count word-bounded matches. Substring matching would let `iam` hit
17+
`diagram` or `sts` hit `tests`, which dilutes the topic signal."""
18+
return sum(1 for k in keywords if re.search(rf"\b{re.escape(k)}\b", text))
19+
20+
1421
def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
1522
pub = parse_iso(item.get("published_at", ""))
1623
days = max(0.0, (now - pub).total_seconds() / 86400.0)
@@ -20,8 +27,8 @@ def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
2027
kws = sources.get("keywords") or {}
2128
primary = [k.lower() for k in (kws.get("primary") or [])]
2229
secondary = [k.lower() for k in (kws.get("secondary") or [])]
23-
p_hits = sum(1 for k in primary if k in text)
24-
s_hits = sum(1 for k in secondary if k in text)
30+
p_hits = _keyword_hits(primary, text)
31+
s_hits = _keyword_hits(secondary, text)
2532
keyword = p_hits * 2.0 + s_hits * 0.5
2633

2734
weights = sources.get("source_weights") or {}

tests/test_collect_github.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import json
44

5-
from awsdd.collect_github import release_to_item
5+
from awsdd.collect_github import _next_url, release_to_item
66

77
from .conftest import FIXTURES, NOW
88

@@ -42,3 +42,21 @@ def test_falls_back_to_tag_name_when_no_name():
4242
}
4343
item = release_to_item(rel, "x/y", "releases", NOW)
4444
assert item["title"] == "v1"
45+
46+
47+
def test_next_url_parses_link_header():
48+
link = (
49+
'<https://api.github.com/repos/x/y/releases?page=2>; rel="next", '
50+
'<https://api.github.com/repos/x/y/releases?page=5>; rel="last"'
51+
)
52+
assert _next_url(link) == "https://api.github.com/repos/x/y/releases?page=2"
53+
54+
55+
def test_next_url_returns_none_when_no_next():
56+
link = '<https://api.github.com/repos/x/y/releases?page=5>; rel="last"'
57+
assert _next_url(link) is None
58+
59+
60+
def test_next_url_handles_missing_header():
61+
assert _next_url(None) is None
62+
assert _next_url("") is None

tests/test_collect_rss.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,22 @@ def test_summary_strips_entity_encoded_tags():
6363
assert "<script>" not in out
6464
assert "&lt;" not in out
6565
assert "hi" in out
66+
67+
68+
def test_summary_drops_script_body():
69+
# Regression: with the html.parser switch, content inside <script>/<style>
70+
# tags is dropped entirely instead of leaking as text.
71+
payload = "before<script>alert(1)</script>after"
72+
fake = type("E", (), {"get": lambda self, k, d=None: payload if k == "summary" else d})()
73+
out = _summary(fake)
74+
assert "alert" not in out
75+
assert "before" in out and "after" in out
76+
77+
78+
def test_summary_preserves_lt_gt_in_text():
79+
# Regression: the old regex strip chomped through "1 < 2 and 4 > 3"
80+
# because `<[^>]*>` matched across stray angle brackets.
81+
payload = "if 1 < 2 and 4 > 3 then ok"
82+
fake = type("E", (), {"get": lambda self, k, d=None: payload if k == "summary" else d})()
83+
out = _summary(fake)
84+
assert "1" in out and "2" in out and "3" in out and "4" in out and "ok" in out

tests/test_score.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,25 @@ def test_corrupted_date_falls_back_to_epoch_not_now():
8686
item = _item(published_at="this is not a date")
8787
b = score_item(item, SOURCES_IAM, NOW)
8888
assert b["freshness"] < 0.01
89+
90+
91+
def test_keyword_match_is_word_bounded():
92+
# Regression: substring matching let `iam` hit `diagram` and `sts` hit
93+
# `tests`. With word boundaries, neither should match.
94+
item = _item(
95+
title="updated diagram and test results for hosts",
96+
source="rss:aws-whats-new",
97+
)
98+
b = score_item(item, SOURCES_IAM, NOW)
99+
assert b["keyword"] == 0.0
100+
assert b["keyword_signal"] == 0.0
101+
102+
103+
def test_keyword_match_hits_exact_words():
104+
item = _item(
105+
title="iam supports sts session tags",
106+
source="rss:aws-whats-new",
107+
)
108+
b = score_item(item, SOURCES_IAM, NOW)
109+
# both "iam" and "sts" are secondary keywords: 2 hits * 0.5 = 1.0
110+
assert b["keyword"] == 1.0

0 commit comments

Comments
 (0)