Skip to content
This repository was archived by the owner on Jun 23, 2026. It is now read-only.

Commit c09410a

Browse files
committed
fix: address gemini-code-assist 9th review on PR #1
All 3 findings are legitimate, and all have been applied:
- collect_rss._fetch now returns raw bytes instead of decoded str. feedparser does its own encoding detection (XML prolog charset, BOM, Content-Type) and gzip handling when fed bytes; pre-decoding to UTF-8 with errors='replace' silently defeats that and would also mojibake non-UTF-8 feeds. collect() already passes the result directly to feedparser.parse, so no caller change. [MEDIUM]
- collect_github._get_page reads MAX_RESPONSE_BYTES + 1 and refuses any response that hits the cap, instead of decoding a truncated body with errors='replace' that could feed corrupt JSON into json.loads. The strict utf-8 decode now surfaces a real encoding bug as UnicodeDecodeError (also caught) instead of being papered over. [MEDIUM]
- report.render escapes `>` in URLs to %3E before wrapping in the Markdown `<...>` angle pair. A query string like `?q=a>b` would otherwise close the URL pair early and break link parsing. New test_gt_in_url_is_escaped pins it. [MEDIUM]
1 parent d7b0d3a commit c09410a

4 files changed

Lines changed: 37 additions & 9 deletions

File tree

scripts/awsdd/collect_github.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,24 @@ def _get_page(url: str) -> tuple[list[dict], str | None]:
5050
req = Request(url, headers=_headers())
5151
try:
5252
with urlopen(req, timeout=FETCH_TIMEOUT) as r:
53-
body = r.read(MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
53+
# Read one extra byte so we can detect (and refuse) responses that
54+
# would otherwise be silently truncated mid-multibyte char and yield
55+
# corrupt JSON (and a JSONDecodeError) downstream.
56+
raw = r.read(MAX_RESPONSE_BYTES + 1)
5457
link = r.headers.get("Link")
55-
res = json.loads(body)
58+
if len(raw) > MAX_RESPONSE_BYTES:
59+
print(
60+
f"[collect_github] {url}: response exceeded {MAX_RESPONSE_BYTES} bytes; "
61+
f"skipping (raise per_page or implement narrower paging)"
62+
)
63+
return [], None
64+
# Strict decode so a real encoding bug surfaces instead of being
65+
# masked by errors='replace' that would also corrupt the JSON.
66+
res = json.loads(raw.decode("utf-8"))
5667
except HTTPError as e:
5768
print(f"[collect_github] {url}: HTTP {e.code}")
5869
return [], None
59-
except (URLError, TimeoutError, json.JSONDecodeError) as e:
70+
except (URLError, TimeoutError, UnicodeDecodeError, json.JSONDecodeError) as e:
6071
print(f"[collect_github] {url}: error {e}")
6172
return [], None
6273
# GitHub returns a JSON object (not a list) on error envelopes (rate-limit etc.);

scripts/awsdd/collect_rss.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,17 @@ def _iso(time_struct) -> str:
3333
return datetime(*time_struct[:6], tzinfo=UTC).isoformat()
3434

3535

36-
def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
37-
"""Fetch a feed body with an explicit timeout; None on network failure."""
36+
def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> bytes | None:
37+
"""Fetch a feed body with an explicit timeout; None on network failure.
38+
39+
Returns raw bytes so feedparser can do its own encoding sniffing
40+
(XML prolog charset, HTTP Content-Type, BOM) and gzip handling.
41+
Decoding here would defeat that.
42+
"""
3843
req = Request(url, headers={"User-Agent": USER_AGENT})
3944
try:
4045
with urlopen(req, timeout=timeout) as r:
41-
return r.read(MAX_FEED_BYTES).decode("utf-8", errors="replace")
46+
return r.read(MAX_FEED_BYTES)
4247
except (URLError, TimeoutError) as e:
4348
print(f"[collect_rss] fetch {url}: {e}")
4449
return None

scripts/awsdd/report.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,13 @@ def render(track: str, mode: str) -> None:
6868
# escape both brackets so titles like "[CVE-...] thing" don't
6969
# collide with markdown link parsing
7070
title = (it.get("title") or "(untitled)").replace("[", r"\[").replace("]", r"\]")
71-
url = it.get("url", "")
71+
# angle-bracket the URL: AWS links sometimes contain `(` / `)`,
72+
# which break vanilla Markdown link parsing. Escape any `>` in
73+
# the URL itself so it doesn't terminate the bracket pair.
74+
url = (it.get("url") or "").replace(">", "%3E")
7275
src = it.get("source", "")
7376
pub = (it.get("published_at") or "")[:10]
7477
score = float(it.get("score", 0.0))
75-
# angle-bracket the URL: AWS links sometimes contain `(` / `)`,
76-
# which break vanilla Markdown link parsing.
7778
lines.append(f"- [{title}](<{url}>) — `{src}` · {pub} · **score {score:.2f}**")
7879
lines.append("")
7980

tests/test_report.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,14 @@ def test_no_scored_file_yields_empty_report(make_track):
8282
render("iam", "daily")
8383
body = next((td / "reports" / "daily").glob("*.md")).read_text()
8484
assert "No items at all" in body
85+
86+
87+
def test_gt_in_url_is_escaped(make_track):
88+
# Regression: a `>` in the URL would close the `[t](<url>)` angle pair
89+
# early and break Markdown link parsing.
90+
td = make_track("iam")
91+
(td / "data" / "scored.json").write_text(_scored([_item(url="https://example.com/?q=a>b")]))
92+
render("iam", "daily")
93+
body = next((td / "reports" / "daily").glob("*.md")).read_text()
94+
assert "%3E" in body
95+
assert "a>b" not in body

0 commit comments

Comments
 (0)