fix: address gemini-code-assist 9th review on PR #1

kanywst · kanywst · commit c09410a738c6 · 2026-05-17T22:57:15.000+09:00
All 3 findings legitimate, all applied:

- collect_rss._fetch now returns raw bytes instead of decoded str.
  feedparser does its own encoding detection (XML prolog charset, BOM,
  Content-Type) and gzip handling when fed bytes; pre-decoding to
  UTF-8 with errors='replace' silently defeats that and would also
  mojibake non-UTF-8 feeds. collect() already passes the result
  directly to feedparser.parse, so no caller change. [MEDIUM]

- collect_github._get_page reads MAX_RESPONSE_BYTES + 1 and refuses
  any response that exceeds the cap, instead of decoding a truncated
  body with errors='replace' that could feed corrupt JSON into
  json.loads. The strict utf-8 decode now surfaces a real encoding
  bug as UnicodeDecodeError (also caught) instead of being papered
  over. [MEDIUM]

- report.render escapes `>` in URLs to %3E before wrapping in the
  Markdown `<...>` angle pair. A query string like `?q=a>b` would
  otherwise close the URL pair early and break link parsing. New
  test_gt_in_url_is_escaped pins it. [MEDIUM]
diff --git a/scripts/awsdd/collect_github.py b/scripts/awsdd/collect_github.py
@@ -50,13 +50,24 @@ def _get_page(url: str) -> tuple[list[dict], str | None]:
     req = Request(url, headers=_headers())
     try:
         with urlopen(req, timeout=FETCH_TIMEOUT) as r:
-            body = r.read(MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
+            # Read one extra byte so we can detect (and refuse) responses that
+            # would otherwise be silently truncated mid-multibyte char and yield
+            # a corrupt JSONDecodeError downstream.
+            raw = r.read(MAX_RESPONSE_BYTES + 1)
             link = r.headers.get("Link")
-        res = json.loads(body)
+        if len(raw) > MAX_RESPONSE_BYTES:
+            print(
+                f"[collect_github] {url}: response exceeded {MAX_RESPONSE_BYTES} bytes; "
+                f"skipping (raise per_page or implement narrower paging)"
+            )
+            return [], None
+        # Strict decode so a real encoding bug surfaces instead of being
+        # masked by errors='replace' that would also corrupt the JSON.
+        res = json.loads(raw.decode("utf-8"))
     except HTTPError as e:
         print(f"[collect_github] {url}: HTTP {e.code}")
         return [], None
-    except (URLError, TimeoutError, json.JSONDecodeError) as e:
+    except (URLError, TimeoutError, UnicodeDecodeError, json.JSONDecodeError) as e:
         print(f"[collect_github] {url}: error {e}")
         return [], None
     # GitHub returns a JSON object (not a list) on error envelopes (rate-limit etc.);
diff --git a/scripts/awsdd/collect_rss.py b/scripts/awsdd/collect_rss.py
@@ -33,12 +33,17 @@ def _iso(time_struct) -> str:
     return datetime(*time_struct[:6], tzinfo=UTC).isoformat()
 
 
-def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
-    """Fetch a feed body with an explicit timeout; None on network failure."""
+def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> bytes | None:
+    """Fetch a feed body with an explicit timeout; None on network failure.
+
+    Returns raw bytes so feedparser can do its own encoding sniffing
+    (XML prolog charset, HTTP Content-Type, BOM) and gzip handling.
+    Decoding here would defeat that.
+    """
     req = Request(url, headers={"User-Agent": USER_AGENT})
     try:
         with urlopen(req, timeout=timeout) as r:
-            return r.read(MAX_FEED_BYTES).decode("utf-8", errors="replace")
+            return r.read(MAX_FEED_BYTES)
     except (URLError, TimeoutError) as e:
         print(f"[collect_rss] fetch {url}: {e}")
         return None
diff --git a/scripts/awsdd/report.py b/scripts/awsdd/report.py
@@ -68,12 +68,13 @@ def render(track: str, mode: str) -> None:
                 # escape both brackets so titles like "[CVE-...] thing" don't
                 # collide with markdown link parsing
                 title = (it.get("title") or "(untitled)").replace("[", r"\[").replace("]", r"\]")
-                url = it.get("url", "")
+                # angle-bracket the URL: AWS links sometimes contain `(` / `)`,
+                # which break vanilla Markdown link parsing. Escape any `>` in
+                # the URL itself so it doesn't terminate the bracket pair.
+                url = (it.get("url") or "").replace(">", "%3E")
                 src = it.get("source", "")
                 pub = (it.get("published_at") or "")[:10]
                 score = float(it.get("score", 0.0))
-                # angle-bracket the URL: AWS links sometimes contain `(` / `)`,
-                # which break vanilla Markdown link parsing.
                 lines.append(f"- [{title}](<{url}>) — `{src}` · {pub} · **score {score:.2f}**")
             lines.append("")
 
diff --git a/tests/test_report.py b/tests/test_report.py
@@ -82,3 +82,14 @@ def test_no_scored_file_yields_empty_report(make_track):
     render("iam", "daily")
     body = next((td / "reports" / "daily").glob("*.md")).read_text()
     assert "No items at all" in body
+
+
+def test_gt_in_url_is_escaped(make_track):
+    # Regression: a `>` in the URL would close the `[t](<url>)` angle pair
+    # early and break Markdown link parsing.
+    td = make_track("iam")
+    (td / "data" / "scored.json").write_text(_scored([_item(url="https://example.com/?q=a>b")]))
+    render("iam", "daily")
+    body = next((td / "reports" / "daily").glob("*.md")).read_text()
+    assert "%3E" in body
+    assert "a>b" not in body