fix: address gemini-code-assist 5th review on PR #1

kanywst · kanywst · commit 52a8e41bbb13 · 2026-05-17T22:57:15.000+09:00
Both findings legitimate, both applied:

- collect_rss._fetch caps the response at 10 MiB (MAX_FEED_BYTES) via
  r.read(MAX_FEED_BYTES). Without a bound, a hostile or runaway origin
  could exhaust the worker's memory. AWS feeds are sub-megabyte; the
  cap is just a safety net. [MEDIUM]
- collect_rss._strip_html now unescapes BEFORE stripping tags. The
  old order let entity-encoded markup (`&lt;script&gt;...`) bypass
  the strip pass and resurface as raw HTML after html.unescape ran on
  the regex output. New regression test
  `test_summary_strips_entity_encoded_tags` locks in the corrected
  order. DOMPurify on the web rendering side is still the last line of
  defense, but cleaning at the source is the right layer. [MEDIUM]
diff --git a/scripts/awsdd/collect_rss.py b/scripts/awsdd/collect_rss.py
@@ -16,6 +16,7 @@
 
 USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
 FETCH_TIMEOUT = 30  # seconds
+MAX_FEED_BYTES = 10 * 1024 * 1024  # 10 MiB cap to bound memory on hostile / runaway feeds
 EPOCH_ISO = "1970-01-01T00:00:00+00:00"
 
 
@@ -36,15 +37,18 @@ def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
     req = Request(url, headers={"User-Agent": USER_AGENT})
     try:
         with urlopen(req, timeout=timeout) as r:
-            return r.read().decode("utf-8", errors="replace")
+            return r.read(MAX_FEED_BYTES).decode("utf-8", errors="replace")
     except (URLError, TimeoutError) as e:
         print(f"[collect_rss] fetch {url}: {e}")
         return None
 
 
 def _strip_html(text: str) -> str:
-    text = re.sub(r"<[^>]+>", " ", text)
+    # Unescape FIRST so entity-encoded tags like `&lt;script&gt;` are turned
+    # into their bracketed form before the strip pass; otherwise they bypass
+    # the regex and leak through into the report as raw HTML.
     text = html.unescape(text)
+    text = re.sub(r"<[^>]*>", " ", text)
     text = re.sub(r"\s+", " ", text)
     return text.strip()
 
diff --git a/tests/test_collect_rss.py b/tests/test_collect_rss.py
@@ -52,3 +52,14 @@ def test_summary_helper_strips_html():
         "E", (), {"get": lambda self, k, d=None: "<b>hi</b> there" if k == "summary" else d}
     )()
     assert _summary(fake) == "hi there"
+
+
+def test_summary_strips_entity_encoded_tags():
+    # Regression: some feeds double-encode tags as `&lt;script&gt;...`.
+    # The old strip-then-unescape order let those leak through as raw HTML.
+    payload = "&lt;script&gt;alert(1)&lt;/script&gt;hi"
+    fake = type("E", (), {"get": lambda self, k, d=None: payload if k == "summary" else d})()
+    out = _summary(fake)
+    assert "<script>" not in out
+    assert "&lt;" not in out
+    assert "hi" in out