Skip to content
This repository was archived by the owner on Jun 23, 2026. It is now read-only.

Commit a14a5b7

Browse files
committed
fix: address gemini-code-assist re-review on PR #1
Date-handling robustness:
- collect_rss._iso, collect_github.release_to_item, report._parse_iso, score._parse_iso: fall back to the Unix epoch (1970-01-01) instead of datetime.now(UTC) when a date is missing or corrupted. The old behaviour gave malformed items a maximum freshness score and pushed them to the top of every report; the new behaviour leaves them at ~0 freshness so they sink. [MEDIUM x4]

Network robustness:
- collect_rss.collect now fetches via urlopen with a 30s timeout and passes the body to feedparser.parse, instead of feedparser.parse(url) which has no built-in timeout. A single stuck origin no longer hangs the whole pipeline. [MEDIUM]

XSS hardening:
- web/src/lib/markdown.ts pipes marked's output through isomorphic-dompurify before returning. Report content is rendered via `set:html`, and titles / summaries originate in untrusted RSS and GitHub feeds, so unsanitized HTML could land in the page (verified by smoke: a fixture with <script>, onerror, and javascript: URLs is fully stripped). [SECURITY-MEDIUM]

Tests:
- tests/test_score.py: regression test that a garbage published_at scores near-zero freshness. Locks in the epoch fallback.
1 parent 405b8b7 commit a14a5b7

8 files changed

Lines changed: 617 additions & 9 deletions

File tree

scripts/awsdd/collect_github.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,10 @@ def release_to_item(rel: dict, repo: str, track: str, now: datetime) -> dict | N
5656
url=url,
5757
title=(rel.get("name") or rel.get("tag_name") or "").strip(),
5858
summary=(rel.get("body") or "")[:500],
59-
published_at=(rel.get("published_at") or rel.get("created_at") or now.isoformat()),
59+
# epoch fallback so items missing both timestamps sink rather than rise
60+
published_at=(
61+
rel.get("published_at") or rel.get("created_at") or "1970-01-01T00:00:00+00:00"
62+
),
6063
fetched_at=now.isoformat(),
6164
tags=["prerelease"] if rel.get("prerelease") else [],
6265
).to_dict()

scripts/awsdd/collect_rss.py

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,25 +6,42 @@
66
import json
77
import re
88
from datetime import UTC, datetime
9+
from urllib.error import URLError
10+
from urllib.request import Request, urlopen
911

1012
import feedparser
1113

1214
from .config import load_sources, track_dir
1315
from .schema import Item
1416

1517
USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
18+
FETCH_TIMEOUT = 30 # seconds
19+
EPOCH_ISO = "1970-01-01T00:00:00+00:00"
1620

1721

1822
def _id(url: str) -> str:
1923
return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
2024

2125

2226
def _iso(time_struct) -> str:
27+
# Fall back to the Unix epoch (not "now") so items with missing dates
28+
# rank as stale and are not promoted by the freshness signal.
2329
if not time_struct:
24-
return datetime.now(UTC).isoformat()
30+
return EPOCH_ISO
2531
return datetime(*time_struct[:6], tzinfo=UTC).isoformat()
2632

2733

34+
def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
    """Fetch a feed body with an explicit timeout; None on any fetch failure.

    Args:
        url: Address of the feed to download.
        timeout: Socket timeout in seconds handed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or ``None`` when the request could not be built or completed. The
        failure is reported on stdout and never raised, so one broken feed
        cannot abort the caller's loop.
    """
    # Function-scope import keeps this block self-contained.
    from http.client import HTTPException

    try:
        # Request() itself raises ValueError for a malformed URL, so it has
        # to live inside the try as well.
        req = Request(url, headers={"User-Agent": USER_AGENT})
        with urlopen(req, timeout=timeout) as r:
            # NOTE(review): decoding is forced to UTF-8; feeds served in
            # another charset will contain replacement characters.
            return r.read().decode("utf-8", errors="replace")
    except (OSError, HTTPException, ValueError) as e:
        # OSError covers URLError, TimeoutError, connection resets and TLS
        # errors; HTTPException covers truncated or garbled responses
        # raised while reading the body.
        print(f"[collect_rss] fetch {url}: {e}")
        return None
43+
44+
2845
def _strip_html(text: str) -> str:
2946
text = re.sub(r"<[^>]+>", " ", text)
3047
text = html.unescape(text)
@@ -76,11 +93,12 @@ def collect(track: str) -> None:
7693
items: list[dict] = []
7794
for feed in feeds:
7895
sid, url = feed["id"], feed["url"]
79-
try:
80-
parsed = feedparser.parse(url, request_headers={"User-Agent": USER_AGENT})
81-
except Exception as e:
82-
print(f"[collect_rss] {sid}: error {e}")
96+
# Fetch with an explicit timeout — feedparser.parse(url) has no built-in
97+
# timeout and a stuck origin would hang the whole pipeline.
98+
body = _fetch(url)
99+
if body is None:
83100
continue
101+
parsed = feedparser.parse(body)
84102
if getattr(parsed, "bozo", 0) and not parsed.entries:
85103
print(
86104
f"[collect_rss] {sid}: feed parse warning ({getattr(parsed, 'bozo_exception', '')})"

scripts/awsdd/report.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,12 @@
1212

1313
def _parse_iso(s: str) -> datetime:
1414
# Python 3.11+ accepts the "Z" suffix natively.
15+
# Fall back to the Unix epoch (not "now") so corrupted dates sink below
16+
# the freshness window instead of getting promoted to the top.
1517
try:
1618
return datetime.fromisoformat(s or "")
1719
except (ValueError, TypeError):
18-
return datetime.now(UTC)
20+
return datetime(1970, 1, 1, tzinfo=UTC)
1921

2022

2123
def _filename(mode: str, now: datetime) -> str:

scripts/awsdd/score.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,12 @@
1212

1313
def _parse_iso(s: str) -> datetime:
1414
# Python 3.11+ accepts the "Z" suffix natively.
15+
# Fall back to the Unix epoch (not "now") so corrupted dates yield ~0
16+
# freshness instead of an artificial max.
1517
try:
1618
return datetime.fromisoformat(s or "")
1719
except (ValueError, TypeError):
18-
return datetime.now(UTC)
20+
return datetime(1970, 1, 1, tzinfo=UTC)
1921

2022

2123
def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:

tests/test_score.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,3 +78,11 @@ def test_unknown_severity_zero():
7878
item = _item(severity="info")
7979
b = score_item(item, SOURCES_IAM, NOW)
8080
assert b["severity"] == 0.0
81+
82+
83+
def test_corrupted_date_falls_back_to_epoch_not_now():
    """An unparseable ``published_at`` must score near-zero freshness.

    Regression guard: the parser used to fall back to ``datetime.now(UTC)``,
    which made garbage dates score as maximally fresh and floated bad data
    to the top.
    """
    garbage_dated = _item(published_at="this is not a date")
    breakdown = score_item(garbage_dated, SOURCES_IAM, NOW)
    freshness = breakdown["freshness"]
    assert freshness < 0.01

0 commit comments

Comments
 (0)