0-draft
diff --git a/scripts/awsdd/collect_github.py b/scripts/awsdd/collect_github.py
Lines changed: 4 additions & 1 deletion
diff --git a/scripts/awsdd/collect_rss.py b/scripts/awsdd/collect_rss.py
Lines changed: 23 additions & 5 deletions
diff --git a/scripts/awsdd/report.py b/scripts/awsdd/report.py
Lines changed: 3 additions & 1 deletion
diff --git a/scripts/awsdd/score.py b/scripts/awsdd/score.py
Lines changed: 3 additions & 1 deletion
diff --git a/tests/test_score.py b/tests/test_score.py
Lines changed: 8 additions & 0 deletions
@@ -56,7 +56,10 @@ def release_to_item(rel: dict, repo: str, track: str, now: datetime) -> dict | N
         url=url,
         title=(rel.get("name") or rel.get("tag_name") or "").strip(),
         summary=(rel.get("body") or "")[:500],
-        published_at=(rel.get("published_at") or rel.get("created_at") or now.isoformat()),
+        # epoch fallback so items missing both timestamps sink rather than rise
+        published_at=(
+            rel.get("published_at") or rel.get("created_at") or "1970-01-01T00:00:00+00:00"
+        ),
         fetched_at=now.isoformat(),
         tags=["prerelease"] if rel.get("prerelease") else [],
     ).to_dict()
 
@@ -6,25 +6,42 @@
 import json
 import re
 from datetime import UTC, datetime
+from urllib.error import URLError
+from urllib.request import Request, urlopen
 
 import feedparser
 
 from .config import load_sources, track_dir
 from .schema import Item
 
 USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
+FETCH_TIMEOUT = 30  # seconds
+EPOCH_ISO = "1970-01-01T00:00:00+00:00"
 
 
 def _id(url: str) -> str:
     return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
 
 
 def _iso(time_struct) -> str:
+    # Fall back to the Unix epoch (not "now") so items with missing dates
+    # rank as stale and are not promoted by the freshness signal.
     if not time_struct:
-        return datetime.now(UTC).isoformat()
+        return EPOCH_ISO
     return datetime(*time_struct[:6], tzinfo=UTC).isoformat()
 
 
+def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
+    """Fetch a feed body with an explicit timeout.
+
+    Returns the response decoded as UTF-8 (undecodable bytes are replaced),
+    or None when the feed cannot be retrieved. Every retrieval failure is
+    logged and reported as None rather than raised, so the caller can skip
+    the feed and carry on.
+    """
+    # Function-scope import: only this helper needs it.
+    from http.client import HTTPException
+
+    try:
+        # Request() itself raises ValueError for a malformed or unknown-scheme
+        # URL, so it must be built inside the try as well.
+        req = Request(url, headers={"User-Agent": USER_AGENT})
+        with urlopen(req, timeout=timeout) as r:
+            # NOTE(review): decoding here assumes UTF-8 feeds; handing the raw
+            # bytes to the parser would let it honour the declared encoding —
+            # confirm before changing the return type.
+            return r.read().decode("utf-8", errors="replace")
+    except (URLError, HTTPException, OSError, ValueError) as e:
+        # URLError and timeouts are OSError subclasses; OSError additionally
+        # covers connection resets mid-read, and HTTPException covers
+        # truncated or malformed responses (e.g. IncompleteRead).
+        print(f"[collect_rss] fetch {url}: {e}")
+        return None
+
+
 def _strip_html(text: str) -> str:
     text = re.sub(r"<[^>]+>", " ", text)
     text = html.unescape(text)
@@ -76,11 +93,12 @@ def collect(track: str) -> None:
     items: list[dict] = []
     for feed in feeds:
         sid, url = feed["id"], feed["url"]
-        try:
-            parsed = feedparser.parse(url, request_headers={"User-Agent": USER_AGENT})
-        except Exception as e:
-            print(f"[collect_rss] {sid}: error {e}")
+        # Fetch with an explicit timeout — feedparser.parse(url) has no built-in
+        # timeout and a stuck origin would hang the whole pipeline.
+        body = _fetch(url)
+        if body is None:
             continue
+        parsed = feedparser.parse(body)
         if getattr(parsed, "bozo", 0) and not parsed.entries:
             print(
                 f"[collect_rss] {sid}: feed parse warning ({getattr(parsed, 'bozo_exception', '')})"
 
@@ -12,10 +12,12 @@
 
 def _parse_iso(s: str) -> datetime:
+    """Parse an ISO-8601 timestamp into a timezone-aware datetime.
+
+    Missing or unparseable input falls back to the Unix epoch (not "now") so
+    corrupted dates sink below the freshness window instead of getting
+    promoted to the top. Timestamps without an offset are treated as UTC so
+    the result is always comparable with other aware datetimes.
+    """
     # Python 3.11+ accepts the "Z" suffix natively.
     try:
-        return datetime.fromisoformat(s or "")
+        parsed = datetime.fromisoformat(s or "")
     except (ValueError, TypeError):
-        return datetime.now(UTC)
+        return datetime(1970, 1, 1, tzinfo=UTC)
+    # A naive result cannot be compared with or subtracted from the aware
+    # fallback above (TypeError), so normalise it to UTC.
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=UTC)
+    return parsed
 
 
 def _filename(mode: str, now: datetime) -> str:
 
@@ -12,10 +12,12 @@
 
 def _parse_iso(s: str) -> datetime:
+    """Parse an ISO-8601 timestamp into a timezone-aware datetime.
+
+    Missing or unparseable input falls back to the Unix epoch (not "now") so
+    corrupted dates yield ~0 freshness instead of an artificial max.
+    Timestamps without an offset are treated as UTC so the result can always
+    be subtracted from an aware "now".
+    """
     # Python 3.11+ accepts the "Z" suffix natively.
     try:
-        return datetime.fromisoformat(s or "")
+        parsed = datetime.fromisoformat(s or "")
     except (ValueError, TypeError):
-        return datetime.now(UTC)
+        return datetime(1970, 1, 1, tzinfo=UTC)
+    # A naive result cannot be subtracted from an aware datetime
+    # (TypeError), so normalise it to UTC.
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=UTC)
+    return parsed
 
 
 def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
 
@@ -78,3 +78,11 @@ def test_unknown_severity_zero():
     item = _item(severity="info")
     b = score_item(item, SOURCES_IAM, NOW)
     assert b["severity"] == 0.0
+
+
+def test_corrupted_date_falls_back_to_epoch_not_now():
+    # Regression: previously fell back to datetime.now(UTC), which made
+    # garbage dates score as maximally fresh and floated bad data to the top.
+    item = _item(published_at="this is not a date")
+    b = score_item(item, SOURCES_IAM, NOW)
+    assert b["freshness"] < 0.01