0-draft
diff --git a/scripts/awsdd/collect_github.py b/scripts/awsdd/collect_github.py
Lines changed: 50 additions & 15 deletions
diff --git a/scripts/awsdd/collect_rss.py b/scripts/awsdd/collect_rss.py
Lines changed: 48 additions & 11 deletions
diff --git a/scripts/awsdd/score.py b/scripts/awsdd/score.py
Lines changed: 9 additions & 2 deletions
diff --git a/tests/test_collect_github.py b/tests/test_collect_github.py
Lines changed: 19 additions & 1 deletion
diff --git a/tests/test_collect_rss.py b/tests/test_collect_rss.py
Lines changed: 19 additions & 0 deletions
diff --git a/tests/test_score.py b/tests/test_score.py
Lines changed: 22 additions & 0 deletions
@@ -4,6 +4,7 @@
 import hashlib
 import json
 import os
+import re
 from datetime import UTC, datetime
 from urllib.error import HTTPError, URLError
 from urllib.request import Request, urlopen
@@ -13,34 +14,68 @@
 
 API = "https://api.github.com"
 USER_AGENT = "aws-deepdive/0.1 (+https://github.com/0-draft/aws-deepdive)"
+FETCH_TIMEOUT = 30
+MAX_RESPONSE_BYTES = 10 * 1024 * 1024  # 10 MiB safety cap per page
+MAX_PAGES = 5  # follow Link.rel="next" up to this many pages per repo
 
 
 def _id(url: str) -> str:
     return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
 
 
-def _get(path: str) -> list[dict]:
-    headers = {
+def _headers() -> dict[str, str]:
+    """Return the HTTP headers sent with every GitHub API request.
+
+    Always includes the Accept, X-GitHub-Api-Version and User-Agent headers;
+    adds an ``Authorization: Bearer`` header only when the ``GITHUB_TOKEN``
+    environment variable is set to a non-empty value.
+    """
+    h = {
         "Accept": "application/vnd.github+json",
         "X-GitHub-Api-Version": "2022-11-28",
         "User-Agent": USER_AGENT,
     }
     token = os.environ.get("GITHUB_TOKEN")
     if token:
-        headers["Authorization"] = f"Bearer {token}"
-    req = Request(f"{API}{path}", headers=headers)
+        h["Authorization"] = f"Bearer {token}"
+    return h
+
+
+# Captures the target URL of the rel="next" entry in an HTTP Link header.
+_NEXT_LINK_RE = re.compile(r'<([^>]+)>;\s*rel="next"')
+
+
+def _next_url(link_header: str | None) -> str | None:
+    """Return the URL of the ``rel="next"`` entry in *link_header*.
+
+    Gives ``None`` when the header is missing, empty, or carries no
+    ``rel="next"`` entry.
+    """
+    found = _NEXT_LINK_RE.search(link_header) if link_header else None
+    if found is None:
+        return None
+    return found.group(1)
+
+
+def _get_page(url: str) -> tuple[list[dict], str | None]:
+    """Fetch one page of a GitHub list endpoint.
+
+    Returns ``(items, next_url)`` where ``next_url`` is the ``rel="next"``
+    target from the response's Link header, or ``None`` when there is none.
+    An HTTP error, a URL/timeout error or an undecodable JSON body is printed
+    and reported as ``([], None)``. A body that decodes to something other
+    than a list yields an empty item list, but the next link is still
+    returned.
+    """
+    req = Request(url, headers=_headers())
     try:
-        with urlopen(req, timeout=30) as r:
-            res = json.loads(r.read().decode("utf-8"))
-        # GitHub returns a JSON object (not a list) on error envelopes
-        # (rate-limit, 404, etc.); guard so callers can iterate safely.
-        return res if isinstance(res, list) else []
+        with urlopen(req, timeout=FETCH_TIMEOUT) as r:
+            # Read at most MAX_RESPONSE_BYTES; a larger body is cut short and
+            # will normally fail JSON decoding below, which is handled as an
+            # error rather than a crash.
+            body = r.read(MAX_RESPONSE_BYTES).decode("utf-8", errors="replace")
+            link = r.headers.get("Link")
+        res = json.loads(body)
     except HTTPError as e:
-        print(f"[collect_github] {path}: HTTP {e.code}")
-        return []
+        print(f"[collect_github] {url}: HTTP {e.code}")
+        return [], None
     except (URLError, TimeoutError, json.JSONDecodeError) as e:
-        print(f"[collect_github] {path}: error {e}")
-        return []
+        print(f"[collect_github] {url}: error {e}")
+        return [], None
+    # GitHub returns a JSON object (not a list) on error envelopes (rate-limit etc.);
+    # guard so callers can iterate safely.
+    items = res if isinstance(res, list) else []
+    return items, _next_url(link)
+
+
+def _get_all(path: str) -> list[dict]:
+    """Collect the items for *path*, following ``rel="next"`` links.
+
+    Fetches at most MAX_PAGES pages and stops early as soon as a page
+    reports no next link.
+    """
+    next_page: str | None = f"{API}{path}"
+    collected: list[dict] = []
+    remaining = MAX_PAGES
+    while next_page and remaining > 0:
+        page_items, next_page = _get_page(next_page)
+        collected += page_items
+        remaining -= 1
+    return collected
 
 
 def release_to_item(rel: dict, repo: str, track: str, now: datetime) -> dict | None:
@@ -72,8 +107,8 @@ def collect(track: str) -> None:
     items: list[dict] = []
     for entry in repos:
         repo = entry["repo"]
-        per_page = entry.get("per_page", 20)
-        releases = _get(f"/repos/{repo}/releases?per_page={per_page}")
+        per_page = entry.get("per_page", 50)
+        releases = _get_all(f"/repos/{repo}/releases?per_page={per_page}")
         for rel in releases:
             item = release_to_item(rel, repo, track, now)
             if item is not None:
 
@@ -6,6 +6,7 @@
 import json
 import re
 from datetime import UTC, datetime
+from html.parser import HTMLParser
 from urllib.error import URLError
 from urllib.request import Request, urlopen
 
@@ -43,14 +44,47 @@ def _fetch(url: str, timeout: int = FETCH_TIMEOUT) -> str | None:
         return None
 
 
+class _TextExtractor(HTMLParser):
+    """Pull text content out of HTML, dropping <script>/<style> bodies entirely.
+
+    Uses stdlib html.parser so it correctly handles entities and `<` / `>`
+    inside text content (e.g. "1 < 2 and 4 > 3"), which a regex strip would
+    chomp.
+
+    Every tag boundary outside dropped content contributes one space to
+    ``parts``, so text from adjacent elements ("<p>one</p><p>two</p>",
+    "a<br>b") is not glued together once the markup is gone. Callers are
+    expected to collapse runs of whitespace afterwards.
+    """
+
+    _DROP_CONTENT = frozenset({"script", "style"})
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        # Text fragments and tag-boundary spaces, in document order.
+        self.parts: list[str] = []
+        # Number of currently open <script>/<style> elements.
+        self._drop_depth = 0
+
+    def _mark_boundary(self) -> None:
+        """Record a word break where a tag stood, unless inside dropped content."""
+        if self._drop_depth == 0:
+            self.parts.append(" ")
+
+    def handle_starttag(self, tag: str, attrs: list) -> None:
+        """Note the tag boundary, then start dropping if this opens script/style."""
+        self._mark_boundary()
+        if tag in self._DROP_CONTENT:
+            self._drop_depth += 1
+
+    def handle_endtag(self, tag: str) -> None:
+        """Stop dropping if this closes script/style, then note the boundary."""
+        if tag in self._DROP_CONTENT and self._drop_depth > 0:
+            self._drop_depth -= 1
+        self._mark_boundary()
+
+    def handle_data(self, data: str) -> None:
+        """Keep text that is not inside a dropped element."""
+        if self._drop_depth == 0:
+            self.parts.append(data)
+
+
 def _strip_html(text: str) -> str:
-    # Unescape FIRST so entity-encoded tags like `&lt;script&gt;` are turned
-    # into their bracketed form before the strip pass; otherwise they bypass
-    # the regex and leak through into the report as raw HTML.
+    """Return *text* reduced to plain text with whitespace collapsed.
+
+    Entities are unescaped, the result is run through ``_TextExtractor`` to
+    discard markup, and runs of whitespace become single spaces with the
+    ends trimmed.
+    """
+    # Unescape FIRST so entity-encoded tags like `&lt;script&gt;` become
+    # real tags the parser can recognise (otherwise they'd survive as text).
     text = html.unescape(text)
-    text = re.sub(r"<[^>]*>", " ", text)
-    text = re.sub(r"\s+", " ", text)
-    return text.strip()
+    parser = _TextExtractor()
+    try:
+        parser.feed(text)
+        parser.close()
+    except Exception:
+        # html.parser is fairly tolerant; if it bails on truly malformed input,
+        # fall back to the raw text rather than dropping the whole summary.
+        # Note that this path returns the unescaped input with its markup
+        # still in place; only whitespace is normalised.
+        return re.sub(r"\s+", " ", text).strip()
+    return re.sub(r"\s+", " ", "".join(parser.parts)).strip()
 
 
 def _summary(entry) -> str:
@@ -62,12 +96,15 @@ def _title(entry) -> str:
     return html.unescape((entry.get("title") or "").strip())
 
 
+_SEVERITY_RE = re.compile(r"\b(critical|high|medium|low)\b", re.IGNORECASE)
+
+
 def _severity(entry) -> str | None:
-    title = (entry.get("title") or "").lower()
-    for s in ("critical", "high", "medium", "low"):
-        if s in title:
-            return s
-    return None
+    """Return the most severe level named in the entry title, or ``None``.
+
+    Levels are matched case-insensitively as whole words and returned in
+    lower case. A missing or empty title yields ``None``.
+    """
+    # Word-boundary match: avoids "Slow performance" hitting "low" and
+    # "Highlighted" hitting "high".
+    title = entry.get("title") or ""
+    found = {word.lower() for word in _SEVERITY_RE.findall(title)}
+    # Scan from most to least severe, as the substring version did, so a
+    # title such as "Low-risk fix for a critical bug" reports "critical"
+    # rather than whichever level happens to appear first.
+    for level in ("critical", "high", "medium", "low"):
+        if level in found:
+            return level
+    return None
 
 
 def entry_to_item(entry, sid: str, track: str, now: datetime) -> dict | None:
 
@@ -3,6 +3,7 @@
 import argparse
 import json
 import math
+import re
 from datetime import UTC, datetime
 
 from ._dates import parse_iso
@@ -11,6 +12,12 @@
 SEVERITY_WEIGHT = {"critical": 3.0, "high": 2.0, "medium": 1.0, "low": 0.5}
 
 
+def _keyword_hits(keywords: list[str], text: str) -> int:
+    """Count how many of *keywords* occur in *text* as whole tokens.
+
+    Substring matching would let `iam` hit `diagram` or `sts` hit `tests`,
+    which dilutes the topic signal. Each keyword counts at most once, no
+    matter how often it appears. Empty keywords are ignored.
+    """
+    # Lookarounds instead of \b: \b needs a word character on one side, so
+    # it can never match keywords that start or end with punctuation, such
+    # as "c++" or ".net". For plain alphanumeric keywords the two forms are
+    # equivalent.
+    return sum(
+        1
+        for k in keywords
+        if k and re.search(rf"(?<!\w){re.escape(k)}(?!\w)", text)
+    )
+
+
 def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
     pub = parse_iso(item.get("published_at", ""))
     days = max(0.0, (now - pub).total_seconds() / 86400.0)
@@ -20,8 +27,8 @@ def score_item(item: dict, sources: dict, now: datetime) -> dict[str, float]:
     kws = sources.get("keywords") or {}
     primary = [k.lower() for k in (kws.get("primary") or [])]
     secondary = [k.lower() for k in (kws.get("secondary") or [])]
-    p_hits = sum(1 for k in primary if k in text)
-    s_hits = sum(1 for k in secondary if k in text)
+    p_hits = _keyword_hits(primary, text)
+    s_hits = _keyword_hits(secondary, text)
     keyword = p_hits * 2.0 + s_hits * 0.5
 
     weights = sources.get("source_weights") or {}
 
@@ -2,7 +2,7 @@
 
 import json
 
-from awsdd.collect_github import release_to_item
+from awsdd.collect_github import _next_url, release_to_item
 
 from .conftest import FIXTURES, NOW
 
@@ -42,3 +42,21 @@ def test_falls_back_to_tag_name_when_no_name():
     }
     item = release_to_item(rel, "x/y", "releases", NOW)
     assert item["title"] == "v1"
+
+
+def test_next_url_parses_link_header():
+    """The rel="next" target is picked out when other relations share the header."""
+    next_page = "https://api.github.com/repos/x/y/releases?page=2"
+    last_page = "https://api.github.com/repos/x/y/releases?page=5"
+    header = f'<{next_page}>; rel="next", <{last_page}>; rel="last"'
+    assert _next_url(header) == next_page
+
+
+def test_next_url_returns_none_when_no_next():
+    """A header that only carries rel="last" has no next page."""
+    last_only = '<https://api.github.com/repos/x/y/releases?page=5>; rel="last"'
+    assert _next_url(last_only) is None
+
+
+def test_next_url_handles_missing_header():
+    """An absent (None) or empty Link header yields no next URL."""
+    for header in (None, ""):
+        assert _next_url(header) is None
@@ -63,3 +63,22 @@ def test_summary_strips_entity_encoded_tags():
     assert "<script>" not in out
     assert "&lt;" not in out
     assert "hi" in out
+
+
+def test_summary_drops_script_body():
+    """Text inside <script> must not leak into the summary."""
+    # Regression: with the html.parser switch, content inside <script>/<style>
+    # tags is dropped entirely instead of leaking as text.
+    payload = "before<script>alert(1)</script>after"
+
+    class _Entry:
+        """Minimal feed-entry stand-in exposing only ``get``."""
+
+        def get(self, key, default=None):
+            return payload if key == "summary" else default
+
+    out = _summary(_Entry())
+    assert "alert" not in out
+    assert "before" in out
+    assert "after" in out
+
+
+def test_summary_preserves_lt_gt_in_text():
+    """Stray angle brackets in prose must not swallow the text between them."""
+    # Regression: the old regex strip chomped through "1 < 2 and 4 > 3"
+    # because `<[^>]*>` matched across stray angle brackets.
+    payload = "if 1 < 2 and 4 > 3 then ok"
+
+    class _Entry:
+        """Minimal feed-entry stand-in exposing only ``get``."""
+
+        def get(self, key, default=None):
+            return payload if key == "summary" else default
+
+    out = _summary(_Entry())
+    assert all(token in out for token in ("1", "2", "3", "4", "ok"))
@@ -86,3 +86,25 @@ def test_corrupted_date_falls_back_to_epoch_not_now():
     item = _item(published_at="this is not a date")
     b = score_item(item, SOURCES_IAM, NOW)
     assert b["freshness"] < 0.01
+
+
+def test_keyword_match_is_word_bounded():
+    """Keywords that only appear inside longer words must not count as hits."""
+    # Regression: substring matching let `iam` hit `diagram` and `sts` hit
+    # `hosts`. With word boundaries, neither should match.
+    embedded_only = _item(
+        title="updated diagram and test results for hosts",
+        source="rss:aws-whats-new",
+    )
+    breakdown = score_item(embedded_only, SOURCES_IAM, NOW)
+    assert (breakdown["keyword"], breakdown["keyword_signal"]) == (0.0, 0.0)
+
+
+def test_keyword_match_hits_exact_words():
+    """Keywords that appear as whole words are each counted."""
+    exact_words = _item(
+        title="iam supports sts session tags",
+        source="rss:aws-whats-new",
+    )
+    # both "iam" and "sts" are secondary keywords: 2 hits * 0.5 = 1.0
+    expected = 2 * 0.5
+    assert score_item(exact_words, SOURCES_IAM, NOW)["keyword"] == expected