feat: stream-parse feed XML and expand OPML subscriptions

zer0yu · zer0yu · commit 0ba5ec6cb656 · 2026-03-07T02:47:27.000+01:00
- Refactor fetch_feed_metadata to use XMLPullParser for incremental
  streaming, avoiding loading the full payload of large feeds (>2 MB)
- Extract _build_feed_metadata helper to eliminate duplicated logic
- Add FakeResponse mock and incremental-read test in test suite
- Add 2 security feeds (Google Bug Hunters, Youssef Sammouda)
- Add ~40 AI/tech feeds (Lil'Log, Simon Willison, HN, Wired, etc.)
diff --git a/scripts/add_feed_to_tiny.py b/scripts/add_feed_to_tiny.py
@@ -12,14 +12,15 @@
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple
 
 XML_DECLARATION = b'<?xml version="1.0" encoding="UTF-8"?>\n'
 DEFAULT_USER_AGENT = (
     "Mozilla/5.0 (compatible; CyberSecurityRSSBot/1.0; "
     "+https://github.com/zer0yu/CyberSecurityRSS)"
 )
 MAX_FEED_BYTES = 2 * 1024 * 1024
+READ_CHUNK_SIZE = 64 * 1024
 
 
 class OpmlStructureError(ValueError):
@@ -150,6 +151,19 @@ def _extract_atom_html_url(root: ET.Element) -> str:
     return fallback
 
 
+def _build_feed_metadata(root_name: str, title: str, html_url: str, feed_url: str) -> FeedMetadata:
+    if root_name not in {"rss", "feed", "rdf"}:
+        raise ValueError(f"Unsupported feed root tag: {root_name}")
+
+    safe_title = title or urllib.parse.urlparse(feed_url).netloc or feed_url
+    safe_html_url = urllib.parse.urljoin(feed_url, html_url) if html_url else feed_url
+    return FeedMetadata(
+        title=safe_title,
+        html_url=normalize_url(safe_html_url),
+        xml_url=normalize_url(feed_url),
+    )
+
+
 def parse_feed_metadata(xml_bytes: bytes, feed_url: str) -> FeedMetadata:
     try:
         root = ET.fromstring(xml_bytes)
@@ -173,16 +187,98 @@ def parse_feed_metadata(xml_bytes: bytes, feed_url: str) -> FeedMetadata:
         if channel is not None:
             title = _find_text_child(channel, "title")
             html_url = _find_text_child(channel, "link")
-    else:
-        raise ValueError(f"Unsupported feed root tag: {root_name}")
+    return _build_feed_metadata(root_name, title, html_url, feed_url)
 
-    safe_title = title or urllib.parse.urlparse(feed_url).netloc or feed_url
-    safe_html_url = urllib.parse.urljoin(feed_url, html_url) if html_url else feed_url
-    return FeedMetadata(
-        title=safe_title,
-        html_url=normalize_url(safe_html_url),
-        xml_url=normalize_url(feed_url),
-    )
+
+def parse_feed_metadata_stream(
+    stream: BinaryIO,
+    feed_url: str,
+    max_bytes: int = MAX_FEED_BYTES,
+) -> FeedMetadata:
+    parser = ET.XMLPullParser(events=("start", "end"))
+    path: List[str] = []
+    root_name = ""
+    title = ""
+    html_url = ""
+    atom_fallback_html_url = ""
+    bytes_read = 0
+
+    while True:
+        chunk = stream.read(READ_CHUNK_SIZE)
+        if not chunk:
+            break
+        bytes_read += len(chunk)
+        if bytes_read > max_bytes:
+            raise ValueError(f"Feed payload is too large (> {max_bytes} bytes)")
+
+        try:
+            parser.feed(chunk)
+        except ET.ParseError as exc:
+            raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc
+
+        for event, elem in parser.read_events():
+            local_name = strip_namespace(elem.tag)
+
+            if event == "start":
+                path.append(local_name)
+
+                if len(path) == 1:
+                    root_name = local_name
+                    if root_name not in {"rss", "feed", "rdf"}:
+                        raise ValueError(f"Unsupported feed root tag: {root_name}")
+                elif root_name == "feed" and len(path) == 2:
+                    if local_name == "entry":
+                        return _build_feed_metadata(
+                            root_name,
+                            title,
+                            html_url or atom_fallback_html_url,
+                            feed_url,
+                        )
+                    if local_name == "link":
+                        href = normalize_url(elem.attrib.get("href", ""))
+                        if href:
+                            rel = (elem.attrib.get("rel", "alternate") or "alternate").strip().lower()
+                            if rel in {"alternate", ""}:
+                                html_url = href
+                            elif not atom_fallback_html_url:
+                                atom_fallback_html_url = href
+                            if title and html_url:
+                                return _build_feed_metadata(root_name, title, html_url, feed_url)
+                continue
+
+            if root_name in {"rss", "rdf"}:
+                if len(path) == 3 and path[1] == "channel":
+                    if local_name == "title" and not title:
+                        title = (elem.text or "").strip()
+                    elif local_name == "link" and not html_url:
+                        html_url = (elem.text or "").strip()
+                    if title and html_url:
+                        return _build_feed_metadata(root_name, title, html_url, feed_url)
+                elif path == [root_name, "channel"]:
+                    return _build_feed_metadata(root_name, title, html_url, feed_url)
+            elif root_name == "feed":
+                if path == ["feed", "title"] and not title:
+                    title = (elem.text or "").strip()
+                    if title and html_url:
+                        return _build_feed_metadata(root_name, title, html_url, feed_url)
+                elif path == ["feed"]:
+                    return _build_feed_metadata(
+                        root_name,
+                        title,
+                        html_url or atom_fallback_html_url,
+                        feed_url,
+                    )
+
+            if path:
+                path.pop()
+            elem.clear()
+
+    try:
+        parser.close()
+    except ET.ParseError as exc:
+        raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc
+
+    return _build_feed_metadata(root_name, title, html_url or atom_fallback_html_url, feed_url)
 
 
 def fetch_feed_metadata(feed_url: str, timeout: float) -> FeedMetadata:
@@ -197,16 +293,12 @@ def fetch_feed_metadata(feed_url: str, timeout: float) -> FeedMetadata:
             status_code = int(status) if status is not None else 0
             if status_code >= 400:
                 raise ValueError(f"Feed request failed with HTTP {status_code}")
-            payload = response.read(MAX_FEED_BYTES + 1)
+            return parse_feed_metadata_stream(response, feed_url=feed_url)
     except urllib.error.HTTPError as exc:
         raise ValueError(f"Feed request failed with HTTP {int(exc.code)}") from exc
     except urllib.error.URLError as exc:
         raise ValueError(f"Feed request failed: {exc.reason}") from exc
 
-    if len(payload) > MAX_FEED_BYTES:
-        raise ValueError(f"Feed payload is too large (> {MAX_FEED_BYTES} bytes)")
-    return parse_feed_metadata(payload, feed_url=feed_url)
-
 
 def find_existing_category_for_url(body: ET.Element, xml_url: str) -> Optional[str]:
     wanted = normalize_url(xml_url)
diff --git a/tests/test_add_feed_to_tiny.py b/tests/test_add_feed_to_tiny.py
@@ -1,9 +1,12 @@
 import unittest
 import xml.etree.ElementTree as ET
+from unittest import mock
 
 from scripts.add_feed_to_tiny import (
     FeedMetadata,
+    MAX_FEED_BYTES,
     add_feed_to_tree,
+    fetch_feed_metadata,
     parse_feed_metadata,
 )
 
@@ -30,6 +33,30 @@ def rss_urls_in_category(tree: ET.ElementTree, category_name: str):
     return []
 
 
+class FakeResponse:
+    def __init__(self, payload: bytes, status: int = 200):
+        self._payload = payload
+        self._offset = 0
+        self.status = status
+
+    def read(self, size: int = -1) -> bytes:
+        if size is None or size < 0:
+            size = len(self._payload) - self._offset
+        start = self._offset
+        end = min(len(self._payload), self._offset + size)
+        self._offset = end
+        return self._payload[start:end]
+
+    def getcode(self) -> int:
+        return self.status
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        return False
+
+
 class AddFeedToTinyTests(unittest.TestCase):
     def test_parse_rss_metadata(self):
         xml_bytes = b"""<?xml version='1.0' encoding='UTF-8'?>
@@ -117,6 +144,32 @@ def test_add_feed_skips_duplicate_xml_url(self):
             ["https://x.example.com/feed.xml"],
         )
 
+    def test_fetch_feed_metadata_reads_large_feed_incrementally(self):
+        item = (
+            b"<item><title>Entry</title><link>https://example.com/post</link>"
+            b"<description>" + (b"x" * 16384) + b"</description></item>"
+        )
+        payload = (
+            b"<?xml version='1.0' encoding='UTF-8'?>\n"
+            b"<rss version='2.0'><channel>"
+            b"<title>Large Feed</title>"
+            b"<link>https://example.com</link>"
+            b"<description>Example feed</description>"
+            + (item * 160)
+            + b"</channel></rss>"
+        )
+        self.assertGreater(len(payload), MAX_FEED_BYTES)
+
+        with mock.patch(
+            "scripts.add_feed_to_tiny.urllib.request.urlopen",
+            return_value=FakeResponse(payload),
+        ):
+            meta = fetch_feed_metadata("https://example.com/feed.xml", timeout=5)
+
+        self.assertEqual(meta.title, "Large Feed")
+        self.assertEqual(meta.html_url, "https://example.com")
+        self.assertEqual(meta.xml_url, "https://example.com/feed.xml")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tiny.opml b/tiny.opml
@@ -152,6 +152,8 @@
 <outline type="rss" xmlUrl="http://wjlshare.com/feed" htmlUrl="http://wjlshare.com" title="天下大木头" text="天下大木头" />
 <outline xmlUrl="https://rce.moe/atom.xml" type="rss" title="白帽酱の博客" text="白帽酱の博客" htmlUrl="https://rce.moe/atom.xml" />
 <outline type="rss" xmlUrl="https://blog.slonser.info/index.xml" htmlUrl="https://blog.slonser.info" text="Slonser Notes" title="Slonser Notes" />
+<outline type="rss" title="Security Engineering Blog" text="Security Engineering Blog" htmlUrl="https://bughunters.google.com/blog" xmlUrl="https://bughunters.google.com/feed/en" />
+<outline type="rss" title="Youssef Sammouda (sam0) personal blog" text="Youssef Sammouda (sam0) personal blog" htmlUrl="https://ysamm.com/" xmlUrl="https://ysamm.com/feed.xml" />
 </outline>
 <outline title="AI" text="AI">
 <outline text="krebsonsecurity.com" title="krebsonsecurity.com" htmlUrl="https://krebsonsecurity.com" type="rss" xmlUrl="https://krebsonsecurity.com/feed/" />
@@ -201,6 +203,46 @@
 <outline type="rss" title="Engineering" text="Engineering" htmlUrl="https://www.anthropic.com/engineering/" xmlUrl="https://rss.app/feeds/E5pQ7SXRZQVEiFuh.xml" />
 <outline type="rss" title="博客 · Cursor" text="博客 · Cursor" htmlUrl="https://cursor.com/cn/blog/" xmlUrl="https://rss.app/feeds/gURU8i2ldFzpwRCr.xml" />
 <outline type="rss" title="OpenAI Newsroom | Engineering" text="OpenAI Newsroom | Engineering" htmlUrl="https://openai.com/news/engineering/" xmlUrl="https://rss.app/feeds/75Fpp0Mbx9x4TbDs.xml" />
+<outline type="rss" title="Lil'Log" text="Lil'Log" htmlUrl="https://lilianweng.github.io/" xmlUrl="https://lilianweng.github.io/index.xml" />
+<outline type="rss" title="Simon Willison" text="Simon Willison" htmlUrl="https://simonwillison.net/" xmlUrl="https://simonwillison.net/atom/everything/" />
+<outline type="rss" title="Hugging Face Blog" text="Hugging Face Blog" htmlUrl="https://huggingface.co/blog/" xmlUrl="https://huggingface.co/blog/feed.xml" />
+<outline type="rss" title="OpenAI Blog" text="OpenAI Blog" htmlUrl="https://openai.com/blog/" xmlUrl="https://openai.com/blog/rss.xml" />
+<outline type="rss" title="Sebastian Raschka" text="Sebastian Raschka" htmlUrl="https://magazine.sebastianraschka.com/" xmlUrl="https://magazine.sebastianraschka.com/feed" />
+<outline type="rss" title="Gwern" text="Gwern" htmlUrl="https://gwern.substack.com/" xmlUrl="https://gwern.substack.com/feed" />
+<outline type="rss" title="Dwarkesh Patel" text="Dwarkesh Patel" htmlUrl="https://www.dwarkeshpatel.com/" xmlUrl="https://www.dwarkeshpatel.com/feed" />
+<outline type="rss" title="minimaxir (Max Woolf)" text="minimaxir (Max Woolf)" htmlUrl="https://minimaxir.com/" xmlUrl="https://minimaxir.com/index.xml" />
+<outline type="rss" title="Google AI Blog" text="Google AI Blog" htmlUrl="https://blog.google/technology/ai/" xmlUrl="https://blog.google/technology/ai/rss/" />
+<outline type="rss" title="Hacker News Frontpage" text="Hacker News Frontpage" htmlUrl="https://news.ycombinator.com/" xmlUrl="https://hnrss.org/frontpage" />
+<outline type="rss" title="Ars Technica" text="Ars Technica" htmlUrl="https://arstechnica.com/" xmlUrl="https://feeds.arstechnica.com/arstechnica/index" />
+<outline type="rss" title="TechCrunch" text="TechCrunch" htmlUrl="https://techcrunch.com/" xmlUrl="https://techcrunch.com/feed/" />
+<outline type="rss" title="The Verge" text="The Verge" htmlUrl="https://www.theverge.com/" xmlUrl="https://www.theverge.com/rss/index.xml" />
+<outline type="rss" title="Daring Fireball" text="Daring Fireball" htmlUrl="https://daringfireball.net/" xmlUrl="https://daringfireball.net/feeds/main" />
+<outline type="rss" title="Paul Graham" text="Paul Graham" htmlUrl="http://www.paulgraham.com/" xmlUrl="http://www.aaronsw.com/2002/feeds/pgessays.rss" />
+<outline type="rss" title="Geohot" text="Geohot" htmlUrl="https://geohot.github.io/blog/" xmlUrl="https://geohot.github.io/blog/feed.xml" />
+<outline type="rss" title="r/MachineLearning" text="r/MachineLearning" htmlUrl="https://www.reddit.com/r/MachineLearning/" xmlUrl="https://www.reddit.com/r/MachineLearning/.rss" />
+<outline type="rss" title="36氪" text="36氪" htmlUrl="https://36kr.com/" xmlUrl="https://36kr.com/feed" />
+<outline type="rss" title="机器之心 Synced" text="机器之心 Synced" htmlUrl="https://www.jiqizhixin.com/" xmlUrl="https://www.jiqizhixin.com/rss" />
+<outline type="rss" title="量子位 QbitAI" text="量子位 QbitAI" htmlUrl="https://www.qbitai.com/" xmlUrl="https://www.qbitai.com/feed" />
+<outline type="rss" title="InfoQ 中文" text="InfoQ 中文" htmlUrl="https://www.infoq.cn/" xmlUrl="https://www.infoq.cn/feed" />
+<outline type="rss" title="MIT Technology Review" text="MIT Technology Review" htmlUrl="https://www.technologyreview.com/" xmlUrl="https://www.technologyreview.com/feed" />
+<outline type="rss" title="VentureBeat AI" text="VentureBeat AI" htmlUrl="https://venturebeat.com/category/ai/" xmlUrl="https://venturebeat.com/category/ai/feed/" />
+<outline type="rss" title="404 Media" text="404 Media" htmlUrl="https://www.404media.co/" xmlUrl="https://www.404media.co/rss" />
+<outline type="rss" title="AI Snake Oil" text="AI Snake Oil" htmlUrl="https://aisnakeoil.substack.com/" xmlUrl="https://aisnakeoil.substack.com/feed" />
+<outline type="rss" title="ByteByteGo" text="ByteByteGo" htmlUrl="https://blog.bytebytego.com/" xmlUrl="https://blog.bytebytego.com/feed" />
+<outline type="rss" title="NVIDIA AI Blog" text="NVIDIA AI Blog" htmlUrl="https://blogs.nvidia.com/" xmlUrl="https://blogs.nvidia.com/feed/" />
+<outline type="rss" title="Google DeepMind Blog" text="Google DeepMind Blog" htmlUrl="https://deepmind.google/blog/" xmlUrl="https://deepmind.google/blog/rss.xml" />
+<outline type="rss" title="Product Hunt" text="Product Hunt" htmlUrl="https://www.producthunt.com/" xmlUrl="https://www.producthunt.com/feed" />
+<outline type="rss" title="爱范儿" text="爱范儿" htmlUrl="https://www.ifanr.com/" xmlUrl="https://www.ifanr.com/feed" />
+<outline type="rss" title="少数派" text="少数派" htmlUrl="https://sspai.com/" xmlUrl="https://sspai.com/feed" />
+<outline type="rss" title="Wired" text="Wired" htmlUrl="https://www.wired.com/" xmlUrl="https://www.wired.com/feed/rss" />
+<outline type="rss" title="IEEE Spectrum" text="IEEE Spectrum" htmlUrl="https://spectrum.ieee.org/" xmlUrl="https://spectrum.ieee.org/feeds/feed.rss" />
+<outline type="rss" title="Ben's Bites" text="Ben's Bites" htmlUrl="https://www.bensbites.com/" xmlUrl="https://www.bensbites.com/feed" />
+<outline type="rss" title="The Decoder" text="The Decoder" htmlUrl="https://the-decoder.com/" xmlUrl="https://the-decoder.com/feed/" />
+<outline type="rss" title="Rachel by the Bay" text="Rachel by the Bay" htmlUrl="https://rachelbythebay.com/" xmlUrl="https://rachelbythebay.com/w/atom.xml" />
+<outline type="rss" title="Xe Iaso" text="Xe Iaso" htmlUrl="https://xeiaso.net/" xmlUrl="https://xeiaso.net/blog.rss" />
+<outline type="rss" title="lcamtuf (Michal Zalewski)" text="lcamtuf (Michal Zalewski)" htmlUrl="https://lcamtuf.substack.com/" xmlUrl="https://lcamtuf.substack.com/feed" />
+<outline type="rss" title="Dynomight" text="Dynomight" htmlUrl="https://dynomight.net/" xmlUrl="https://dynomight.net/feed.xml" />
+<outline type="rss" title="Armin Ronacher" text="Armin Ronacher" htmlUrl="https://lucumr.pocoo.org/" xmlUrl="https://lucumr.pocoo.org/feed.atom" />
 </outline>
 </body>
 </opml>