Skip to content

Commit 0ba5ec6

Browse files
committed
feat: stream-parse feed XML and expand OPML subscriptions
- Refactor fetch_feed_metadata to use XMLPullParser for incremental streaming, so large feeds (>2 MB) no longer have to be loaded in full
- Extract _build_feed_metadata helper to eliminate duplicated logic
- Add FakeResponse mock and incremental-read test to the test suite
- Add 2 security feeds (Google Bug Hunters, Youssef Sammouda)
- Add ~40 AI/tech feeds (Lil'Log, Simon Willison, HN, Wired, etc.)
1 parent e0a0b15 commit 0ba5ec6

3 files changed

Lines changed: 202 additions & 15 deletions

File tree

scripts/add_feed_to_tiny.py

Lines changed: 107 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,15 @@
1212
import xml.etree.ElementTree as ET
1313
from dataclasses import dataclass
1414
from pathlib import Path
15-
from typing import Dict, Iterable, List, Optional, Tuple
15+
from typing import BinaryIO, Dict, Iterable, List, Optional, Tuple
1616

1717
XML_DECLARATION = b'<?xml version="1.0" encoding="UTF-8"?>\n'
1818
DEFAULT_USER_AGENT = (
1919
"Mozilla/5.0 (compatible; CyberSecurityRSSBot/1.0; "
2020
"+https://github.com/zer0yu/CyberSecurityRSS)"
2121
)
2222
MAX_FEED_BYTES = 2 * 1024 * 1024
23+
READ_CHUNK_SIZE = 64 * 1024
2324

2425

2526
class OpmlStructureError(ValueError):
@@ -150,6 +151,19 @@ def _extract_atom_html_url(root: ET.Element) -> str:
150151
return fallback
151152

152153

154+
def _build_feed_metadata(root_name: str, title: str, html_url: str, feed_url: str) -> FeedMetadata:
    """Assemble a ``FeedMetadata`` record from values scraped out of a feed.

    Args:
        root_name: Namespace-free tag name of the feed's root element.
        title: Feed title as found in the document; may be empty.
        html_url: Site link as found in the document; may be empty or
            relative to ``feed_url``.
        feed_url: URL the feed was fetched from.

    Returns:
        A ``FeedMetadata`` whose title falls back to the feed host (then to
        the feed URL itself) and whose ``html_url`` falls back to the feed
        URL when the document supplied none.

    Raises:
        ValueError: If ``root_name`` is not ``rss``, ``feed`` or ``rdf``.
    """
    if root_name not in ("rss", "feed", "rdf"):
        raise ValueError(f"Unsupported feed root tag: {root_name}")

    # Title fallback chain: document title -> host of the feed URL -> feed URL.
    if title:
        display_title = title
    else:
        display_title = urllib.parse.urlparse(feed_url).netloc or feed_url

    # Resolve a relative site link against the feed URL; with no link at all
    # the feed URL doubles as the site URL.
    if html_url:
        site_url = urllib.parse.urljoin(feed_url, html_url)
    else:
        site_url = feed_url

    return FeedMetadata(
        title=display_title,
        html_url=normalize_url(site_url),
        xml_url=normalize_url(feed_url),
    )
166+
153167
def parse_feed_metadata(xml_bytes: bytes, feed_url: str) -> FeedMetadata:
154168
try:
155169
root = ET.fromstring(xml_bytes)
@@ -173,16 +187,98 @@ def parse_feed_metadata(xml_bytes: bytes, feed_url: str) -> FeedMetadata:
173187
if channel is not None:
174188
title = _find_text_child(channel, "title")
175189
html_url = _find_text_child(channel, "link")
176-
else:
177-
raise ValueError(f"Unsupported feed root tag: {root_name}")
190+
return _build_feed_metadata(root_name, title, html_url, feed_url)
178191

179-
safe_title = title or urllib.parse.urlparse(feed_url).netloc or feed_url
180-
safe_html_url = urllib.parse.urljoin(feed_url, html_url) if html_url else feed_url
181-
return FeedMetadata(
182-
title=safe_title,
183-
html_url=normalize_url(safe_html_url),
184-
xml_url=normalize_url(feed_url),
185-
)
192+
193+
def parse_feed_metadata_stream(
194+
stream: BinaryIO,
195+
feed_url: str,
196+
max_bytes: int = MAX_FEED_BYTES,
197+
) -> FeedMetadata:
198+
parser = ET.XMLPullParser(events=("start", "end"))
199+
path: List[str] = []
200+
root_name = ""
201+
title = ""
202+
html_url = ""
203+
atom_fallback_html_url = ""
204+
bytes_read = 0
205+
206+
while True:
207+
chunk = stream.read(READ_CHUNK_SIZE)
208+
if not chunk:
209+
break
210+
bytes_read += len(chunk)
211+
if bytes_read > max_bytes:
212+
raise ValueError(f"Feed payload is too large (> {max_bytes} bytes)")
213+
214+
try:
215+
parser.feed(chunk)
216+
except ET.ParseError as exc:
217+
raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc
218+
219+
for event, elem in parser.read_events():
220+
local_name = strip_namespace(elem.tag)
221+
222+
if event == "start":
223+
path.append(local_name)
224+
225+
if len(path) == 1:
226+
root_name = local_name
227+
if root_name not in {"rss", "feed", "rdf"}:
228+
raise ValueError(f"Unsupported feed root tag: {root_name}")
229+
elif root_name == "feed" and len(path) == 2:
230+
if local_name == "entry":
231+
return _build_feed_metadata(
232+
root_name,
233+
title,
234+
html_url or atom_fallback_html_url,
235+
feed_url,
236+
)
237+
if local_name == "link":
238+
href = normalize_url(elem.attrib.get("href", ""))
239+
if href:
240+
rel = (elem.attrib.get("rel", "alternate") or "alternate").strip().lower()
241+
if rel in {"alternate", ""}:
242+
html_url = href
243+
elif not atom_fallback_html_url:
244+
atom_fallback_html_url = href
245+
if title and html_url:
246+
return _build_feed_metadata(root_name, title, html_url, feed_url)
247+
continue
248+
249+
if root_name in {"rss", "rdf"}:
250+
if len(path) == 3 and path[1] == "channel":
251+
if local_name == "title" and not title:
252+
title = (elem.text or "").strip()
253+
elif local_name == "link" and not html_url:
254+
html_url = (elem.text or "").strip()
255+
if title and html_url:
256+
return _build_feed_metadata(root_name, title, html_url, feed_url)
257+
elif path == [root_name, "channel"]:
258+
return _build_feed_metadata(root_name, title, html_url, feed_url)
259+
elif root_name == "feed":
260+
if path == ["feed", "title"] and not title:
261+
title = (elem.text or "").strip()
262+
if title and html_url:
263+
return _build_feed_metadata(root_name, title, html_url, feed_url)
264+
elif path == ["feed"]:
265+
return _build_feed_metadata(
266+
root_name,
267+
title,
268+
html_url or atom_fallback_html_url,
269+
feed_url,
270+
)
271+
272+
if path:
273+
path.pop()
274+
elem.clear()
275+
276+
try:
277+
parser.close()
278+
except ET.ParseError as exc:
279+
raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc
280+
281+
return _build_feed_metadata(root_name, title, html_url or atom_fallback_html_url, feed_url)
186282

187283

188284
def fetch_feed_metadata(feed_url: str, timeout: float) -> FeedMetadata:
@@ -197,16 +293,12 @@ def fetch_feed_metadata(feed_url: str, timeout: float) -> FeedMetadata:
197293
status_code = int(status) if status is not None else 0
198294
if status_code >= 400:
199295
raise ValueError(f"Feed request failed with HTTP {status_code}")
200-
payload = response.read(MAX_FEED_BYTES + 1)
296+
return parse_feed_metadata_stream(response, feed_url=feed_url)
201297
except urllib.error.HTTPError as exc:
202298
raise ValueError(f"Feed request failed with HTTP {int(exc.code)}") from exc
203299
except urllib.error.URLError as exc:
204300
raise ValueError(f"Feed request failed: {exc.reason}") from exc
205301

206-
if len(payload) > MAX_FEED_BYTES:
207-
raise ValueError(f"Feed payload is too large (> {MAX_FEED_BYTES} bytes)")
208-
return parse_feed_metadata(payload, feed_url=feed_url)
209-
210302

211303
def find_existing_category_for_url(body: ET.Element, xml_url: str) -> Optional[str]:
212304
wanted = normalize_url(xml_url)

tests/test_add_feed_to_tiny.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
import unittest
22
import xml.etree.ElementTree as ET
3+
from unittest import mock
34

45
from scripts.add_feed_to_tiny import (
56
FeedMetadata,
7+
MAX_FEED_BYTES,
68
add_feed_to_tree,
9+
fetch_feed_metadata,
710
parse_feed_metadata,
811
)
912

@@ -30,6 +33,30 @@ def rss_urls_in_category(tree: ET.ElementTree, category_name: str):
3033
return []
3134

3235

36+
class FakeResponse:
    """In-memory stand-in for an HTTP response, serving a fixed payload.

    Supports sequential ``read`` calls, ``getcode``, a ``status`` attribute
    and use as a context manager.
    """

    def __init__(self, payload: bytes, status: int = 200):
        self.status = status
        self._payload = payload
        self._offset = 0  # index of the next unread byte

    def read(self, size: int = -1) -> bytes:
        """Return up to ``size`` unread bytes; ``None`` or a negative size reads the rest."""
        remaining = len(self._payload) - self._offset
        if size is None or size < 0:
            wanted = remaining
        else:
            wanted = min(size, remaining)
        begin = self._offset
        self._offset = begin + wanted
        return self._payload[begin:self._offset]

    def getcode(self) -> int:
        """Return the configured HTTP status code."""
        return self.status

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Never suppress exceptions raised inside the ``with`` body.
        return False
58+
59+
3360
class AddFeedToTinyTests(unittest.TestCase):
3461
def test_parse_rss_metadata(self):
3562
xml_bytes = b"""<?xml version='1.0' encoding='UTF-8'?>
@@ -117,6 +144,32 @@ def test_add_feed_skips_duplicate_xml_url(self):
117144
["https://x.example.com/feed.xml"],
118145
)
119146

147+
def test_fetch_feed_metadata_reads_large_feed_incrementally(self):
    """A feed bigger than MAX_FEED_BYTES still yields metadata from its header."""
    filler = b"x" * 16384
    single_item = b"".join(
        [
            b"<item><title>Entry</title><link>https://example.com/post</link>",
            b"<description>",
            filler,
            b"</description></item>",
        ]
    )
    header = (
        b"<?xml version='1.0' encoding='UTF-8'?>\n"
        b"<rss version='2.0'><channel>"
        b"<title>Large Feed</title>"
        b"<link>https://example.com</link>"
        b"<description>Example feed</description>"
    )
    payload = header + single_item * 160 + b"</channel></rss>"
    # Guard: the fixture must really exceed the size limit.
    self.assertGreater(len(payload), MAX_FEED_BYTES)

    fake_response = FakeResponse(payload)
    patcher = mock.patch(
        "scripts.add_feed_to_tiny.urllib.request.urlopen",
        return_value=fake_response,
    )
    with patcher:
        meta = fetch_feed_metadata("https://example.com/feed.xml", timeout=5)

    self.assertEqual(meta.title, "Large Feed")
    self.assertEqual(meta.html_url, "https://example.com")
    self.assertEqual(meta.xml_url, "https://example.com/feed.xml")
172+
120173

121174
if __name__ == "__main__":
122175
unittest.main()

tiny.opml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@
152152
<outline type="rss" xmlUrl="http://wjlshare.com/feed" htmlUrl="http://wjlshare.com" title="天下大木头" text="天下大木头" />
153153
<outline xmlUrl="https://rce.moe/atom.xml" type="rss" title="白帽酱の博客" text="白帽酱の博客" htmlUrl="https://rce.moe/atom.xml" />
154154
<outline type="rss" xmlUrl="https://blog.slonser.info/index.xml" htmlUrl="https://blog.slonser.info" text="Slonser Notes" title="Slonser Notes" />
155+
<outline type="rss" title="Security Engineering Blog" text="Security Engineering Blog" htmlUrl="https://bughunters.google.com/blog" xmlUrl="https://bughunters.google.com/feed/en" />
156+
<outline type="rss" title="Youssef Sammouda (sam0) personal blog" text="Youssef Sammouda (sam0) personal blog" htmlUrl="https://ysamm.com/" xmlUrl="https://ysamm.com/feed.xml" />
155157
</outline>
156158
<outline title="AI" text="AI">
157159
<outline text="krebsonsecurity.com" title="krebsonsecurity.com" htmlUrl="https://krebsonsecurity.com" type="rss" xmlUrl="https://krebsonsecurity.com/feed/" />
@@ -201,6 +203,46 @@
201203
<outline type="rss" title="Engineering" text="Engineering" htmlUrl="https://www.anthropic.com/engineering/" xmlUrl="https://rss.app/feeds/E5pQ7SXRZQVEiFuh.xml" />
202204
<outline type="rss" title="博客 · Cursor" text="博客 · Cursor" htmlUrl="https://cursor.com/cn/blog/" xmlUrl="https://rss.app/feeds/gURU8i2ldFzpwRCr.xml" />
203205
<outline type="rss" title="OpenAI Newsroom | Engineering" text="OpenAI Newsroom | Engineering" htmlUrl="https://openai.com/news/engineering/" xmlUrl="https://rss.app/feeds/75Fpp0Mbx9x4TbDs.xml" />
206+
<outline type="rss" title="Lil'Log" text="Lil'Log" htmlUrl="https://lilianweng.github.io/" xmlUrl="https://lilianweng.github.io/index.xml" />
207+
<outline type="rss" title="Simon Willison" text="Simon Willison" htmlUrl="https://simonwillison.net/" xmlUrl="https://simonwillison.net/atom/everything/" />
208+
<outline type="rss" title="Hugging Face Blog" text="Hugging Face Blog" htmlUrl="https://huggingface.co/blog/" xmlUrl="https://huggingface.co/blog/feed.xml" />
209+
<outline type="rss" title="OpenAI Blog" text="OpenAI Blog" htmlUrl="https://openai.com/blog/" xmlUrl="https://openai.com/blog/rss.xml" />
210+
<outline type="rss" title="Sebastian Raschka" text="Sebastian Raschka" htmlUrl="https://magazine.sebastianraschka.com/" xmlUrl="https://magazine.sebastianraschka.com/feed" />
211+
<outline type="rss" title="Gwern" text="Gwern" htmlUrl="https://gwern.substack.com/" xmlUrl="https://gwern.substack.com/feed" />
212+
<outline type="rss" title="Dwarkesh Patel" text="Dwarkesh Patel" htmlUrl="https://www.dwarkeshpatel.com/" xmlUrl="https://www.dwarkeshpatel.com/feed" />
213+
<outline type="rss" title="minimaxir (Max Woolf)" text="minimaxir (Max Woolf)" htmlUrl="https://minimaxir.com/" xmlUrl="https://minimaxir.com/index.xml" />
214+
<outline type="rss" title="Google AI Blog" text="Google AI Blog" htmlUrl="https://blog.google/technology/ai/" xmlUrl="https://blog.google/technology/ai/rss/" />
215+
<outline type="rss" title="Hacker News Frontpage" text="Hacker News Frontpage" htmlUrl="https://news.ycombinator.com/" xmlUrl="https://hnrss.org/frontpage" />
216+
<outline type="rss" title="Ars Technica" text="Ars Technica" htmlUrl="https://arstechnica.com/" xmlUrl="https://feeds.arstechnica.com/arstechnica/index" />
217+
<outline type="rss" title="TechCrunch" text="TechCrunch" htmlUrl="https://techcrunch.com/" xmlUrl="https://techcrunch.com/feed/" />
218+
<outline type="rss" title="The Verge" text="The Verge" htmlUrl="https://www.theverge.com/" xmlUrl="https://www.theverge.com/rss/index.xml" />
219+
<outline type="rss" title="Daring Fireball" text="Daring Fireball" htmlUrl="https://daringfireball.net/" xmlUrl="https://daringfireball.net/feeds/main" />
220+
<outline type="rss" title="Paul Graham" text="Paul Graham" htmlUrl="http://www.paulgraham.com/" xmlUrl="http://www.aaronsw.com/2002/feeds/pgessays.rss" />
221+
<outline type="rss" title="Geohot" text="Geohot" htmlUrl="https://geohot.github.io/blog/" xmlUrl="https://geohot.github.io/blog/feed.xml" />
222+
<outline type="rss" title="r/MachineLearning" text="r/MachineLearning" htmlUrl="https://www.reddit.com/r/MachineLearning/" xmlUrl="https://www.reddit.com/r/MachineLearning/.rss" />
223+
<outline type="rss" title="36氪" text="36氪" htmlUrl="https://36kr.com/" xmlUrl="https://36kr.com/feed" />
224+
<outline type="rss" title="机器之心 Synced" text="机器之心 Synced" htmlUrl="https://www.jiqizhixin.com/" xmlUrl="https://www.jiqizhixin.com/rss" />
225+
<outline type="rss" title="量子位 QbitAI" text="量子位 QbitAI" htmlUrl="https://www.qbitai.com/" xmlUrl="https://www.qbitai.com/feed" />
226+
<outline type="rss" title="InfoQ 中文" text="InfoQ 中文" htmlUrl="https://www.infoq.cn/" xmlUrl="https://www.infoq.cn/feed" />
227+
<outline type="rss" title="MIT Technology Review" text="MIT Technology Review" htmlUrl="https://www.technologyreview.com/" xmlUrl="https://www.technologyreview.com/feed" />
228+
<outline type="rss" title="VentureBeat AI" text="VentureBeat AI" htmlUrl="https://venturebeat.com/category/ai/" xmlUrl="https://venturebeat.com/category/ai/feed/" />
229+
<outline type="rss" title="404 Media" text="404 Media" htmlUrl="https://www.404media.co/" xmlUrl="https://www.404media.co/rss" />
230+
<outline type="rss" title="AI Snake Oil" text="AI Snake Oil" htmlUrl="https://aisnakeoil.substack.com/" xmlUrl="https://aisnakeoil.substack.com/feed" />
231+
<outline type="rss" title="ByteByteGo" text="ByteByteGo" htmlUrl="https://blog.bytebytego.com/" xmlUrl="https://blog.bytebytego.com/feed" />
232+
<outline type="rss" title="NVIDIA AI Blog" text="NVIDIA AI Blog" htmlUrl="https://blogs.nvidia.com/" xmlUrl="https://blogs.nvidia.com/feed/" />
233+
<outline type="rss" title="Google DeepMind Blog" text="Google DeepMind Blog" htmlUrl="https://deepmind.google/blog/" xmlUrl="https://deepmind.google/blog/rss.xml" />
234+
<outline type="rss" title="Product Hunt" text="Product Hunt" htmlUrl="https://www.producthunt.com/" xmlUrl="https://www.producthunt.com/feed" />
235+
<outline type="rss" title="爱范儿" text="爱范儿" htmlUrl="https://www.ifanr.com/" xmlUrl="https://www.ifanr.com/feed" />
236+
<outline type="rss" title="少数派" text="少数派" htmlUrl="https://sspai.com/" xmlUrl="https://sspai.com/feed" />
237+
<outline type="rss" title="Wired" text="Wired" htmlUrl="https://www.wired.com/" xmlUrl="https://www.wired.com/feed/rss" />
238+
<outline type="rss" title="IEEE Spectrum" text="IEEE Spectrum" htmlUrl="https://spectrum.ieee.org/" xmlUrl="https://spectrum.ieee.org/feeds/feed.rss" />
239+
<outline type="rss" title="Ben's Bites" text="Ben's Bites" htmlUrl="https://www.bensbites.com/" xmlUrl="https://www.bensbites.com/feed" />
240+
<outline type="rss" title="The Decoder" text="The Decoder" htmlUrl="https://the-decoder.com/" xmlUrl="https://the-decoder.com/feed/" />
241+
<outline type="rss" title="Rachel by the Bay" text="Rachel by the Bay" htmlUrl="https://rachelbythebay.com/" xmlUrl="https://rachelbythebay.com/w/atom.xml" />
242+
<outline type="rss" title="Xe Iaso" text="Xe Iaso" htmlUrl="https://xeiaso.net/" xmlUrl="https://xeiaso.net/blog.rss" />
243+
<outline type="rss" title="lcamtuf (Michal Zalewski)" text="lcamtuf (Michal Zalewski)" htmlUrl="https://lcamtuf.substack.com/" xmlUrl="https://lcamtuf.substack.com/feed" />
244+
<outline type="rss" title="Dynomight" text="Dynomight" htmlUrl="https://dynomight.net/" xmlUrl="https://dynomight.net/feed.xml" />
245+
<outline type="rss" title="Armin Ronacher" text="Armin Ronacher" htmlUrl="https://lucumr.pocoo.org/" xmlUrl="https://lucumr.pocoo.org/feed.atom" />
204246
</outline>
205247
</body>
206248
</opml>

0 commit comments

Comments
 (0)