1212import xml .etree .ElementTree as ET
1313from dataclasses import dataclass
1414from pathlib import Path
15- from typing import Dict , Iterable , List , Optional , Tuple
15+ from typing import BinaryIO , Dict , Iterable , List , Optional , Tuple
1616
# XML prolog as raw bytes (UTF-8 declaration followed by a newline).
XML_DECLARATION = b'<?xml version="1.0" encoding="UTF-8"?>\n'

# Identifying User-Agent string; presumably sent with outgoing feed requests
# (the request-building code is outside this view -- confirm against caller).
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (compatible; CyberSecurityRSSBot/1.0; "
    "+https://github.com/zer0yu/CyberSecurityRSS)"
)

# Upper bound on how many bytes of a feed are consumed before giving up (2 MiB).
MAX_FEED_BYTES = 2 * 1024 * 1024

# Size of each read() issued while streaming a feed body (64 KiB).
READ_CHUNK_SIZE = 64 * 1024
2324
2425
2526class OpmlStructureError (ValueError ):
@@ -150,6 +151,19 @@ def _extract_atom_html_url(root: ET.Element) -> str:
150151 return fallback
151152
152153
def _build_feed_metadata(root_name: str, title: str, html_url: str, feed_url: str) -> FeedMetadata:
    """Assemble a ``FeedMetadata`` from values extracted out of a feed document.

    Args:
        root_name: Local (namespace-stripped) name of the document's root
            element; must be ``"rss"``, ``"feed"`` or ``"rdf"``.
        title: Feed title as found in the document; may be empty.
        html_url: Site link as found in the document; may be empty or
            relative to ``feed_url``.
        feed_url: URL the feed was fetched from.

    Returns:
        Metadata whose title falls back to the feed URL's host (or the feed
        URL itself), and whose HTML URL falls back to the feed URL.

    Raises:
        ValueError: If ``root_name`` is not a supported feed root tag.
    """
    if root_name not in {"rss", "feed", "rdf"}:
        raise ValueError(f"Unsupported feed root tag: {root_name}")

    # Prefer the declared title, then the feed host, then the raw feed URL.
    safe_title = title or urllib.parse.urlparse(feed_url).netloc or feed_url
    # Resolve a relative site link against the feed URL; with no link at all,
    # the feed URL doubles as the HTML URL.
    safe_html_url = urllib.parse.urljoin(feed_url, html_url) if html_url else feed_url
    return FeedMetadata(
        title=safe_title,
        html_url=normalize_url(safe_html_url),
        xml_url=normalize_url(feed_url),
    )
165+
166+
153167def parse_feed_metadata (xml_bytes : bytes , feed_url : str ) -> FeedMetadata :
154168 try :
155169 root = ET .fromstring (xml_bytes )
@@ -173,16 +187,98 @@ def parse_feed_metadata(xml_bytes: bytes, feed_url: str) -> FeedMetadata:
173187 if channel is not None :
174188 title = _find_text_child (channel , "title" )
175189 html_url = _find_text_child (channel , "link" )
176- else :
177- raise ValueError (f"Unsupported feed root tag: { root_name } " )
190+ return _build_feed_metadata (root_name , title , html_url , feed_url )
178191
179- safe_title = title or urllib .parse .urlparse (feed_url ).netloc or feed_url
180- safe_html_url = urllib .parse .urljoin (feed_url , html_url ) if html_url else feed_url
181- return FeedMetadata (
182- title = safe_title ,
183- html_url = normalize_url (safe_html_url ),
184- xml_url = normalize_url (feed_url ),
185- )
192+
def parse_feed_metadata_stream(
    stream: BinaryIO,
    feed_url: str,
    max_bytes: int = MAX_FEED_BYTES,
) -> FeedMetadata:
    """Extract feed metadata incrementally from a binary stream.

    The stream is read in ``READ_CHUNK_SIZE`` pieces and pushed through an
    ``XMLPullParser``. Parsing stops as soon as both a title and a site link
    are known, or when the header section of the feed is over (first Atom
    ``entry``, end of the RSS/RDF ``channel``, or end of the Atom ``feed``),
    so the remainder of the document is never read.

    Args:
        stream: Binary file-like object positioned at the start of the XML.
        feed_url: URL the feed was fetched from; used for fallbacks and for
            resolving relative links.
        max_bytes: Maximum number of bytes that may be read before the feed
            is rejected as too large.

    Returns:
        The metadata built by ``_build_feed_metadata``.

    Raises:
        ValueError: If more than ``max_bytes`` bytes had to be read, the root
            element is not a supported feed type, or the XML is malformed.
    """
    parser = ET.XMLPullParser(events=("start", "end"))
    path: List[str] = []  # local names of the currently open elements
    root_name = ""
    title = ""
    html_url = ""
    atom_fallback_html_url = ""  # first non-"alternate" Atom link, used as last resort
    bytes_read = 0

    # XMLPullParser.feed() does not raise on malformed input: it queues the
    # ParseError and read_events() re-raises it. The handler therefore has to
    # cover event consumption (and close()), not just the feed() call.
    try:
        while True:
            chunk = stream.read(READ_CHUNK_SIZE)
            if not chunk:
                break
            bytes_read += len(chunk)
            if bytes_read > max_bytes:
                raise ValueError(f"Feed payload is too large (> {max_bytes} bytes)")

            parser.feed(chunk)

            for event, elem in parser.read_events():
                local_name = strip_namespace(elem.tag)

                if event == "start":
                    path.append(local_name)

                    if len(path) == 1:
                        root_name = local_name
                        if root_name not in {"rss", "feed", "rdf"}:
                            raise ValueError(f"Unsupported feed root tag: {root_name}")
                    elif root_name == "feed" and len(path) == 2:
                        if local_name == "entry":
                            # Feed-level metadata precedes entries; nothing
                            # more to learn past this point.
                            return _build_feed_metadata(
                                root_name,
                                title,
                                html_url or atom_fallback_html_url,
                                feed_url,
                            )
                        if local_name == "link":
                            # Attributes are already available on "start".
                            href = normalize_url(elem.attrib.get("href", ""))
                            if href:
                                rel = (elem.attrib.get("rel", "alternate") or "alternate").strip().lower()
                                if rel in {"alternate", ""}:
                                    # First alternate link wins, matching the
                                    # first-wins handling of title/RSS link.
                                    if not html_url:
                                        html_url = href
                                elif not atom_fallback_html_url:
                                    atom_fallback_html_url = href
                            if title and html_url:
                                return _build_feed_metadata(root_name, title, html_url, feed_url)
                    continue

                # "end" event: element text is complete at this point.
                if root_name in {"rss", "rdf"}:
                    if len(path) == 3 and path[1] == "channel":
                        if local_name == "title" and not title:
                            title = (elem.text or "").strip()
                        elif local_name == "link" and not html_url:
                            html_url = (elem.text or "").strip()
                        if title and html_url:
                            return _build_feed_metadata(root_name, title, html_url, feed_url)
                    elif path == [root_name, "channel"]:
                        return _build_feed_metadata(root_name, title, html_url, feed_url)
                elif root_name == "feed":
                    if path == ["feed", "title"] and not title:
                        title = (elem.text or "").strip()
                        if title and html_url:
                            return _build_feed_metadata(root_name, title, html_url, feed_url)
                    elif path == ["feed"]:
                        return _build_feed_metadata(
                            root_name,
                            title,
                            html_url or atom_fallback_html_url,
                            feed_url,
                        )

                if path:
                    path.pop()
                # Drop the element's text/children so finished subtrees do not
                # accumulate while streaming.
                elem.clear()

        # End of stream without an early return: let the parser report
        # truncated or empty documents.
        parser.close()
    except ET.ParseError as exc:
        raise ValueError(f"RSS/Atom XML parse failed: {exc}") from exc

    return _build_feed_metadata(root_name, title, html_url or atom_fallback_html_url, feed_url)
186282
187283
188284def fetch_feed_metadata (feed_url : str , timeout : float ) -> FeedMetadata :
@@ -197,16 +293,12 @@ def fetch_feed_metadata(feed_url: str, timeout: float) -> FeedMetadata:
197293 status_code = int (status ) if status is not None else 0
198294 if status_code >= 400 :
199295 raise ValueError (f"Feed request failed with HTTP { status_code } " )
200- payload = response . read ( MAX_FEED_BYTES + 1 )
296+ return parse_feed_metadata_stream ( response , feed_url = feed_url )
201297 except urllib .error .HTTPError as exc :
202298 raise ValueError (f"Feed request failed with HTTP { int (exc .code )} " ) from exc
203299 except urllib .error .URLError as exc :
204300 raise ValueError (f"Feed request failed: { exc .reason } " ) from exc
205301
206- if len (payload ) > MAX_FEED_BYTES :
207- raise ValueError (f"Feed payload is too large (> { MAX_FEED_BYTES } bytes)" )
208- return parse_feed_metadata (payload , feed_url = feed_url )
209-
210302
211303def find_existing_category_for_url (body : ET .Element , xml_url : str ) -> Optional [str ]:
212304 wanted = normalize_url (xml_url )
0 commit comments