Skip to content

Commit 21a94bb

Browse files
Niloth-p authored and timabbott committed
rss-bot: Support unordered RSS feeds.
By splitting the logic into two loops - one for processing all the entries in the feed, and another to post only the latest ones in chronological order. Instead of tracking new_hashes in memory while processing the feed file, we track unhashed_entries now, since we will not be hashing all the entries, only the ones that we post. Fixes #831.
1 parent 89c777c commit 21a94bb

1 file changed

Lines changed: 27 additions & 29 deletions

File tree

zulip/integrations/rss/rss-bot

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,6 @@ client: zulip.Client = zulip.Client(
226226
client="ZulipRSS/" + VERSION,
227227
)
228228

229-
first_message = True
230-
231229
for feed_url in feed_urls:
232230
feed_hashes_file = os.path.join(
233231
opts.data_dir, urllib.parse.urlparse(feed_url).netloc
@@ -239,7 +237,7 @@ for feed_url in feed_urls:
239237
except OSError:
240238
old_feed_hashes = {}
241239

242-
new_hashes: List[str] = []
240+
unhashed_entries: List[tuple[Any, str, float]] = []
243241
data = feedparser.parse(feed_url)
244242
feed_name: str = data.feed.title or feed_url
245243
# Safeguard to not process older entries in unordered feeds
@@ -249,32 +247,32 @@ for feed_url in feed_urls:
249247
entry_hash = compute_entry_hash(entry)
250248
entry_time, is_time_tagged = get_entry_time(entry)
251249
if (is_time_tagged and entry_time < entry_threshold) or entry_hash in old_feed_hashes:
252-
# As a safeguard against misbehaving feeds, don't try to process
253-
# entries older than some threshold.
254250
continue
255-
if entry_hash in old_feed_hashes:
256-
# We've already seen this. No need to process any older entries.
257-
break
258-
if not old_feed_hashes and len(new_hashes) >= opts.max_batch_size:
259-
# On a first run, pick up the n (= opts.max_batch_size) most recent entries.
260-
# An RSS feed has entries in reverse chronological order.
261-
break
262-
263-
response: Dict[str, Any] = send_zulip(entry, feed_name)
264-
if response["result"] != "success":
265-
logger.error("Error processing %s", feed_url)
266-
logger.error("%s", response)
267-
if first_message:
268-
# This is probably some fundamental problem like the stream not
269-
# existing or something being misconfigured, so bail instead of
270-
# getting the same error for every RSS entry.
271-
log_error_and_exit("Failed to process first message")
272-
# Go ahead and move on -- perhaps this entry is corrupt.
273-
new_hashes.append(entry_hash)
274-
first_message = False
251+
unhashed_entries.append((entry, entry_hash, entry_time))
275252

276-
with open(feed_hashes_file, "a") as f:
277-
for hash in new_hashes:
278-
f.write(hash + "\n")
253+
# We process all entries to support unordered feeds,
254+
# but post only the latest ones in chronological order.
255+
sorted_entries = sorted(unhashed_entries, key=lambda x: x[2])[-opts.max_batch_size :]
279256

280-
logger.info("Sent zulips for %d %s entries", len(new_hashes), feed_url)
257+
with open(feed_hashes_file, "a") as f:
258+
for entry_tuple in sorted_entries:
259+
entry, entry_hash, _ = entry_tuple
260+
261+
response: Dict[str, Any] = send_zulip(entry, feed_name)
262+
if response["result"] != "success":
263+
logger.error("Error processing %s", feed_url)
264+
logger.error("%s", response)
265+
if not old_feed_hashes and entry_tuple == sorted_entries[0]:
266+
# This is probably some fundamental problem like the stream not
267+
# existing or something being misconfigured, so bail instead of
268+
# getting the same error for every RSS entry.
269+
log_error_and_exit("Failed to process first message")
270+
# Go ahead and move on -- perhaps this entry is corrupt.
271+
f.write(entry_hash + "\n")
272+
273+
logger.info(
274+
"Processed %d entries from %s and sent %d zulips",
275+
len(unhashed_entries),
276+
feed_url,
277+
len(sorted_entries),
278+
)

0 commit comments

Comments (0)