-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscraper_utils.py
More file actions
30 lines (24 loc) · 878 Bytes
/
scraper_utils.py
File metadata and controls
30 lines (24 loc) · 878 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import datetime
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
def needs_update(url, previous_data, max_days, force_update):
    """Decide whether the entry for *url* should be fetched again.

    Args:
        url: Key identifying the entry in *previous_data*.
        previous_data: Mapping from URL to a record that holds a
            ``"timestamp"`` string in ``%Y/%m/%d`` format.
        max_days: Number of whole days a record may age before it is
            considered stale.
        force_update: When truthy, always report that an update is needed.

    Returns:
        ``True`` if *force_update* is set, *url* has no record, the record
        has no usable timestamp, or the record is more than *max_days*
        whole days old; ``False`` otherwise.
    """
    if force_update or url not in previous_data:
        return True
    try:
        last_date = datetime.datetime.strptime(
            previous_data[url]["timestamp"], "%Y/%m/%d"
        )
    except (KeyError, TypeError, ValueError):
        # A record without a parsable timestamp cannot prove it is fresh,
        # so refresh it instead of aborting the caller.
        return True
    # Both operands are naive local datetimes; ``.days`` is the whole-day
    # component of the difference.
    return (datetime.datetime.now() - last_date).days > max_days
def parse_timestamp(date_str, fmt="%Y-%m-%d"):
    """Normalise a date string to ``YYYY/MM/DD``.

    Only the first ten characters of *date_str* are parsed, so anything
    after them (for example a time part) is ignored.

    Args:
        date_str: Text whose first ten characters hold the date.
        fmt: ``strptime`` format describing those ten characters.

    Returns:
        The date rendered with the ``%Y/%m/%d`` format.

    Raises:
        ValueError: If the leading ten characters do not match *fmt*.
    """
    date_part = date_str[:10]
    parsed_date = datetime.datetime.strptime(date_part, fmt)
    return parsed_date.strftime("%Y/%m/%d")
def convert_to_rss_url(url: str) -> str:
    """Return *url* with its ``page`` query parameter set to ``rss``.

    Every other URL component and query parameter is carried over. An
    existing ``page`` parameter is replaced; otherwise ``page=rss`` is
    appended to the query.

    Args:
        url: The URL to convert.

    Returns:
        The rewritten URL. The query string is re-encoded with
        ``urlencode``, so equivalent escapes may be normalised (for example
        ``%20`` becomes ``+``).
    """
    parsed = urlparse(url)
    # keep_blank_values=True: parse_qs otherwise drops parameters such as
    # "q=", which would silently change the query being converted.
    query = parse_qs(parsed.query, keep_blank_values=True)
    query["page"] = ["rss"]
    # doseq=True expands the value lists produced by parse_qs back into
    # repeated key=value pairs.
    return urlunparse(parsed._replace(query=urlencode(query, doseq=True)))