
Commit 768b030

Merge pull request #3 from PoCInnovation/scrape/arxiv
add(scrap): add scraping of "le monde" and "arxiv"
2 parents: 920acf1 + 4bb5c92

5 files changed

Lines changed: 164 additions & 0 deletions

File tree

scrap/arxiv/arxiv_papers.db

32 KB
Binary file not shown.

scrap/arxiv/category.md

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
| Code           | Domain                                  |
| -------------- | --------------------------------------- |
| cs.AI          | Artificial Intelligence                 |
| cs.CL          | Computation and Language                |
| cs.CV          | Computer Vision and Pattern Recognition |
| math.PR        | Probability                             |
| stat.ML        | Machine Learning (Statistics)           |
| physics.gen-ph | General Physics                         |
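
Any of these codes can be used as the CATEGORY filter in scrap_arxiv.py below. A minimal sketch of such a query, assuming the arxiv package is installed (cs.AI and max_results=3 are arbitrary choices, not values from the script):

import arxiv

# Print the titles of a few recent papers from one category in the table above.
search = arxiv.Search(
    query="cat:cs.AI",
    max_results=3,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for paper in search.results():
    print(paper.title)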

scrap/arxiv/scrap_arxiv.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
import time
import os
import sqlite3

import arxiv

CATEGORY = "cs.LG"  # see category.md for the available codes
INTERVAL = 300  # seconds between polls
DB_FILE = os.path.join(os.path.dirname(__file__), "arxiv_papers.db")

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS papers (
        id TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT,
        published TEXT,
        summary TEXT,
        link TEXT
    )
""")
conn.commit()

def save_paper(paper):
    # INSERT OR IGNORE: the PRIMARY KEY on id silently drops duplicates.
    cursor.execute("""
        INSERT OR IGNORE INTO papers (id, title, authors, published, summary, link)
        VALUES (?, ?, ?, ?, ?, ?)
    """, (
        paper.entry_id,
        paper.title,
        ", ".join([a.name for a in paper.authors]),
        paper.published.isoformat(),
        paper.summary,
        paper.entry_id
    ))
    conn.commit()

# Seed the in-memory dedup set from papers already in the database.
cursor.execute("SELECT id FROM papers")
seen_ids = set(row[0] for row in cursor.fetchall())

while True:
    search = arxiv.Search(
        query=f"cat:{CATEGORY}",
        max_results=10,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    for result in search.results():
        if result.entry_id not in seen_ids:
            print("NEW PAPER!")
            print("Title:", result.title)
            print("Authors:", ", ".join([author.name for author in result.authors]))
            print("Published:", result.published)
            print("Link:", result.entry_id)
            print("=" * 80)

            save_paper(result)
            seen_ids.add(result.entry_id)

    time.sleep(INTERVAL)
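
Note: version 2.x of the arxiv package deprecates Search.results() in favor of an explicit Client. If the script emits a DeprecationWarning, the fetch can be rewritten along these lines (a sketch assuming arxiv>=2.0; the query values mirror the script above):

import arxiv

client = arxiv.Client()  # handles paging and request rate limiting

search = arxiv.Search(
    query="cat:cs.LG",
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for result in client.results(search):
    print(result.entry_id, result.title)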

scrap/le_monde/lemonde_articles.db

72 KB
Binary file not shown.

scrap/le_monde/scrap_le_monde.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import time
import os
import sqlite3
from datetime import datetime

import feedparser

FEEDS = [
    # Per-category feeds
    "https://www.lemonde.fr/bresil/rss_full.xml",
    "https://www.lemonde.fr/international/rss_full.xml",
    "https://www.lemonde.fr/actualite-medias/rss_full.xml",
    # The "en continu" (live news) feed
    "https://www.lemonde.fr/en_continu/rss_full.xml"
]

DB_FILE = os.path.join(os.path.dirname(__file__), "lemonde_articles.db")
INTERVAL = 300  # seconds between polls

# PARSE_DECLTYPES lets sqlite3 round-trip the TIMESTAMP column as datetime.
conn = sqlite3.connect(DB_FILE, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS articles (
        id TEXT PRIMARY KEY,
        title TEXT,
        published TIMESTAMP,
        summary TEXT,
        link TEXT,
        feed TEXT
    )
""")
conn.commit()

def save_article(entry, feed_url):
    """
    entry: a feedparser entry object
    """
    entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
    title = getattr(entry, "title", "")
    link = getattr(entry, "link", "")
    summary = getattr(entry, "summary", "")
    # Prefer the published date, fall back to the updated date, then to "now"
    # (datetime.utcnow() is deprecated from Python 3.12; datetime.now(timezone.utc)
    # is the modern spelling).
    published = None
    if getattr(entry, "published_parsed", None):
        published = datetime.fromtimestamp(time.mktime(entry.published_parsed))
    elif getattr(entry, "updated_parsed", None):
        published = datetime.fromtimestamp(time.mktime(entry.updated_parsed))
    else:
        published = datetime.utcnow()

    cur.execute("""
        INSERT OR IGNORE INTO articles (id, title, published, summary, link, feed)
        VALUES (?, ?, ?, ?, ?, ?)
    """, (entry_id, title, published, summary, link, feed_url))
    conn.commit()

def load_seen_ids():
    cur.execute("SELECT id FROM articles")
    return set(row[0] for row in cur.fetchall())

def fetch_feed(feed_url):
    return feedparser.parse(feed_url)

def main():
    print("Initializing...")
    seen_ids = load_seen_ids()
    print(f"{len(seen_ids)} articles already in the database.")
    try:
        while True:
            for feed in FEEDS:
                try:
                    d = fetch_feed(feed)
                    if d.bozo:
                        print(f"[WARN] Problem reading feed {feed}: {getattr(d, 'bozo_exception', '')}")
                        continue

                    for entry in d.entries:
                        entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
                        if entry_id is None:
                            continue
                        if entry_id not in seen_ids:
                            print(f"[NEW] {entry.get('title', '(no title)')}")
                            print(" ->", entry.get("link", ""))
                            save_article(entry, feed)
                            seen_ids.add(entry_id)
                except Exception as e:
                    print(f"[ERROR] fetching feed {feed}: {e}")

            print(f"Waiting {INTERVAL}s before the next check...")
            time.sleep(INTERVAL)
    except KeyboardInterrupt:
        print("Stopped by user.")
    finally:
        conn.close()

if __name__ == "__main__":
    main()
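
Both scripts rely on the same deduplication pattern: the entry id is the PRIMARY KEY, and INSERT OR IGNORE drops rows already stored, so the database doubles as the "seen" record across restarts. A quick sketch for inspecting what has been collected (run from scrap/le_monde/; the query and LIMIT are arbitrary):

import sqlite3

conn = sqlite3.connect("lemonde_articles.db")
for published, title in conn.execute(
    "SELECT published, title FROM articles ORDER BY published DESC LIMIT 5"
):
    print(published, "-", title)
conn.close()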
