
Commit 768b030

Merge pull request #3 from PoCInnovation/scrape/arxiv
add(scrap): add scraping of "le monde" and "arxiv"
2 parents: 920acf1 + 4bb5c92

5 files changed

Lines changed: 164 additions & 0 deletions

File tree

scrap/arxiv/arxiv_papers.db

32 KB
Binary file not shown.

scrap/arxiv/category.md

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
| Code           | Domain                                  |
| -------------- | --------------------------------------- |
| cs.AI          | Artificial Intelligence                 |
| cs.CL          | Computation and Language                |
| cs.CV          | Computer Vision and Pattern Recognition |
| math.PR        | Probability                             |
| stat.ML        | Machine Learning (Statistics)           |
| physics.gen-ph | General Physics                         |
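
Any of these codes can be used as the CATEGORY filter in scrap_arxiv.py below. A minimal sketch of such a query, assuming the arxiv package is installed (cs.AI and max_results=3 are arbitrary choices, not values from the script):

import arxiv

# Print the titles of a few recent papers from one category in the table above.
search = arxiv.Search(
    query="cat:cs.AI",
    max_results=3,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for paper in search.results():
    print(paper.title)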

scrap/arxiv/scrap_arxiv.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
import time
import os
import sqlite3

import arxiv

CATEGORY = "cs.LG"  # see category.md for the available codes
INTERVAL = 300  # seconds between polls
DB_FILE = os.path.join(os.path.dirname(__file__), "arxiv_papers.db")

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS papers (
        id TEXT PRIMARY KEY,
        title TEXT,
        authors TEXT,
        published TEXT,
        summary TEXT,
        link TEXT
    )
""")
conn.commit()

def save_paper(paper):
    # INSERT OR IGNORE: the PRIMARY KEY on id silently drops duplicates.
    cursor.execute("""
        INSERT OR IGNORE INTO papers (id, title, authors, published, summary, link)
        VALUES (?, ?, ?, ?, ?, ?)
    """, (
        paper.entry_id,
        paper.title,
        ", ".join([a.name for a in paper.authors]),
        paper.published.isoformat(),
        paper.summary,
        paper.entry_id
    ))
    conn.commit()

# Seed the in-memory dedup set from papers already in the database.
cursor.execute("SELECT id FROM papers")
seen_ids = set(row[0] for row in cursor.fetchall())

while True:
    search = arxiv.Search(
        query=f"cat:{CATEGORY}",
        max_results=10,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending
    )

    for result in search.results():
        if result.entry_id not in seen_ids:
            print("NEW PAPER!")
            print("Title:", result.title)
            print("Authors:", ", ".join([author.name for author in result.authors]))
            print("Published:", result.published)
            print("Link:", result.entry_id)
            print("=" * 80)

            save_paper(result)
            seen_ids.add(result.entry_id)

    time.sleep(INTERVAL)
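
Note: version 2.x of the arxiv package deprecates Search.results() in favor of an explicit Client. If the script emits a DeprecationWarning, the fetch can be rewritten along these lines (a sketch assuming arxiv>=2.0; the query values mirror the script above):

import arxiv

client = arxiv.Client()  # handles paging and request rate limiting

search = arxiv.Search(
    query="cat:cs.LG",
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate,
)
for result in client.results(search):
    print(result.entry_id, result.title)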

scrap/le_monde/lemonde_articles.db

72 KB
Binary file not shown.

scrap/le_monde/scrap_le_monde.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import time
import os
import sqlite3
from datetime import datetime

import feedparser

FEEDS = [
    # Per-category feeds
    "https://www.lemonde.fr/bresil/rss_full.xml",
    "https://www.lemonde.fr/international/rss_full.xml",
    "https://www.lemonde.fr/actualite-medias/rss_full.xml",
    # The "en continu" (live news) feed
    "https://www.lemonde.fr/en_continu/rss_full.xml"
]

DB_FILE = os.path.join(os.path.dirname(__file__), "lemonde_articles.db")
INTERVAL = 300  # seconds between polls

# PARSE_DECLTYPES lets sqlite3 round-trip the TIMESTAMP column as datetime.
conn = sqlite3.connect(DB_FILE, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS articles (
        id TEXT PRIMARY KEY,
        title TEXT,
        published TIMESTAMP,
        summary TEXT,
        link TEXT,
        feed TEXT
    )
""")
conn.commit()

def save_article(entry, feed_url):
    """
    entry: a feedparser entry object
    """
    entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
    title = getattr(entry, "title", "")
    link = getattr(entry, "link", "")
    summary = getattr(entry, "summary", "")
    # Prefer the published date, fall back to the updated date, then to "now"
    # (datetime.utcnow() is deprecated from Python 3.12; datetime.now(timezone.utc)
    # is the modern spelling).
    published = None
    if getattr(entry, "published_parsed", None):
        published = datetime.fromtimestamp(time.mktime(entry.published_parsed))
    elif getattr(entry, "updated_parsed", None):
        published = datetime.fromtimestamp(time.mktime(entry.updated_parsed))
    else:
        published = datetime.utcnow()

    cur.execute("""
        INSERT OR IGNORE INTO articles (id, title, published, summary, link, feed)
        VALUES (?, ?, ?, ?, ?, ?)
    """, (entry_id, title, published, summary, link, feed_url))
    conn.commit()

def load_seen_ids():
    cur.execute("SELECT id FROM articles")
    return set(row[0] for row in cur.fetchall())

def fetch_feed(feed_url):
    return feedparser.parse(feed_url)

def main():
    print("Initializing...")
    seen_ids = load_seen_ids()
    print(f"{len(seen_ids)} articles already in the database.")
    try:
        while True:
            for feed in FEEDS:
                try:
                    d = fetch_feed(feed)
                    if d.bozo:
                        print(f"[WARN] Problem reading feed {feed}: {getattr(d, 'bozo_exception', '')}")
                        continue

                    for entry in d.entries:
                        entry_id = getattr(entry, "id", None) or getattr(entry, "link", None)
                        if entry_id is None:
                            continue
                        if entry_id not in seen_ids:
                            print(f"[NEW] {entry.get('title', '(no title)')}")
                            print(" ->", entry.get("link", ""))
                            save_article(entry, feed)
                            seen_ids.add(entry_id)
                except Exception as e:
                    print(f"[ERROR] fetching feed {feed}: {e}")

            print(f"Waiting {INTERVAL}s before the next check...")
            time.sleep(INTERVAL)
    except KeyboardInterrupt:
        print("Stopped by user.")
    finally:
        conn.close()

if __name__ == "__main__":
    main()
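
Both scripts rely on the same deduplication pattern: the entry id is the PRIMARY KEY, and INSERT OR IGNORE drops rows already stored, so the database doubles as the "seen" record across restarts. A quick sketch for inspecting what has been collected (run from scrap/le_monde/; the query and LIMIT are arbitrary):

import sqlite3

conn = sqlite3.connect("lemonde_articles.db")
for published, title in conn.execute(
    "SELECT published, title FROM articles ORDER BY published DESC LIMIT 5"
):
    print(published, "-", title)
conn.close()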
