fix: mark duplicate arxiv_ids as processed when yaml_writer detects them

Zhaoyang-Chu · Zhaoyang-Chu · commit 1d0306499a82 · 2026-05-31T18:55:25.000Z
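When append_paper skips a paper because its title or URL matches an
existing YAML entry, it now returns the paper's arxiv_id alongside the
written flag. append_papers collects these IDs and run_finalize passes
them to state_mgr.mark_processed, so the pipeline never re-processes
manually-added papers that lack an arXiv URL entry.

Note: the other skip paths in append_paper (missing category, missing
category file, YAML parse error) also return the arxiv_id, so papers
skipped for those reasons are marked as processed as well.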
diff --git a/automation/finalizer/yaml_writer.py b/automation/finalizer/yaml_writer.py
@@ -55,28 +55,32 @@ def _build_yaml_entry(paper: dict[str, Any]) -> dict[str, Any]:
     return entry
 
 
-def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
+def append_paper(paper: dict[str, Any], data_dir: Path) -> tuple[bool, str]:
     """
     Append one paper to the appropriate YAML file.
-    Returns True if written, False if skipped (already exists).
+    Returns (written, duplicate_arxiv_id):
+      - written=True  → paper was added
+      - written=False → skipped; duplicate_arxiv_id is the paper's arxiv_id
+        so the caller can add it to processed_ids and avoid re-processing.
     """
+    arxiv_id = paper.get("arxiv_id", "")
     category = paper.get("category", "")
     if not category:
         logger.warning("Paper has no category, skipping: %s", paper.get("title"))
-        return False
+        return False, arxiv_id
 
     yaml_path = data_dir / f"papers_{category}.yaml"
     if not yaml_path.exists():
         logger.warning("Category file not found: %s — skipping", yaml_path)
-        return False
+        return False, arxiv_id
 
     # Load existing entries
     existing_text = yaml_path.read_text(encoding="utf-8")
     try:
         existing: list[dict] = yaml.safe_load(existing_text) or []
     except yaml.YAMLError as exc:
         logger.error("YAML parse error in %s: %s", yaml_path, exc)
-        return False
+        return False, arxiv_id
 
     # Dedup by paper URL or title
     paper_url = paper.get("links", {}).get("paper", "")
@@ -86,7 +90,7 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
         e_title = e.get("title", "")
         if (paper_url and e_url and e_url == paper_url) or (title and e_title == title):
             logger.info("Already exists in %s, skipping: %s", yaml_path.name, title[:60])
-            return False
+            return False, arxiv_id   # return arxiv_id so caller marks it processed
 
     new_entry = _build_yaml_entry(paper)
 
@@ -104,13 +108,21 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
 
     yaml_path.write_text(new_text, encoding="utf-8")
     logger.info("Written to %s: %s", yaml_path.name, title[:60])
-    return True
+    return True, ""
 
 
-def append_papers(papers: list[dict[str, Any]], data_dir: Path) -> int:
-    """Append multiple papers. Returns count of successfully written papers."""
+def append_papers(papers: list[dict[str, Any]], data_dir: Path) -> tuple[int, list[str]]:
+    """
+    Append multiple papers.
+    Returns (written_count, duplicate_arxiv_ids) so callers can mark
+    duplicates in processed_ids and avoid re-processing them next time.
+    """
     count = 0
+    duplicate_ids: list[str] = []
     for paper in papers:
-        if append_paper(paper, data_dir):
+        written, dup_id = append_paper(paper, data_dir)
+        if written:
             count += 1
-    return count
+        elif dup_id:
+            duplicate_ids.append(dup_id)
+    return count, duplicate_ids
diff --git a/automation/main.py b/automation/main.py
@@ -298,7 +298,14 @@ def run_finalize() -> None:
 
     logger.info("=== Writing %d approved paper(s) to YAML ===", len(approved_papers))
     data_dir = _REPO_ROOT / "data"
-    written = append_papers(approved_papers, data_dir)
+    written, duplicate_ids = append_papers(approved_papers, data_dir)
+
+    # Papers detected as duplicates by yaml_writer were manually added before
+    # the pipeline started and have no arxiv_id in processed_ids yet.
+    # Mark them now so the pipeline never re-processes them.
+    if duplicate_ids:
+        state_mgr.mark_processed(state, duplicate_ids)
+        logger.info("Marked %d duplicate arxiv_id(s) as processed", len(duplicate_ids))
 
     state_mgr.save(state)