Skip to content

Commit 1d03064

Browse files
committed
fix: mark duplicate arxiv_ids as processed when yaml_writer detects them
When append_paper finds a title/URL match, it now returns the paper's arxiv_id together with the written flag. run_finalize adds these IDs to processed_ids so the pipeline never re-processes manually added papers that lack an arXiv URL entry.
1 parent 4f8476b commit 1d03064

2 files changed

Lines changed: 31 additions & 12 deletions

File tree

automation/finalizer/yaml_writer.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,28 +55,32 @@ def _build_yaml_entry(paper: dict[str, Any]) -> dict[str, Any]:
5555
return entry
5656

5757

58-
def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
58+
def append_paper(paper: dict[str, Any], data_dir: Path) -> tuple[bool, str]:
5959
"""
6060
Append one paper to the appropriate YAML file.
61-
Returns True if written, False if skipped (already exists).
61+
Returns (written, duplicate_arxiv_id):
62+
- written=True → paper was added
63+
- written=False → skipped; duplicate_arxiv_id is the paper's arxiv_id
64+
so the caller can add it to processed_ids and avoid re-processing.
6265
"""
66+
arxiv_id = paper.get("arxiv_id", "")
6367
category = paper.get("category", "")
6468
if not category:
6569
logger.warning("Paper has no category, skipping: %s", paper.get("title"))
66-
return False
70+
return False, arxiv_id
6771

6872
yaml_path = data_dir / f"papers_{category}.yaml"
6973
if not yaml_path.exists():
7074
logger.warning("Category file not found: %s — skipping", yaml_path)
71-
return False
75+
return False, arxiv_id
7276

7377
# Load existing entries
7478
existing_text = yaml_path.read_text(encoding="utf-8")
7579
try:
7680
existing: list[dict] = yaml.safe_load(existing_text) or []
7781
except yaml.YAMLError as exc:
7882
logger.error("YAML parse error in %s: %s", yaml_path, exc)
79-
return False
83+
return False, arxiv_id
8084

8185
# Dedup by paper URL or title
8286
paper_url = paper.get("links", {}).get("paper", "")
@@ -86,7 +90,7 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
8690
e_title = e.get("title", "")
8791
if (paper_url and e_url and e_url == paper_url) or (title and e_title == title):
8892
logger.info("Already exists in %s, skipping: %s", yaml_path.name, title[:60])
89-
return False
93+
return False, arxiv_id # return arxiv_id so caller marks it processed
9094

9195
new_entry = _build_yaml_entry(paper)
9296

@@ -104,13 +108,21 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
104108

105109
yaml_path.write_text(new_text, encoding="utf-8")
106110
logger.info("Written to %s: %s", yaml_path.name, title[:60])
107-
return True
111+
return True, ""
108112

109113

110-
def append_papers(papers: list[dict[str, Any]], data_dir: Path) -> int:
111-
"""Append multiple papers. Returns count of successfully written papers."""
114+
def append_papers(papers: list[dict[str, Any]], data_dir: Path) -> tuple[int, list[str]]:
115+
"""
116+
Append multiple papers.
117+
Returns (written_count, duplicate_arxiv_ids) so callers can mark
118+
duplicates in processed_ids and avoid re-processing them next time.
119+
"""
112120
count = 0
121+
duplicate_ids: list[str] = []
113122
for paper in papers:
114-
if append_paper(paper, data_dir):
123+
written, dup_id = append_paper(paper, data_dir)
124+
if written:
115125
count += 1
116-
return count
126+
elif dup_id:
127+
duplicate_ids.append(dup_id)
128+
return count, duplicate_ids

automation/main.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,14 @@ def run_finalize() -> None:
298298

299299
logger.info("=== Writing %d approved paper(s) to YAML ===", len(approved_papers))
300300
data_dir = _REPO_ROOT / "data"
301-
written = append_papers(approved_papers, data_dir)
301+
written, duplicate_ids = append_papers(approved_papers, data_dir)
302+
303+
# Papers detected as duplicates by yaml_writer were manually added before
304+
# the pipeline started and have no arxiv_id in processed_ids yet.
305+
# Mark them now so the pipeline never re-processes them.
306+
if duplicate_ids:
307+
state_mgr.mark_processed(state, duplicate_ids)
308+
logger.info("Marked %d duplicate arxiv_id(s) as processed", len(duplicate_ids))
302309

303310
state_mgr.save(state)
304311

0 commit comments

Comments
 (0)