@@ -55,28 +55,32 @@ def _build_yaml_entry(paper: dict[str, Any]) -> dict[str, Any]:
5555 return entry
5656
5757
58- def append_paper (paper : dict [str , Any ], data_dir : Path ) -> bool :
58+ def append_paper (paper : dict [str , Any ], data_dir : Path ) -> tuple [ bool , str ] :
5959 """
6060 Append one paper to the appropriate YAML file.
61- Returns True if written, False if skipped (already exists).
61+ Returns (written, duplicate_arxiv_id):
62+ - written=True → paper was added
63+ - written=False → skipped; duplicate_arxiv_id is the paper's arxiv_id
64+ so the caller can add it to processed_ids and avoid re-processing.
6265 """
66+ arxiv_id = paper .get ("arxiv_id" , "" )
6367 category = paper .get ("category" , "" )
6468 if not category :
6569 logger .warning ("Paper has no category, skipping: %s" , paper .get ("title" ))
66- return False
70+ return False , arxiv_id
6771
6872 yaml_path = data_dir / f"papers_{ category } .yaml"
6973 if not yaml_path .exists ():
7074 logger .warning ("Category file not found: %s — skipping" , yaml_path )
71- return False
75+ return False , arxiv_id
7276
7377 # Load existing entries
7478 existing_text = yaml_path .read_text (encoding = "utf-8" )
7579 try :
7680 existing : list [dict ] = yaml .safe_load (existing_text ) or []
7781 except yaml .YAMLError as exc :
7882 logger .error ("YAML parse error in %s: %s" , yaml_path , exc )
79- return False
83+ return False , arxiv_id
8084
8185 # Dedup by paper URL or title
8286 paper_url = paper .get ("links" , {}).get ("paper" , "" )
@@ -86,7 +90,7 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
8690 e_title = e .get ("title" , "" )
8791 if (paper_url and e_url and e_url == paper_url ) or (title and e_title == title ):
8892 logger .info ("Already exists in %s, skipping: %s" , yaml_path .name , title [:60 ])
89- return False
93+ return False , arxiv_id # return arxiv_id so caller marks it processed
9094
9195 new_entry = _build_yaml_entry (paper )
9296
@@ -104,13 +108,21 @@ def append_paper(paper: dict[str, Any], data_dir: Path) -> bool:
104108
105109 yaml_path .write_text (new_text , encoding = "utf-8" )
106110 logger .info ("Written to %s: %s" , yaml_path .name , title [:60 ])
107- return True
111+ return True , ""
108112
109113
def append_papers(papers: list[dict[str, Any]], data_dir: Path) -> tuple[int, list[str]]:
    """
    Append multiple papers, delegating each one to ``append_paper``.

    Returns (written_count, skipped_arxiv_ids):
      - written_count: number of papers that were actually written.
      - skipped_arxiv_ids: the non-empty arxiv_id reported for every paper
        that was not written, in input order, so the caller can add them to
        processed_ids and avoid re-processing them next time.

    NOTE(review): append_paper returns the arxiv_id on every skip path
    (missing category, missing category file, YAML parse error), not only
    for true duplicates, so those papers land in this list as well. Confirm
    that is intended before treating the list as "duplicates only".
    """
    count = 0
    duplicate_ids: list[str] = []
    for paper in papers:
        written, dup_id = append_paper(paper, data_dir)
        if written:
            count += 1
        elif dup_id:
            # A paper without an arxiv_id comes back as "" and cannot be
            # tracked by the caller, so it is left out of the list.
            duplicate_ids.append(dup_id)
    return count, duplicate_ids
0 commit comments