feat: retry failed LLM classifications instead of silently dropping them

Zhaoyang-Chu · Zhaoyang-Chu · commit 5aa2529109d5 · 2026-06-01T18:13:20.000Z
- classify_paper returns llm_failed=True when all retries exhausted
- classify_papers returns (relevant, failed_ids) tuple
- Failed papers NOT added to processed_ids so they get retried
- State tracks retry_counts per arxiv_id; gives up after 3 failures
- run_daily fetches retry papers at start of each run
- 2605.18747 (Code as Agent Harness) was already recovered manually
diff --git a/automation/classifier/llm.py b/automation/classifier/llm.py
@@ -155,12 +155,13 @@ def classify_paper(
 
     logger.error("All %d attempts failed for paper: %s", retries, paper.get("arxiv_id"))
     return {
-        "relevant":   False,
-        "reason":     "LLM classification failed after retries",
-        "category":   None,
-        "tags":       [],
-        "summary":    "",
-        "venue_hint": "",
+        "relevant":    False,
+        "reason":      "LLM classification failed after retries",
+        "llm_failed":  True,   # distinguishes LLM error from "genuinely not relevant"
+        "category":    None,
+        "tags":        [],
+        "summary":     "",
+        "venue_hint":  "",
     }
 
 
@@ -171,21 +172,33 @@ def classify_papers(
     model: str | None = None,
     temperature: float = 0.1,
     learned_rules: str = "",
-) -> list[dict[str, Any]]:
+) -> tuple[list[dict[str, Any]], list[str]]:
     """
     Classify a list of papers. Mutates each paper in place, adding:
       - paper["category"]  : str | None
       - paper["tags"]      : list[str]
       - paper["summary"]   : str
       - paper["relevant"]  : bool
-    Returns only the relevant papers.
+
+    Returns (relevant_papers, failed_arxiv_ids):
+      - relevant_papers: papers the LLM confirmed are relevant
+      - failed_arxiv_ids: papers where LLM call failed (should be retried later)
     """
     relevant: list[dict[str, Any]] = []
+    failed_ids: list[str] = []
 
     for i, paper in enumerate(papers, 1):
         logger.info("[%d/%d] Classifying: %s", i, len(papers), paper["title"][:80])
         result = classify_paper(paper, categories, tags, model=model, temperature=temperature, learned_rules=learned_rules)
 
+        if result.get("llm_failed"):
+            aid = paper.get("arxiv_id", "")
+            if aid:
+                failed_ids.append(aid)
+            logger.warning("  → LLM FAILED, will retry next run: %s", paper["title"][:60])
+            time.sleep(0.5)
+            continue
+
         paper["relevant"]   = result.get("relevant", False)
         paper["category"]   = result.get("category")
         paper["tags"]       = result.get("tags", [])
@@ -205,5 +218,6 @@ def classify_papers(
         # Small delay to avoid hammering the proxy
         time.sleep(0.5)
 
-    logger.info("Classified %d papers → %d relevant", len(papers), len(relevant))
-    return relevant
+    logger.info("Classified %d papers → %d relevant, %d failed",
+                len(papers), len(relevant), len(failed_ids))
+    return relevant, failed_ids
diff --git a/automation/main.py b/automation/main.py
@@ -141,6 +141,19 @@ def run_daily() -> None:
             deduped.append(p)
     new_papers = deduped
 
+    # ── 3b. Add papers pending retry (LLM failed last time) ─────────────────
+    retry_ids = state_mgr.get_retry_ids(state)
+    if retry_ids:
+        logger.info("=== Retrying %d previously failed classification(s) ===", len(retry_ids))
+        already = {p["arxiv_id"] for p in new_papers}
+        from automation.crawler.arxiv import fetch_single_paper
+        for aid in retry_ids:
+            if aid not in already:
+                p = fetch_single_paper(aid)
+                if p:
+                    new_papers.append(p)
+                    already.add(aid)
+
     logger.info("After dedup: %d new papers to process", len(new_papers))
 
     if not new_papers:
@@ -156,18 +169,24 @@ def run_daily() -> None:
     # ── 5. Classify ─────────────────────────────────────────────────────────
     logger.info("=== Step 4: Classifying with LLM ===")
     learned_rules = state_mgr.maybe_refresh_learned_rules(state)
-    relevant = classify_papers(
+    relevant, failed_ids = classify_papers(
         new_papers,
         categories=cfg["categories"],
         tags=cfg["tags"],
         temperature=cfg.get("llm", {}).get("temperature", 0.1),
         learned_rules=learned_rules,
     )
-    logger.info("Relevant papers: %d / %d", len(relevant), len(new_papers))
+    logger.info("Relevant papers: %d / %d (failed: %d)", len(relevant), len(new_papers), len(failed_ids))
 
-    # Mark as processed only after classification completes successfully.
-    # This way a mid-run crash won't permanently lose unclassified papers.
-    state_mgr.mark_processed(state, [p["arxiv_id"] for p in new_papers])
+    # Mark successfully classified papers as processed (failed ones stay out)
+    succeeded_ids = [p["arxiv_id"] for p in new_papers if p.get("arxiv_id") not in failed_ids]
+    state_mgr.mark_processed(state, succeeded_ids)
+
+    # Track failures — give up after MAX_RETRIES, mark as processed to stop retrying
+    if failed_ids:
+        _, give_up_ids = state_mgr.add_failed_classifications(state, failed_ids)
+        if give_up_ids:
+            state_mgr.mark_processed(state, give_up_ids)
     state_mgr.save(state)
 
     if not relevant:
diff --git a/automation/state_manager.py b/automation/state_manager.py
@@ -41,6 +41,7 @@
     "reject_feedback": [],     # [{arxiv_id, title, reason}, ...] — curator reject reasons
     "learned_rules":   "",     # LLM-synthesised rules injected into classifier prompt
     "rules_last_updated": "",  # ISO date when learned_rules was last generated
+    "retry_counts":    {},     # {arxiv_id: int} — LLM failure retry count
 }
 
 
@@ -91,6 +92,40 @@ def update_pending_issues(state: dict[str, Any], updated: list[dict[str, Any]])
     state["pending_issues"] = updated
 
 
+_MAX_RETRIES = 3  # an arxiv_id is given up on once its failure count reaches this
+
+
+def add_failed_classifications(state: dict[str, Any], arxiv_ids: list[str]) -> tuple[list[str], list[str]]:
+    """
+    Record one LLM classification failure per distinct ID in ``arxiv_ids``.
+    Counts are kept in ``state["retry_counts"]`` (created if missing).
+    Returns (retry_later, give_up): IDs still below ``_MAX_RETRIES`` failures, and
+    IDs that reached it (the caller marks those as processed to stop retrying).
+    """
+    counts: dict[str, int] = state.setdefault("retry_counts", {})
+    retry_later: list[str] = []
+    give_up: list[str] = []
+    # dict.fromkeys() drops duplicates (keeping order) so an ID repeated within
+    # one batch costs a single attempt instead of several.
+    for aid in dict.fromkeys(arxiv_ids):
+        failures = counts[aid] = counts.get(aid, 0) + 1
+        if failures >= _MAX_RETRIES:
+            logger.warning("Giving up on %s after %d failed attempts", aid, failures)
+            give_up.append(aid)
+        else:
+            logger.info("Will retry %s (attempt %d/%d)", aid, failures, _MAX_RETRIES)
+            retry_later.append(aid)
+    return retry_later, give_up
+
+
+def get_retry_ids(state: dict[str, Any]) -> list[str]:
+    """List IDs with recorded LLM failures that are still eligible for a retry."""
+    done = {*state.get("processed_ids", []), *state.get("rejected_ids", [])}
+    pending = state.get("retry_counts", {}).items()
+    # Skip IDs that exhausted their attempts or were processed/rejected meanwhile.
+    return [aid for aid, failures in pending if failures < _MAX_RETRIES and aid not in done]
+
+
 def add_reject_feedback(state: dict[str, Any], items: list[dict[str, Any]]) -> None:
     """Append curator reject reasons for classifier learning."""
     state.setdefault("reject_feedback", []).extend(items)