diff --git a/scripts/add_evidence_source.py b/scripts/add_evidence_source.py index 0c43bc61..664bf250 100644 --- a/scripts/add_evidence_source.py +++ b/scripts/add_evidence_source.py @@ -26,7 +26,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) -from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.literature import LiteratureFetcher from communitymech.curate.curation_event import record_curation_event from communitymech.validation.write_validated import ( @@ -39,10 +39,13 @@ class EvidenceSourceAdder: """Add evidence_source to evidence items""" def __init__(self): - self.fetcher = EnhancedLiteratureFetcher( - cache_dir=".literature_cache", - use_fallback_pdf=False - ) + # Previously imported a sibling EnhancedLiteratureFetcher class that + # was never committed to the repo; the LiteratureFetcher in + # communitymech.literature exposes the same fetch_pubmed_abstract + + # fetch_paper surface (plus a richer DOI fallback chain through + # CrossRef / PMC / OpenAlex / Semantic Scholar / Europe PMC) which + # is what these scripts actually need. + self.fetcher = LiteratureFetcher(cache_dir=".literature_cache") self.stats = { 'total_evidence': 0, 'already_has_source': 0, @@ -78,13 +81,12 @@ def guess_evidence_source( self, snippet: str, abstract: str = None, - title: str = None, community_origin: str = None ) -> Optional[str]: """Guess evidence source using heuristics""" # Combine text for keyword matching - text = ' '.join(filter(None, [snippet, abstract, title])).lower() + text = ' '.join(filter(None, [snippet, abstract])).lower() # Check for review first (highest specificity) if any(kw in text for kw in self.review_keywords): @@ -147,18 +149,19 @@ def process_yaml( reference = ev.get('reference', '') # Try to fetch abstract for better classification + # Title is not threaded into the classifier — PubMed + # abstracts already embed the title, and CrossRef + # titles for DOIs are available via fetch_doi_metadata() + # if richer classification is wanted later. abstract = None - title = None try: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get('abstract') - title = paper.get('title') - except: + abstract, _ = self.fetcher.fetch_paper(reference) + except Exception: pass # Guess evidence source guessed_source = self.guess_evidence_source( - snippet, abstract, title, community_origin + snippet, abstract, community_origin ) if auto_mode and guessed_source: @@ -220,17 +223,18 @@ def process_yaml( snippet = ev.get('snippet', '') reference = ev.get('reference', '') + # Title is not threaded into the classifier — PubMed + # abstracts already embed the title, and CrossRef + # titles for DOIs are available via fetch_doi_metadata() + # if richer classification is wanted later. abstract = None - title = None try: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get('abstract') - title = paper.get('title') - except: + abstract, _ = self.fetcher.fetch_paper(reference) + except Exception: pass guessed_source = self.guess_evidence_source( - snippet, abstract, title, community_origin + snippet, abstract, community_origin ) if auto_mode and guessed_source: diff --git a/scripts/intelligent_snippet_fixer.py b/scripts/intelligent_snippet_fixer.py index 2733b2dc..a72d006b 100755 --- a/scripts/intelligent_snippet_fixer.py +++ b/scripts/intelligent_snippet_fixer.py @@ -25,7 +25,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from communitymech.curate.curation_event import record_curation_event -from communitymech.literature_enhanced import EnhancedLiteratureFetcher +from communitymech.literature import LiteratureFetcher from communitymech.validation.write_validated import ( ValidationFailedError, write_validated_community, @@ -59,7 +59,12 @@ class IntelligentSnippetFixer: """Intelligent snippet fixer with context-aware abstract analysis.""" def __init__(self, verbose: bool = False): - self.fetcher = EnhancedLiteratureFetcher() + # Previously imported a sibling EnhancedLiteratureFetcher class + # that was never committed; LiteratureFetcher exposes the same + # fetch_pubmed_abstract + fetch_paper surface plus a richer DOI + # fallback chain (CrossRef / PMC / OpenAlex / Semantic Scholar / + # Europe PMC) which subsumes what fetch_abstract_for_doi did. + self.fetcher = LiteratureFetcher() self.verbose = verbose def extract_relevant_sentences( @@ -210,12 +215,13 @@ def suggest_snippets_for_evidence( if reference.upper().startswith("PMID:"): pmid = reference.replace("PMID:", "").replace("pmid:", "").strip() abstract = self.fetcher.fetch_pubmed_abstract(pmid) - elif "doi" in reference.lower() or reference.startswith("10."): - doi = reference.replace("doi:", "").replace("https://doi.org/", "").strip() - abstract = self.fetcher.fetch_abstract_for_doi(doi) else: - paper = self.fetcher.fetch_paper(reference, download_pdf=False) - abstract = paper.get("abstract") + # fetch_paper auto-detects PMID vs DOI and runs the full + # DOI fallback chain (CrossRef → PMID via DOI lookup → PMC + # full-text → OpenAlex → Semantic Scholar → Europe PMC → + # publisher meta-tag scrape). Returns (abstract, pdf_url); + # we don't need the pdf here. + abstract, _ = self.fetcher.fetch_paper(reference) if not abstract: if self.verbose: