Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 23 additions & 19 deletions scripts/add_evidence_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from communitymech.literature_enhanced import EnhancedLiteratureFetcher
from communitymech.literature import LiteratureFetcher

from communitymech.curate.curation_event import record_curation_event
from communitymech.validation.write_validated import (
Expand All @@ -39,10 +39,13 @@ class EvidenceSourceAdder:
"""Add evidence_source to evidence items"""

def __init__(self):
self.fetcher = EnhancedLiteratureFetcher(
cache_dir=".literature_cache",
use_fallback_pdf=False
)
# This script previously imported a sibling EnhancedLiteratureFetcher
# class that was never committed to the repo. The LiteratureFetcher in
# communitymech.literature provides the fetch_pubmed_abstract and
# fetch_paper methods these scripts need, plus a richer DOI fallback
# chain (CrossRef / PMC / OpenAlex / Semantic Scholar / Europe PMC).
# Note: its fetch_paper returns an (abstract, pdf_url) tuple, not the
# dict-style result the old call sites read with .get().
self.fetcher = LiteratureFetcher(cache_dir=".literature_cache")
self.stats = {
'total_evidence': 0,
'already_has_source': 0,
Expand Down Expand Up @@ -78,13 +81,12 @@ def guess_evidence_source(
self,
snippet: str,
abstract: str = None,
title: str = None,
community_origin: str = None
) -> Optional[str]:
"""Guess evidence source using heuristics"""

# Combine text for keyword matching
text = ' '.join(filter(None, [snippet, abstract, title])).lower()
text = ' '.join(filter(None, [snippet, abstract])).lower()

# Check for review first (highest specificity)
if any(kw in text for kw in self.review_keywords):
Expand Down Expand Up @@ -147,18 +149,19 @@ def process_yaml(
reference = ev.get('reference', '')

# Try to fetch abstract for better classification
# Title is not threaded into the classifier — PubMed
# abstracts already embed the title, and CrossRef
# titles for DOIs are available via fetch_doi_metadata()
# if richer classification is wanted later.
abstract = None
title = None
try:
paper = self.fetcher.fetch_paper(reference, download_pdf=False)
abstract = paper.get('abstract')
title = paper.get('title')
except:
abstract, _ = self.fetcher.fetch_paper(reference)
except Exception:
pass

# Guess evidence source
guessed_source = self.guess_evidence_source(
snippet, abstract, title, community_origin
snippet, abstract, community_origin
)

if auto_mode and guessed_source:
Expand Down Expand Up @@ -220,17 +223,18 @@ def process_yaml(
snippet = ev.get('snippet', '')
reference = ev.get('reference', '')

# Title is not threaded into the classifier — PubMed
# abstracts already embed the title, and CrossRef
# titles for DOIs are available via fetch_doi_metadata()
# if richer classification is wanted later.
abstract = None
title = None
try:
paper = self.fetcher.fetch_paper(reference, download_pdf=False)
abstract = paper.get('abstract')
title = paper.get('title')
except:
abstract, _ = self.fetcher.fetch_paper(reference)
except Exception:
pass

guessed_source = self.guess_evidence_source(
snippet, abstract, title, community_origin
snippet, abstract, community_origin
)

if auto_mode and guessed_source:
Expand Down
20 changes: 13 additions & 7 deletions scripts/intelligent_snippet_fixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from communitymech.curate.curation_event import record_curation_event
from communitymech.literature_enhanced import EnhancedLiteratureFetcher
from communitymech.literature import LiteratureFetcher
from communitymech.validation.write_validated import (
ValidationFailedError,
write_validated_community,
Expand Down Expand Up @@ -59,7 +59,12 @@ class IntelligentSnippetFixer:
"""Intelligent snippet fixer with context-aware abstract analysis."""

def __init__(self, verbose: bool = False):
self.fetcher = EnhancedLiteratureFetcher()
# This script previously imported a sibling EnhancedLiteratureFetcher
# class that was never committed. LiteratureFetcher provides the
# fetch_pubmed_abstract and fetch_paper methods used here, and its DOI
# fallback chain (CrossRef / PMC / OpenAlex / Semantic Scholar /
# Europe PMC) subsumes what fetch_abstract_for_doi did. Note that
# fetch_paper returns an (abstract, pdf_url) tuple rather than a dict.
self.fetcher = LiteratureFetcher()
self.verbose = verbose

def extract_relevant_sentences(
Expand Down Expand Up @@ -210,12 +215,13 @@ def suggest_snippets_for_evidence(
if reference.upper().startswith("PMID:"):
pmid = reference.replace("PMID:", "").replace("pmid:", "").strip()
abstract = self.fetcher.fetch_pubmed_abstract(pmid)
elif "doi" in reference.lower() or reference.startswith("10."):
doi = reference.replace("doi:", "").replace("https://doi.org/", "").strip()
abstract = self.fetcher.fetch_abstract_for_doi(doi)
else:
paper = self.fetcher.fetch_paper(reference, download_pdf=False)
abstract = paper.get("abstract")
# fetch_paper auto-detects PMID vs DOI and runs the full
# DOI fallback chain (CrossRef → PMID via DOI lookup → PMC
# full-text → OpenAlex → Semantic Scholar → Europe PMC →
# publisher meta-tag scrape). Returns (abstract, pdf_url);
# we don't need the pdf here.
abstract, _ = self.fetcher.fetch_paper(reference)

if not abstract:
if self.verbose:
Expand Down