Skip to content

Commit 6f6255f

Browse files
realmarcin and claude authored
Add pytest coverage for new LiteratureFetcher fallback methods (#70)
* Add pytest coverage for new LiteratureFetcher fallback methods

  Cover the four new fallback methods added in earlier PRs, plus the PMC ID-converter helper and the shared cache-path helper. Tests mock requests.Session.get on the fetcher's own session so no network is required.

  Coverage (19 tests, all passing):
  - fetch_openalex_abstract: inverted-index reconstruction, disk cache hit, no-abstract record, request exception, case-insensitive DOI prefix stripping.
  - fetch_semantic_scholar_abstract: successful abstract field, disk cache hit, null abstract field.
  - fetch_europepmc_abstract: first-result extraction, empty-results None return, DOI: query param shape.
  - fetch_publisher_meta_abstract: Springer twitter:description prefix-strip, fallback to description meta, short-description filter (>80 chars to skip nav text), request exception.
  - fetch_pmcid_for_doi: numeric id strip from "PMC..." record, None for status:error records, None for empty records list.
  - _abstract_cache_path: forward-slash to underscore safe filename encoding.

  Suite total: 102 -> 121 passing.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Address Copilot review on PR #70

  Wrap the two 101-char `with patch.object(...) as mock_get:` lines so the file passes the black/ruff 100-char line-length configured in pyproject.toml. Ran `uv run black tests/test_literature_fetcher.py` to apply the standard wrapping; the 19 tests still pass.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9dcb4ce commit 6f6255f

1 file changed

Lines changed: 250 additions & 0 deletions

File tree

tests/test_literature_fetcher.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
"""Tests for the new abstract-source fallbacks added to LiteratureFetcher.

Covers:
- fetch_openalex_abstract (inverted-index reconstruction + cache)
- fetch_semantic_scholar_abstract
- fetch_europepmc_abstract
- fetch_publisher_meta_abstract (DOI page meta-tag scrape)
- fetch_pmcid_for_doi (ID Converter API success + not-found)
- _abstract_cache_path (DOI-to-filename encoding)

The tests mock requests.Session.get on the fetcher's session so no
network is required.
"""
13+
14+
from unittest.mock import MagicMock, patch
15+
16+
import pytest
17+
import requests
18+
19+
from communitymech.literature import LiteratureFetcher
20+
21+
22+
@pytest.fixture
def fetcher(tmp_path):
    """Return a LiteratureFetcher whose cache_dir is this test's ``tmp_path``."""
    return LiteratureFetcher(cache_dir=str(tmp_path))
26+
27+
28+
def _mock_json_response(payload):
    """Build a stand-in HTTP response whose ``.json()`` returns *payload*.

    ``raise_for_status()`` is stubbed to return ``None`` so the response
    reads as a successful request.
    """
    response = MagicMock()
    response.json.return_value = payload
    response.raise_for_status.return_value = None
    return response
33+
34+
35+
def _mock_text_response(text):
    """Build a stand-in HTTP response whose ``.text`` attribute is *text*.

    ``raise_for_status()`` is stubbed to return ``None`` so the response
    reads as a successful request.
    """
    response = MagicMock()
    response.text = text
    response.raise_for_status.return_value = None
    return response
40+
41+
42+
# ---------------------------------------------------------------------------
43+
# fetch_openalex_abstract
44+
# ---------------------------------------------------------------------------
45+
46+
47+
def test_openalex_reconstructs_from_inverted_index(fetcher):
    """OpenAlex returns an inverted index; reconstruct in position order."""
    # Keys are deliberately NOT listed in position order, so an implementation
    # that merely joins the dict keys cannot pass by accident.
    payload = {
        "abstract_inverted_index": {
            "Rhodoferax": [4],
            "is": [1],
            "Acetate": [0],
            "by": [3],
            "oxidized": [2],
        }
    }
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_openalex_abstract("10.1234/example")
    assert result == "Acetate is oxidized by Rhodoferax"
61+
62+
63+
def test_openalex_cache_hit_skips_http(fetcher):
    """A pre-populated disk cache entry is returned; no HTTP request is issued."""
    cache_file = fetcher._abstract_cache_path("openalex", "10.1234/cached")
    # Explicit encoding keeps the fixture independent of the platform default.
    cache_file.write_text("cached abstract text", encoding="utf-8")

    with patch.object(fetcher.session, "get") as mock_get:
        result = fetcher.fetch_openalex_abstract("10.1234/cached")
    mock_get.assert_not_called()
    assert result == "cached abstract text"
72+
73+
74+
def test_openalex_no_abstract_returns_none(fetcher):
    """Records without abstract_inverted_index return None without caching."""
    payload = {"title": "Paper without abstract"}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_openalex_abstract("10.1234/no-abstract")
    assert result is None
    # A miss must not leave a cache file behind.
    assert not fetcher._abstract_cache_path("openalex", "10.1234/no-abstract").exists()
81+
82+
83+
def test_openalex_handles_request_exception(fetcher):
    """Network errors return None rather than raising."""
    with patch.object(
        fetcher.session, "get", side_effect=requests.exceptions.ConnectionError("boom")
    ):
        result = fetcher.fetch_openalex_abstract("10.1234/network-error")
    assert result is None
90+
91+
92+
@pytest.mark.parametrize("prefix", ["doi:", "DOI:"])
def test_openalex_strips_doi_prefix_case_insensitively(fetcher, prefix):
    """Both "doi:" and "DOI:" prefixes are stripped before hitting the API."""
    payload = {"abstract_inverted_index": {"abstract": [0]}}
    with patch.object(
        fetcher.session, "get", return_value=_mock_json_response(payload)
    ) as mock_get:
        fetcher.fetch_openalex_abstract(f"{prefix}10.1234/example")
    # The URL is the first positional argument passed to session.get.
    called_url = mock_get.call_args[0][0]
    assert called_url == "https://api.openalex.org/works/doi:10.1234/example"
101+
102+
103+
# ---------------------------------------------------------------------------
104+
# fetch_semantic_scholar_abstract
105+
# ---------------------------------------------------------------------------
106+
107+
108+
def test_semantic_scholar_returns_abstract_field(fetcher):
    """A populated ``abstract`` field in the response is returned as-is."""
    payload = {"abstract": "The coculture detoxifies furfural."}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_semantic_scholar_abstract("10.1234/example")
    assert result == "The coculture detoxifies furfural."
113+
114+
115+
def test_semantic_scholar_cache_hit_skips_http(fetcher):
    """A pre-populated disk cache entry is returned; no HTTP request is issued."""
    cache_file = fetcher._abstract_cache_path("semanticscholar", "10.1234/cached")
    # Explicit encoding keeps the fixture independent of the platform default.
    cache_file.write_text("cached", encoding="utf-8")

    with patch.object(fetcher.session, "get") as mock_get:
        result = fetcher.fetch_semantic_scholar_abstract("10.1234/cached")
    mock_get.assert_not_called()
    assert result == "cached"
123+
124+
125+
def test_semantic_scholar_missing_abstract_returns_none(fetcher):
    """A response whose ``abstract`` field is null yields None."""
    payload = {"abstract": None}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_semantic_scholar_abstract("10.1234/no-abstract")
    assert result is None
130+
131+
132+
# ---------------------------------------------------------------------------
133+
# fetch_europepmc_abstract
134+
# ---------------------------------------------------------------------------
135+
136+
137+
def test_europepmc_returns_abstract_from_first_result(fetcher):
    """The ``abstractText`` of the first entry in ``resultList.result`` is returned."""
    payload = {"resultList": {"result": [{"abstractText": "Wet sedge tundra Fe(III) reduction."}]}}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_europepmc_abstract("10.1234/example")
    assert result == "Wet sedge tundra Fe(III) reduction."
142+
143+
144+
def test_europepmc_empty_result_list_returns_none(fetcher):
    """An empty ``resultList.result`` list yields None."""
    payload = {"resultList": {"result": []}}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_europepmc_abstract("10.1234/missing")
    assert result is None
149+
150+
151+
def test_europepmc_passes_doi_query_param(fetcher):
    """The DOI lookup is encoded as a DOI: query, format=json."""
    payload = {"resultList": {"result": []}}
    with patch.object(
        fetcher.session, "get", return_value=_mock_json_response(payload)
    ) as mock_get:
        fetcher.fetch_europepmc_abstract("10.1234/example")
    # The query parameters are read from the ``params`` keyword of session.get.
    params = mock_get.call_args.kwargs["params"]
    assert params["query"] == "DOI:10.1234/example"
    assert params["format"] == "json"
161+
162+
163+
# ---------------------------------------------------------------------------
164+
# fetch_publisher_meta_abstract (DOI page scrape)
165+
# ---------------------------------------------------------------------------
166+
167+
168+
def test_publisher_meta_extracts_twitter_description(fetcher):
    """Springer style: twitter:description carries 'Journal - Abstract text...'."""
    html = (
        "<html><head>"
        '<meta name="twitter:description" content="Current Microbiology - '
        "Acidobacterium is proposed as a new genus for the acidophilic, "
        'chemoorganotrophic bacteria containing menaquinone.">'
        "</head></html>"
    )
    with patch.object(fetcher.session, "get", return_value=_mock_text_response(html)):
        result = fetcher.fetch_publisher_meta_abstract("10.1234/springer")
    assert result is not None
    # The "Journal Name - " prefix is stripped
    assert result.startswith("Acidobacterium is proposed as a new genus")
182+
183+
184+
def test_publisher_meta_falls_back_to_description(fetcher):
    """If twitter:description is missing, fall back to description / og:description."""
    # Repeated so the content is long enough not to be treated as nav text.
    long_desc = "A long meaningful description " * 5
    html = f'<html><head><meta name="description" content="{long_desc}"></head></html>'
    with patch.object(fetcher.session, "get", return_value=_mock_text_response(html)):
        result = fetcher.fetch_publisher_meta_abstract("10.1234/fallback")
    assert result is not None
    assert "long meaningful description" in result
192+
193+
194+
def test_publisher_meta_skips_short_descriptions(fetcher):
    """Navigation-text descriptions under 80 chars are rejected as abstracts."""
    html = '<html><head><meta name="description" content="Short."></head></html>'
    with patch.object(fetcher.session, "get", return_value=_mock_text_response(html)):
        result = fetcher.fetch_publisher_meta_abstract("10.1234/nav-text")
    assert result is None
200+
201+
202+
def test_publisher_meta_handles_request_exception(fetcher):
    """An HTTPError raised by the session yields None rather than propagating."""
    with patch.object(fetcher.session, "get", side_effect=requests.exceptions.HTTPError("403")):
        result = fetcher.fetch_publisher_meta_abstract("10.1234/blocked")
    assert result is None
206+
207+
208+
# ---------------------------------------------------------------------------
209+
# fetch_pmcid_for_doi
210+
# ---------------------------------------------------------------------------
211+
212+
213+
def test_pmcid_for_doi_returns_numeric_id(fetcher):
    """Successful ID Converter records strip the PMC prefix."""
    payload = {"records": [{"doi": "10.1234/x", "pmcid": "PMC123456", "pmid": 99999}]}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_pmcid_for_doi("10.1234/x")
    assert result == "123456"
219+
220+
221+
def test_pmcid_for_doi_returns_none_when_not_in_pmc(fetcher):
    """Records with status: error mean the DOI isn't in PMC."""
    payload = {
        "records": [
            {"doi": "10.1234/x", "status": "error", "errmsg": "Identifier not found in PMC"}
        ]
    }
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_pmcid_for_doi("10.1234/x")
    assert result is None
231+
232+
233+
def test_pmcid_for_doi_returns_none_for_empty_records(fetcher):
    """Defensive: API returning no records also yields None."""
    payload = {"records": []}
    with patch.object(fetcher.session, "get", return_value=_mock_json_response(payload)):
        result = fetcher.fetch_pmcid_for_doi("10.1234/x")
    assert result is None
239+
240+
241+
# ---------------------------------------------------------------------------
242+
# _abstract_cache_path helper
243+
# ---------------------------------------------------------------------------
244+
245+
246+
def test_abstract_cache_path_encodes_doi_safely(fetcher):
    """Forward slashes in the DOI are replaced with underscores in the filename."""
    path = fetcher._abstract_cache_path("openalex", "10.1234/some/path")
    assert path.name == "openalex_10.1234_some_path.txt"
    # The file sits directly inside the fetcher's cache directory.
    assert path.parent == fetcher.cache_dir

0 commit comments

Comments
 (0)