Skip to content

Commit f8104ce

Browse files
author
Ang
committed
fix: remove hardcoded well_known_papers (#19) and add string/stdin input (#36)
- Remove the well_known_papers dict and the shortcut lookup from IdentifierModule.
- 'attention is all you need' is now resolved via the normal arXiv/Crossref search.
- The CLI process command accepts a file path, '-' for stdin, or an inline string.
- Update the examples in the --help text to show the new usage patterns.
- Update tests: replace the well_known_paper_shortcut test and add string/stdin tests.

Closes #19
Closes #36
1 parent 2483f10 commit f8104ce

4 files changed

Lines changed: 59 additions & 47 deletions

File tree

onecite/cli.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,11 @@ def create_parser() -> argparse.ArgumentParser:
4747
epilog="""
4848
Examples:
4949
onecite process references.txt --output-format bibtex
50-
onecite process references.bib --input-type bib --template conference_paper
50+
onecite process references.bib --input-type bib --template conference_paper
5151
onecite process references.txt --interactive --output results.bib
52+
onecite process "10.1038/nature14539"
53+
onecite process "attention is all you need, Vaswani et al., NIPS 2017"
54+
echo "10.1038/nature14539" | onecite process -
5255
"""
5356
)
5457

@@ -67,7 +70,7 @@ def create_parser() -> argparse.ArgumentParser:
6770
)
6871
process_parser.add_argument(
6972
'input_file',
70-
help='Input file containing references'
73+
help='Input file, "-" for stdin, or a reference string (e.g. a DOI or title)'
7174
)
7275
process_parser.add_argument(
7376
'--input-type',
@@ -117,12 +120,13 @@ def process_command(args: "argparse.Namespace") -> int:
117120
Exit code — ``0`` on success, ``1`` on failure.
118121
"""
119122
try:
120-
if not os.path.exists(args.input_file):
121-
print(f"Error: Input file not found: {args.input_file}", file=sys.stderr)
122-
return 1
123-
124-
with open(args.input_file, 'r', encoding='utf-8') as f:
125-
input_content = f.read()
123+
if args.input_file == '-':
124+
input_content = sys.stdin.read()
125+
elif os.path.exists(args.input_file):
126+
with open(args.input_file, 'r', encoding='utf-8') as f:
127+
input_content = f.read()
128+
else:
129+
input_content = args.input_file
126130

127131
def interactive_callback(candidates: List[Dict]) -> int:
128132
if not args.interactive:

onecite/pipeline.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -189,19 +189,6 @@ def __init__(self, use_google_scholar: bool = False):
189189
self.pubmed_base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
190190
self.datacite_base = "https://api.datacite.org"
191191

192-
# Well-known papers that might not have DOIs
193-
self.well_known_papers = {
194-
'attention is all you need': {
195-
'title': 'Attention Is All You Need',
196-
'authors': ['Vaswani, Ashish', 'Shazeer, Noam', 'Parmar, Niki', 'Uszkoreit, Jakob',
197-
'Jones, Llion', 'Gomez, Aidan N', 'Kaiser, Lukasz', 'Polosukhin, Illia'],
198-
'year': '2017',
199-
'journal': 'Advances in Neural Information Processing Systems',
200-
'arxiv_id': '1706.03762',
201-
'url': 'https://arxiv.org/abs/1706.03762',
202-
'type': 'conference'
203-
}
204-
}
205192

206193
def identify(self, raw_entries: List[RawEntry],
207194
interactive_callback: Callable[[List[Dict]], int]) -> List[IdentifiedEntry]:
@@ -1349,21 +1336,6 @@ def _fuzzy_search(self, raw_entry: RawEntry,
13491336
"""Perform fuzzy search"""
13501337
query_string = raw_entry['query_string']
13511338

1352-
# Check if it's a well-known paper first
1353-
query_lower = query_string.lower()
1354-
for key, paper_data in self.well_known_papers.items():
1355-
if key in query_lower or fuzz.ratio(key, query_lower) > 85:
1356-
self.logger.info(f"Entry {raw_entry['id']} matched well-known paper: {paper_data['title']}")
1357-
return {
1358-
'id': raw_entry['id'],
1359-
'raw_text': raw_entry['raw_text'],
1360-
'doi': None,
1361-
'arxiv_id': paper_data.get('arxiv_id'),
1362-
'url': paper_data.get('url'),
1363-
'metadata': paper_data,
1364-
'status': 'identified'
1365-
}
1366-
13671339
# Multi-source query with fallback
13681340
candidates = []
13691341
semantic_results = [] # Initialize to avoid UnboundLocalError

tests/test_cli.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,10 @@ def test_input_type_and_output_format_choices(self):
5858
assert "{txt,bib}" in out
5959
assert "{bibtex,apa,mla}" in out
6060

61-
def test_nonexistent_file(self):
62-
code, _, _ = self._run(["process", "no_such_file.txt"])
63-
assert code != 0
61+
def test_nonexistent_file_treated_as_string_input(self):
62+
"""fix #36: non-file argument is treated as inline reference string, not an error."""
63+
code, out, err = self._run(["process", "no_such_file.txt", "--quiet"])
64+
assert code in (0, 1)
6465

6566
def test_invalid_output_format(self, create_test_file, sample_references):
6667
path = create_test_file(sample_references["doi_only"])
@@ -92,10 +93,35 @@ def _ns(**overrides):
9293

9394
# -- Missing / bad input --------------------------------------------------
9495

95-
def test_missing_input_file(self, capsys):
96-
code = cli.process_command(self._ns(input_file="nope.txt"))
97-
assert code == 1
98-
assert "Input file not found" in capsys.readouterr().err
96+
def test_string_input_passed_directly(self, capsys):
97+
"""fix #36: non-file argument is treated as inline reference content."""
98+
captured = {}
99+
100+
def _fake(*, input_content, **kw):
101+
captured['content'] = input_content
102+
return {"results": ["OK"], "report": {"total": 1, "succeeded": 1, "failed_entries": []}}
103+
104+
with patch("onecite.cli.process_references", side_effect=_fake):
105+
code = cli.process_command(self._ns(input_file="10.1038/nature14539", quiet=True))
106+
107+
assert code == 0
108+
assert captured['content'] == "10.1038/nature14539"
109+
110+
def test_stdin_input(self, capsys, monkeypatch):
111+
"""fix #36: '-' reads from stdin."""
112+
import io
113+
monkeypatch.setattr("sys.stdin", io.StringIO("10.1038/nature14539\n"))
114+
captured = {}
115+
116+
def _fake(*, input_content, **kw):
117+
captured['content'] = input_content
118+
return {"results": ["OK"], "report": {"total": 1, "succeeded": 1, "failed_entries": []}}
119+
120+
with patch("onecite.cli.process_references", side_effect=_fake):
121+
code = cli.process_command(self._ns(input_file="-", quiet=True))
122+
123+
assert code == 0
124+
assert "10.1038/nature14539" in captured['content']
99125

100126
# -- quiet + output file --------------------------------------------------
101127

tests/test_pipeline_unit.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -487,14 +487,24 @@ def test_timeout_returns_empty(self):
487487

488488
class TestIdentifierFuzzySearch:
489489

490-
def test_well_known_paper_shortcut(self):
491-
"""Papers in the built-in lookup table should be resolved instantly."""
490+
def test_no_hardcoded_well_known_papers(self):
491+
"""fix #19: IdentifierModule must not have a well_known_papers shortcut."""
492+
ident = IdentifierModule()
493+
assert not hasattr(ident, 'well_known_papers'), (
494+
"well_known_papers shortcut should have been removed (#19)")
495+
496+
def test_attention_query_goes_through_normal_search(self):
497+
"""fix #19: 'attention is all you need' must go through normal multi-source search."""
492498
ident = IdentifierModule()
493499
entry = {"id": 1, "raw_text": "Attention is all you need",
494500
"query_string": "Attention is all you need"}
495-
r = ident._fuzzy_search(entry, lambda _: -1)
501+
arxiv_result = {"source": "arxiv", "arxiv_id": "1706.03762",
502+
"doi": "10.48550/arXiv.1706.03762",
503+
"title": "Attention Is All You Need",
504+
"url": "https://arxiv.org/abs/1706.03762"}
505+
with patch.object(ident, "_search_crossref", return_value=[arxiv_result]):
506+
r = ident._fuzzy_search(entry, lambda _: -1)
496507
assert r["status"] == "identified"
497-
assert r["arxiv_id"] == "1706.03762"
498508

499509
def test_pmid_shortcut(self):
500510
ident = IdentifierModule()

0 commit comments

Comments
 (0)