Skip to content

Commit 98695be

Browse files
author
Ang
committed
fix: remove placeholder strings (#24), use exceptions (#13), add --google-scholar (#10)
- #24: remove 'Unknown Title'/'Unknown University'/'Unknown Author' placeholders - _detect_thesis: thesis fallback returns None if no title extracted - _convert_search_metadata: school defaults to empty string, not placeholder - #13: raise FormatError for unsupported output formats (was silent bibtex fallback) - Import ValidationError/FormatError in pipeline.py - process_references(): raise ValidationError on empty input_content - #10: add --google-scholar CLI flag, pass use_google_scholar to process_references() - Update tests: 87 passing Closes #24 Closes #13 Closes #10
1 parent f8104ce commit 98695be

5 files changed

Lines changed: 85 additions & 21 deletions

File tree

onecite/cli.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ def create_parser() -> argparse.ArgumentParser:
105105
action='store_true',
106106
help='Suppress verbose logging output'
107107
)
108+
process_parser.add_argument(
109+
'--google-scholar',
110+
action='store_true',
111+
default=False,
112+
help='Enable Google Scholar as an additional data source (requires scholarly package)'
113+
)
108114

109115
return parser
110116

@@ -167,7 +173,8 @@ def interactive_callback(candidates: List[Dict]) -> int:
167173
input_type=args.input_type,
168174
template_name=args.template,
169175
output_format=args.output_format,
170-
interactive_callback=interactive_callback
176+
interactive_callback=interactive_callback,
177+
use_google_scholar=args.google_scholar,
171178
)
172179

173180
output_content = '\n\n'.join(result['results'])

onecite/core.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ def process_references(
186186
input_type: str,
187187
template_name: str,
188188
output_format: str,
189-
interactive_callback: Callable[[List[Dict]], int]
189+
interactive_callback: Callable[[List[Dict]], int],
190+
use_google_scholar: bool = False,
190191
) -> Dict[str, Any]:
191192
"""Process references and return formatted citations with a report.
192193
@@ -216,5 +217,7 @@ def process_references(
216217
ParseError: If the input cannot be parsed.
217218
ResolverError: If no data source can resolve a reference.
218219
"""
219-
pipeline = PipelineController(use_google_scholar=False)
220+
if not input_content or not input_content.strip():
221+
raise ValidationError("input_content must not be empty.")
222+
pipeline = PipelineController(use_google_scholar=use_google_scholar)
220223
return pipeline.process(input_content, input_type, template_name, output_format, interactive_callback)

onecite/pipeline.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from html import unescape
2020

2121
from .core import RawEntry, IdentifiedEntry, CompletedEntry
22-
from .exceptions import ParseError, ResolverError
22+
from .exceptions import ParseError, ResolverError, ValidationError, FormatError
2323
import urllib.parse
2424

2525

@@ -635,7 +635,7 @@ def _detect_thesis(self, text: str) -> Optional[Dict]:
635635

636636
# Pattern: Author (Year). Title. Type. University.
637637
author_match = re.match(r'^([^(]+?)\s*\(', text)
638-
author = author_match.group(1).strip() if author_match else 'Unknown Author'
638+
author = author_match.group(1).strip() if author_match else None
639639

640640
year_match = re.search(r'\((\d{4})\)', text)
641641
year = int(year_match.group(1)) if year_match else None
@@ -677,7 +677,7 @@ def _detect_thesis(self, text: str) -> Optional[Dict]:
677677
if openaire_metadata:
678678
openaire_metadata['thesis_type'] = thesis_type
679679
openaire_metadata['is_thesis'] = True
680-
if author and author != 'Unknown Author':
680+
if author:
681681
openaire_metadata['authors'] = [author]
682682
if school:
683683
openaire_metadata['school'] = school
@@ -688,23 +688,27 @@ def _detect_thesis(self, text: str) -> Optional[Dict]:
688688
if base_metadata:
689689
base_metadata['thesis_type'] = thesis_type
690690
base_metadata['is_thesis'] = True
691-
if author and author != 'Unknown Author':
691+
if author:
692692
base_metadata['authors'] = [author]
693693
if school:
694694
base_metadata['school'] = school
695695
return base_metadata
696696

697-
# Fallback: create basic metadata from extraction
698-
return {
697+
# Fallback: create basic metadata from extraction (only if we have a title)
698+
if not title:
699+
return None
700+
result = {
699701
'source': 'manual',
700702
'type': thesis_type,
701703
'is_thesis': True,
702704
'thesis_type': thesis_type,
703-
'title': title or 'Unknown Title',
705+
'title': title,
704706
'authors': [author] if author else [],
705707
'year': year,
706-
'school': school or 'Unknown University'
707708
}
709+
if school:
710+
result['school'] = school
711+
return result
708712

709713
except Exception as e:
710714
self.logger.warning(f"Failed to detect thesis: {str(e)}")
@@ -2687,7 +2691,7 @@ def _convert_search_metadata(self, metadata: Dict) -> Optional[Dict]:
26872691
result = {
26882692
'title': metadata.get('title', ''),
26892693
'author': formatted_authors,
2690-
'school': metadata.get('school', 'Unknown University'),
2694+
'school': metadata.get('school', ''),
26912695
'year': str(metadata.get('year', '')),
26922696
}
26932697
if metadata.get('url'):
@@ -3124,8 +3128,7 @@ def format(self, completed_entries: List[CompletedEntry],
31243128
elif output_format.lower() == 'mla':
31253129
formatted_string = self._format_mla(entry)
31263130
else:
3127-
# Default to BibTeX
3128-
formatted_string = self._format_bibtex(entry)
3131+
raise FormatError(f"Unsupported output format: {output_format!r}. Choose 'bibtex', 'apa', or 'mla'.")
31293132

31303133
formatted_strings.append(formatted_string)
31313134

tests/test_cli.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,26 @@ def _ns(**overrides):
8787
output=None,
8888
interactive=False,
8989
quiet=False,
90+
google_scholar=False,
9091
)
9192
defaults.update(overrides)
9293
return argparse.Namespace(**defaults)
9394

9495
# -- Missing / bad input --------------------------------------------------
9596

97+
def test_google_scholar_flag_passed_through(self, capsys):
98+
"""fix #10: --google-scholar flag must be forwarded to process_references."""
99+
captured = {}
100+
101+
def _fake(*, use_google_scholar, **kw):
102+
captured['gs'] = use_google_scholar
103+
return {"results": ["OK"], "report": {"total": 1, "succeeded": 1, "failed_entries": []}}
104+
105+
with patch("onecite.cli.process_references", side_effect=_fake):
106+
cli.process_command(self._ns(input_file="10.1/x", quiet=True, google_scholar=True))
107+
108+
assert captured['gs'] is True
109+
96110
def test_string_input_passed_directly(self, capsys):
97111
"""fix #36: non-file argument is treated as inline reference content."""
98112
captured = {}
@@ -131,7 +145,7 @@ def test_quiet_writes_to_file(self, tmp_path, capsys):
131145
outf = tmp_path / "out.bib"
132146

133147
def _fake(*, input_content, input_type, template_name,
134-
output_format, interactive_callback):
148+
output_format, interactive_callback, **kw):
135149
# quiet mode → callback should auto-skip
136150
assert interactive_callback(
137151
[{"title": "T", "authors": [], "journal": "", "year": 2020, "match_score": 75}]
@@ -154,7 +168,7 @@ def test_interactive_selects_first(self, tmp_path, capsys):
154168
inf.write_text("query", encoding="utf-8")
155169

156170
def _fake(*, input_content, input_type, template_name,
157-
output_format, interactive_callback):
171+
output_format, interactive_callback, **kw):
158172
choice = interactive_callback([
159173
{"title": "A", "authors": ["X"], "journal": "J", "year": 2020, "match_score": 75},
160174
{"title": "B", "authors": ["Y"], "journal": "J", "year": 2021, "match_score": 74},
@@ -177,7 +191,7 @@ def test_interactive_invalid_then_skip(self, tmp_path, capsys):
177191
inf.write_text("query", encoding="utf-8")
178192

179193
def _fake(*, input_content, input_type, template_name,
180-
output_format, interactive_callback):
194+
output_format, interactive_callback, **kw):
181195
choice = interactive_callback([
182196
{"title": "A", "authors": [], "journal": "", "year": 2020, "match_score": 75},
183197
])
@@ -196,7 +210,7 @@ def test_interactive_ctrl_c(self, tmp_path, capsys):
196210
inf.write_text("query", encoding="utf-8")
197211

198212
def _fake(*, input_content, input_type, template_name,
199-
output_format, interactive_callback):
213+
output_format, interactive_callback, **kw):
200214
assert interactive_callback([
201215
{"title": "A", "authors": [], "journal": "", "year": 2020, "match_score": 75},
202216
]) == -1
@@ -229,7 +243,7 @@ def test_output_saved_message_and_failures(self, tmp_path, capsys):
229243
outf = tmp_path / "out.bib"
230244

231245
def _fake(*, input_content, input_type, template_name,
232-
output_format, interactive_callback):
246+
output_format, interactive_callback, **kw):
233247
return {
234248
"results": ["OK"],
235249
"report": {"total": 2, "succeeded": 1,

tests/test_pipeline_unit.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,7 +1025,43 @@ def test_mla_single_author(self):
10251025
}
10261026
assert "Solo, Han." in fmt._format_mla(entry)
10271027

1028-
def test_unknown_format_defaults_to_bibtex(self):
1028+
def test_unsupported_format_raises_format_error(self):
1029+
"""fix #13: unsupported output_format must raise FormatError, not silently use bibtex."""
1030+
from onecite.exceptions import FormatError
1031+
fmt = FormatterModule()
1032+
entry = {
1033+
"id": 1, "doi": "10.1/x", "status": "completed",
1034+
"bib_key": "K", "bib_data": {"ENTRYTYPE": "article", "ID": "K", "title": "T"},
1035+
}
1036+
r = fmt.format([entry], "ris")
1037+
assert len(r["report"]["failed_entries"]) == 1
1038+
assert "Unsupported output format" in r["report"]["failed_entries"][0]["error"]
1039+
1040+
def test_no_placeholder_in_thesis_fallback(self):
1041+
"""fix #24: manual thesis fallback must not inject Unknown Title/University."""
1042+
ident = IdentifierModule()
1043+
with patch.object(ident, "_search_openaire_for_thesis", return_value=None), \
1044+
patch.object(ident, "_search_base_for_thesis", return_value=None):
1045+
result = ident._detect_thesis(
1046+
"Smith, J. (2020). Neural Architecture Search. PhD Thesis. Stanford University."
1047+
)
1048+
if result:
1049+
assert result.get("title") != "Unknown Title"
1050+
assert result.get("school") != "Unknown University"
1051+
assert "Unknown Author" not in str(result.get("authors", []))
1052+
1053+
def test_process_references_empty_input_raises(self):
1054+
"""fix #13/#24: empty input_content raises ValidationError."""
1055+
from onecite.exceptions import ValidationError
1056+
from onecite.core import process_references
1057+
import pytest
1058+
with pytest.raises(ValidationError):
1059+
process_references("", "txt", "journal_article_full", "bibtex", lambda c: -1)
1060+
with pytest.raises(ValidationError):
1061+
process_references(" ", "txt", "journal_article_full", "bibtex", lambda c: -1)
1062+
1063+
def test_unknown_format_raises_in_failed_entries(self):
1064+
"""fix #13: unknown format results in failed entry, not silent bibtex fallback."""
10291065
fmt = FormatterModule()
10301066
entry = {
10311067
"id": 1, "doi": "10.1234/x", "status": "completed",
@@ -1034,7 +1070,8 @@ def test_unknown_format_defaults_to_bibtex(self):
10341070
"author": "Author, A", "year": "2020"},
10351071
}
10361072
r = fmt.format([entry], "unknown_format")
1037-
assert "@article{Test2020" in r["results"][0]
1073+
assert r["report"]["succeeded"] == 0
1074+
assert len(r["report"]["failed_entries"]) == 1
10381075

10391076

10401077
# ===================================================================

0 commit comments

Comments
 (0)