Skip to content

Commit e22863d

Browse files
Fix Grobid/TEI generation (#669)
Squashed commit messages:

* update
* test
* notify on grobid version mismatch
* test
* update
* show running version
* update
* update
* reset
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* revisions
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* add note/explanation

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 8259f63 commit e22863d

3 files changed

Lines changed: 108 additions & 99 deletions

File tree

colrev/env/grobid_service.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ class GrobidService:
1515
"""An environment service for machine readability/annotation (PDF to TEI conversion)"""
1616

1717
GROBID_URL = "http://localhost:8070"
18-
GROBID_IMAGE = "lfoppiano/grobid:latest-crf"
18+
# Important: do not use :latest versions or :SNAPSHOT versions
19+
# as they may change without notice
20+
GROBID_IMAGE = "lfoppiano/grobid:0.8.2"
1921

2022
def __init__(self) -> None:
2123
colrev.env.docker_manager.DockerManager.build_docker_image(
@@ -24,6 +26,17 @@ def __init__(self) -> None:
2426
self.start()
2527
self.check_grobid_availability()
2628

29+
def _ensure_correct_version(self) -> None:
    """Check that the running GROBID service matches the pinned image version.

    Requests ``/api/version`` from the service at ``GROBID_URL`` and compares
    the ``version`` field of the JSON response with the tag of
    ``GROBID_IMAGE`` (the text after the last ``:``).

    Raises:
        RuntimeError: If the running version differs from the expected one.
    """
    # Use the last ":" so that image references containing a registry port
    # (e.g. "host:5000/grobid:0.8.2") still yield the tag.
    expected_version = self.GROBID_IMAGE.rsplit(":", 1)[-1]

    response = requests.get(self.GROBID_URL + "/api/version", timeout=10)
    running_version = response.json()["version"]
    if running_version == expected_version:
        return

    logging.warning(
        "GROBID version mismatch. Expected: %s, currently running: %s",
        expected_version,
        running_version,
    )
    # RuntimeError is a subclass of Exception, so existing handlers that
    # catch Exception keep working, but the error now carries a message.
    raise RuntimeError(
        "GROBID version mismatch. "
        f"Expected: {expected_version}, currently running: {running_version}"
    )
39+
2740
def check_grobid_availability(self, *, wait: bool = True) -> bool:
2841
"""Check whether the GROBID service is available"""
2942
i = 0
@@ -33,6 +46,9 @@ def check_grobid_availability(self, *, wait: bool = True) -> bool:
3346
try:
3447
ret = requests.get(self.GROBID_URL + "/api/isalive", timeout=30)
3548
if ret.text == "true":
49+
# When GROBID is running, it may not be the same version as expected
50+
# in self.GROBID_IMAGE, possibly leading to failing tests.
51+
self._ensure_correct_version()
3652
return True
3753
except requests.exceptions.ConnectionError:
3854
pass

tests/1_env/tei_test.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,27 +44,36 @@ def test_tei_creation(script_loc) -> None: # type: ignore
4444
tei_file = script_loc.parent.joinpath("data/WagnerLukyanenkoParEtAl2022.tei.xml")
4545
pdf_path = script_loc.parent.joinpath("data/WagnerLukyanenkoParEtAl2022.pdf")
4646

47-
tei_file.unlink(missing_ok=True)
47+
tmp_tei_file = tei_file.with_name(tei_file.stem + "_tmp.tei.xml")
48+
if tmp_tei_file.exists():
49+
tmp_tei_file.unlink(missing_ok=True)
50+
tei_file.rename(tmp_tei_file)
4851

49-
colrev.env.tei_parser.TEIParser(pdf_path=pdf_path, tei_path=tei_file)
52+
try:
5053

51-
with open(tei_file) as file:
52-
tei_content = file.read()
54+
colrev.env.tei_parser.TEIParser(pdf_path=pdf_path, tei_path=tei_file)
5355

54-
tei_content = re.sub(
55-
r'(ident="GROBID" when=")[^"]+(">)', r"\g<1>NA\g<2>", tei_content
56-
)
56+
with open(tei_file) as file:
57+
tei_content = file.read()
58+
59+
tei_content = re.sub(
60+
r'(ident="GROBID" when=")[^"]+(">)', r"\g<1>NA\g<2>", tei_content
61+
)
5762

58-
with open(tei_file, "w") as file:
59-
file.write(tei_content)
63+
with open(tei_file, "w") as file:
64+
file.write(tei_content)
65+
except Exception as exc:
66+
print("Restoring original TEI file")
67+
tmp_tei_file.rename(tei_file)
68+
raise exc
6069

6170

6271
@pytest.mark.skipif(
6372
platform.system() != "Linux", reason="Docker tests only run on Linux runners"
6473
)
6574
def test_tei_version(tei_doc) -> None: # type: ignore
6675
"""Test the tei version"""
67-
assert "0.8.3-SNAPSHOT" == tei_doc.get_grobid_version()
76+
assert "0.8.2" == tei_doc.get_grobid_version()
6877

6978

7079
@pytest.mark.skipif(

0 commit comments

Comments
 (0)