Skip to content

Commit b77d3de

Browse files
Refactor tei (#627)
* remove tei from review_manager
* extract get_grobid_service
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* deactivate tei-tests (docker) for macos
* deactivate remaining tests
* only on Linux runners
* fix import statements
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* obsidian: do not run in silent mode
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* tei: start GROBID only when necessary
* fix windows path handling
* update
* update
* update
* update
* update
* update
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* update
* update gh-tests
* remove redundant import

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 78111f4 commit b77d3de

23 files changed

Lines changed: 81 additions & 123 deletions

colrev/env/grobid_service.py

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,11 @@
44

55
import logging
66
import time
7-
import typing
87

98
import docker
109
import requests
1110

1211
import colrev.env.docker_manager
13-
import colrev.env.environment_manager
1412

1513

1614
class GrobidService:
@@ -19,17 +17,11 @@ class GrobidService:
1917
GROBID_URL = "http://localhost:8070"
2018
GROBID_IMAGE = "lfoppiano/grobid:0.8.1"
2119

22-
def __init__(
23-
self,
24-
*,
25-
environment_manager: typing.Optional[
26-
colrev.env.environment_manager.EnvironmentManager
27-
] = None,
28-
) -> None:
20+
def __init__(self) -> None:
2921
colrev.env.docker_manager.DockerManager.build_docker_image(
3022
imagename=self.GROBID_IMAGE
3123
)
32-
self.start(environment_manager)
24+
self.start()
3325
self.check_grobid_availability()
3426

3527
def check_grobid_availability(self, *, wait: bool = True) -> bool:
@@ -52,12 +44,7 @@ def check_grobid_availability(self, *, wait: bool = True) -> bool:
5244
raise requests.exceptions.ConnectionError()
5345
return True
5446

55-
def start(
56-
self,
57-
environment_manager: typing.Optional[
58-
colrev.env.environment_manager.EnvironmentManager
59-
] = None,
60-
) -> None:
47+
def start(self) -> None:
6148
"""Start the GROBID service"""
6249
# pylint: disable=consider-using-with
6350

@@ -79,7 +66,5 @@ def start(
7966
ports={8070: 8070, 8071: 8071},
8067
detach=True,
8168
)
82-
if environment_manager:
83-
environment_manager.register_ports(["8070", "8071"])
8469

8570
self.check_grobid_availability()

colrev/env/local_index_builder.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -372,7 +372,6 @@ def _index_tei_document(self, recs_to_index: list) -> None:
372372
if not tei_path.is_file():
373373
print(f"Create tei for {record_dict[Fields.FILE]}")
374374
tei = colrev.env.tei_parser.TEIParser(
375-
environment_manager=self.environment_manager,
376375
pdf_path=Path(record_dict[Fields.FILE]),
377376
tei_path=tei_path,
378377
)

colrev/env/tei_parser.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,6 @@ class TEIParser:
4343
def __init__(
4444
self,
4545
*,
46-
environment_manager: colrev.env.environment_manager.EnvironmentManager,
4746
pdf_path: typing.Optional[Path] = None,
4847
tei_path: typing.Optional[Path] = None,
4948
):
@@ -53,8 +52,6 @@ def __init__(
5352
- pdf_path and tei_path: create TEI and save in tei_path
5453
- tei_path: read TEI from file
5554
"""
56-
57-
self.environment_manager = environment_manager
5855
# pylint: disable=consider-using-with
5956
assert pdf_path is not None or tei_path is not None
6057
if pdf_path is not None:
@@ -71,6 +68,11 @@ def __init__(
7168
load_from_tei = True
7269

7370
if pdf_path is not None and not load_from_tei:
71+
# TODO / TBD:
72+
# Do not run in continuous-integration environment
73+
# if not self.review_manager.in_ci_environment():
74+
grobid_service = colrev.env.grobid_service.GrobidService()
75+
grobid_service.start()
7476
self._create_tei()
7577

7678
elif tei_path is not None:
@@ -88,9 +90,7 @@ def _read_from_tei(self): # type: ignore
8890

8991
def _create_tei(self) -> None:
9092
"""Create the TEI (based on GROBID)"""
91-
grobid_service = colrev.env.grobid_service.GrobidService(
92-
environment_manager=self.environment_manager
93-
)
93+
grobid_service = colrev.env.grobid_service.GrobidService()
9494
grobid_service.start()
9595
# Note: we have more control and transparency over the consolidation
9696
# if we do it in the colrev process

colrev/loader/md.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import requests
1010

1111
import colrev.env.grobid_service
12+
import colrev.loader.load_utils
1213
import colrev.loader.loader
1314
from colrev.constants import Fields
1415

colrev/ops/colrev_pandas.py

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
from bib_dedupe.bib_dedupe import match
1313
from bib_dedupe.bib_dedupe import prep
1414

15+
import colrev.env.tei_parser
1516
import colrev.ops.check
1617
import colrev.review_manager
1718
from colrev.constants import Fields
@@ -174,17 +175,13 @@ def add_from_tei(
174175
if fields is None:
175176
fields = [Fields.ABSTRACT, Fields.KEYWORDS]
176177

177-
review_manager = colrev.review_manager.ReviewManager(
178-
path_str=project_path, force_mode=True
179-
)
180-
181178
def extract_abstract(record: pd.Series) -> str:
182179
if Fields.ABSTRACT in record and not pd.isnull(record[Fields.ABSTRACT]):
183180
return record[Fields.ABSTRACT]
184181

185182
try:
186183
tei_filename = colrev.record.record.Record(record).get_tei_filename()
187-
tei = review_manager.get_tei(tei_path=tei_filename)
184+
tei = colrev.env.tei_parser.TEIParser(tei_path=tei_filename)
188185
return tei.get_abstract()
189186
except FileNotFoundError:
190187
return ""
@@ -194,7 +191,7 @@ def extract_keywords(record: pd.Series) -> str:
194191
return record[Fields.KEYWORDS]
195192
try:
196193
tei_filename = colrev.record.record.Record(record).get_tei_filename()
197-
tei = review_manager.get_tei(tei_path=tei_filename)
194+
tei = colrev.env.tei_parser.TEIParser(tei_path=tei_filename)
198195
return ", ".join(tei.get_paper_keywords())
199196
except FileNotFoundError:
200197
return ""

colrev/ops/data.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import typing
66
from pathlib import Path
77

8+
import colrev.env.tei_parser
89
import colrev.packages.grobid_tei.src.grobid_tei
910
import colrev.process.operation
1011
from colrev.constants import Colors
@@ -75,7 +76,7 @@ def reading_heuristics(self) -> list:
7576
if not tei_file.is_file():
7677
missing.append(required_records_id)
7778

78-
tei_doc = self.review_manager.get_tei(
79+
tei_doc = colrev.env.tei_parser.TEIParser(
7980
tei_path=tei_file,
8081
)
8182
tei_doc.mark_references(records=records)

colrev/ops/distribute.py

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import shutil
77
from pathlib import Path
88

9+
import colrev.env.tei_parser
910
import colrev.process.operation
1011
import colrev.settings
1112
from colrev.constants import Fields
@@ -78,11 +79,8 @@ def main(self, *, path: Path, target: Path) -> None:
7879
input(path)
7980

8081
if path.suffix == ".pdf":
81-
grobid_service = self.review_manager.get_grobid_service()
8282

83-
grobid_service.start()
84-
85-
tei = self.review_manager.get_tei(
83+
tei = colrev.env.tei_parser.TEIParser(
8684
pdf_path=path,
8785
)
8886
record = tei.get_metadata()

colrev/ops/pdf_get.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from multiprocessing.pool import ThreadPool as Pool
88
from pathlib import Path
99

10+
import colrev.env.tei_parser
1011
import colrev.exceptions as colrev_exceptions
1112
import colrev.process.operation
1213
import colrev.record.record_pdf
@@ -359,14 +360,12 @@ def check_existing_unlinked_pdfs(
359360
if len(unlinked_pdfs) == 0:
360361
return records
361362

362-
grobid_service = self.review_manager.get_grobid_service()
363-
grobid_service.start()
364363
self.review_manager.logger.info("Check unlinked PDFs")
365364
for file in unlinked_pdfs:
366365
msg = f"Check unlinked PDF: {file.relative_to(self.review_manager.path)}"
367366
self.review_manager.logger.info(msg)
368367
if file.stem not in records.keys():
369-
tei = self.review_manager.get_tei(pdf_path=file)
368+
tei = colrev.env.tei_parser.TEIParser(pdf_path=file)
370369
pdf_record = tei.get_metadata()
371370

372371
if "error" in pdf_record:

colrev/ops/screen.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import math
66
from pathlib import Path
77

8+
import colrev.env.tei_parser
89
import colrev.exceptions as colrev_exceptions
910
import colrev.process.operation
1011
import colrev.record.record
@@ -433,7 +434,7 @@ def add_abstracts_from_tei(self) -> None:
433434
continue
434435

435436
try:
436-
tei = self.review_manager.get_tei(
437+
tei = colrev.env.tei_parser.TEIParser(
437438
pdf_path=Path(record_dict[Fields.FILE]),
438439
tei_path=colrev.record.record.Record(
439440
record_dict

colrev/ops/search_api_feed.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44

55
import json
66
import time
7+
import typing
78
from copy import deepcopy
89
from random import randint
910

@@ -20,6 +21,10 @@
2021
from colrev.writer.write_utils import to_string
2122
from colrev.writer.write_utils import write_file
2223

24+
if typing.TYPE_CHECKING:
25+
import colrev.review_manager
26+
import colrev.settings
27+
2328

2429
# Keep in mind the need for lock-mechanisms, e.g., in concurrent prep operations
2530
class SearchAPIFeed:

0 commit comments

Comments
 (0)