Skip to content

Commit d842a29

Browse files
author
Gerit Wagner
committed
record.set_text_from_pdf(): add first_pages param
1 parent 3484bba commit d842a29

6 files changed

Lines changed: 14 additions & 8 deletions

File tree

colrev/ops/pdf_prep.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -137,7 +137,7 @@ def prepare_pdf(self, item: dict) -> dict:
137137
record_dict, path=self.review_manager.path
138138
)
139139
if record_dict[Fields.FILE].endswith(".pdf"):
140-
record.set_text_from_pdf()
140+
record.set_text_from_pdf(first_pages=True)
141141
original_filename = record_dict[Fields.FILE]
142142

143143
self.review_manager.logger.debug(f"Start PDF prep of {record_dict[Fields.ID]}")

colrev/packages/files_dir/src/files_dir.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -315,7 +315,7 @@ def _get_grobid_metadata(self, *, file_path: Path) -> dict:
315315
record = colrev.record.record_pdf.PDFRecord(
316316
record_dict, path=self.review_manager.path
317317
)
318-
record.set_text_from_pdf()
318+
record.set_text_from_pdf(first_pages=True)
319319
record_dict = record.get_data()
320320
if Fields.TEXT_FROM_PDF in record_dict:
321321
text: str = record_dict[Fields.TEXT_FROM_PDF]
@@ -676,7 +676,7 @@ def _add_doi_from_pdf_if_not_available(self, record_dict: dict) -> None:
676676
record_dict, path=self.review_manager.path
677677
)
678678
if Fields.DOI not in record_dict:
679-
record.set_text_from_pdf()
679+
record.set_text_from_pdf(first_pages=True)
680680
res = re.findall(self._doi_regex, record.data[Fields.TEXT_FROM_PDF])
681681
if res:
682682
record.data[Fields.DOI] = res[0].upper()

colrev/packages/genai/src/genai_data.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ def _run_prompt(self) -> None:
110110
record = colrev.record.record_pdf.PDFRecord(
111111
record_dict, path=self.review_manager.path
112112
)
113-
record.set_text_from_pdf(all_pages=True)
113+
record.set_text_from_pdf()
114114

115115
user_message = self.settings.prompt.format(
116116
record_id=record_dict[Fields.ID],

colrev/packages/ocrmypdf/src/ocrmypdf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,7 @@ def _apply_ocr(
8282
record.add_field_provenance_note(
8383
key=Fields.FILE, note="pdf_processed with OCRMYPDF"
8484
)
85-
record.set_text_from_pdf()
85+
record.set_text_from_pdf(first_pages=True)
8686
return record
8787

8888
def prep_pdf(

colrev/record/qm/quality_model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ def run(self, *, record: colrev.record.record.Record) -> None:
8686
):
8787
# The following should be improved.
8888
record = colrev.record.record_pdf.PDFRecord(record.data, path=self.path)
89-
record.set_text_from_pdf()
89+
record.set_text_from_pdf(first_pages=True)
9090

9191
for checker in self.checkers:
9292
if checker.msg in self.defects_to_ignore:

colrev/record/record_pdf.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -98,11 +98,17 @@ def set_nr_pages_in_pdf(self) -> None:
9898
pages_in_file = doc.page_count
9999
self.data[Fields.NR_PAGES_IN_FILE] = pages_in_file
100100

101-
def set_text_from_pdf(self) -> None:
101+
def set_text_from_pdf(self, *, first_pages: bool = False) -> None:
102102
"""Set the text_from_pdf field based on the PDF"""
103103
self.data[Fields.TEXT_FROM_PDF] = ""
104104
self.set_nr_pages_in_pdf()
105-
text = self.extract_text_by_page(pages=[0, 1, 2])
105+
106+
if first_pages:
107+
pages = [0, 1, 2]
108+
else:
109+
pages = list(range(self.data[Fields.NR_PAGES_IN_FILE]))
110+
111+
text = self.extract_text_by_page(pages=pages)
106112
text_from_pdf = text.replace("\n", " ").replace("\x0c", "")
107113
self.data[Fields.TEXT_FROM_PDF] = text_from_pdf
108114

0 commit comments

Comments
 (0)