Skip to content

Commit 24b6b8a

Browse files
committed
refactor: pdf-incomplete
1 parent d0583c4 commit 24b6b8a

1 file changed

Lines changed: 102 additions & 84 deletions

File tree

colrev/record/qm/pdf_checkers/pdf_incomplete.py

Lines changed: 102 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import re
77
from pathlib import Path
8+
from typing import Optional
89

910
import colrev.env.utils
1011
import colrev.record.qm.quality_model
@@ -53,100 +54,117 @@ def run(self, *, record: colrev.record.record_pdf.PDFRecord) -> None:
5354
record.remove_field_provenance_note(key=Fields.FILE, note=self.msg)
5455

5556
def _pages_match_pdf(self, *, record: colrev.record.record_pdf.PDFRecord) -> bool:
57+
nr_pages_in_file = self._get_pdf_page_count(record=record)
58+
if nr_pages_in_file is None:
59+
return False
5660

57-
def longer_with_appendix(
58-
*,
59-
record: colrev.record.record_pdf.PDFRecord,
60-
nr_pages: int,
61-
) -> bool:
62-
if 10 < nr_pages < record.data[Fields.NR_PAGES_IN_FILE]:
63-
text = record.extract_text_by_page(
64-
pages=list(
65-
range(nr_pages + 1, record.data[Fields.NR_PAGES_IN_FILE] + 1)
66-
)
67-
)
68-
if "appendi" in text.lower():
69-
return True
61+
if self._contains_full_version_purchase_notice(record=record):
7062
return False
7163

72-
def roman_to_int(input_str: str) -> int:
73-
input_str = input_str.lower()
74-
roman = {
75-
"i": 1,
76-
"v": 5,
77-
"x": 10,
78-
"l": 50,
79-
"c": 100,
80-
"d": 500,
81-
"m": 1000,
82-
"iv": 4,
83-
"ix": 9,
84-
"xl": 40,
85-
"xc": 90,
86-
"cd": 400,
87-
"cm": 900,
88-
}
89-
i = 0
90-
num = 0
91-
while i < len(input_str):
92-
if i + 1 < len(input_str) and input_str[i : i + 2] in roman:
93-
num += roman[input_str[i : i + 2]]
94-
i += 2
95-
else:
96-
num += roman[input_str[i]]
97-
i += 1
98-
return num
99-
100-
def get_nr_pages(*, pages: str) -> int:
101-
pages_str = pages
102-
103-
roman_pages_matched = re.match(ROMAN_PAGES_PATTERN, pages)
104-
if roman_pages_matched:
105-
start_page, end_page = map(
106-
roman_to_int, roman_pages_matched.group().split("--")
107-
)
108-
pages_str = f"{start_page}--{end_page}"
109-
110-
roman_page_matched = re.match(ROMAN_PAGE_PATTERN, pages)
111-
if roman_page_matched:
112-
page = roman_page_matched.group()
113-
pages_str = f"{roman_to_int(page)}"
64+
expected_page_count = self._get_expected_page_count(record=record)
65+
if expected_page_count is None:
66+
return True
11467

115-
if "--" in pages_str:
116-
start_page, end_page = map(int, pages_str.split("--"))
117-
nr_pages = end_page - start_page + 1
118-
elif "-" in pages_str:
119-
start_page, end_page = map(int, pages_str.split("-"))
120-
nr_pages = end_page - start_page + 1
121-
else:
122-
nr_pages = 1
123-
return nr_pages
68+
if self._longer_with_appendix(
69+
record=record,
70+
nr_pages=expected_page_count,
71+
nr_pages_in_file=nr_pages_in_file,
72+
):
73+
return True
12474

125-
# Get nr pages from PDF (set in quality_model)
126-
if Fields.NR_PAGES_IN_FILE not in record.data:
127-
return False
75+
return expected_page_count == nr_pages_in_file
12876

129-
# Not complete if there is a FULL_VERSION_PURCHASE_NOTICE
130-
if any(
131-
FULL_VERSION_PURCHASE_NOTICE
77+
def _get_pdf_page_count(
78+
self, *, record: colrev.record.record_pdf.PDFRecord
79+
) -> Optional[int]:
80+
if Fields.NR_PAGES_IN_FILE not in record.data:
81+
return None
82+
return record.data[Fields.NR_PAGES_IN_FILE]
83+
84+
def _contains_full_version_purchase_notice(
85+
self, *, record: colrev.record.record_pdf.PDFRecord
86+
) -> bool:
87+
return any(
88+
full_version_purchase_notice
13289
in record.data[Fields.TEXT_FROM_PDF].lower().replace(" ", "")
133-
for FULL_VERSION_PURCHASE_NOTICE in FULL_VERSION_PURCHASE_NOTICES
134-
):
135-
return False
136-
137-
# Get nr pages from pages field
90+
for full_version_purchase_notice in FULL_VERSION_PURCHASE_NOTICES
91+
)
92+
93+
def _longer_with_appendix(
94+
self,
95+
*,
96+
record: colrev.record.record_pdf.PDFRecord,
97+
nr_pages: int,
98+
nr_pages_in_file: int,
99+
) -> bool:
100+
if 10 < nr_pages < nr_pages_in_file:
101+
text = record.extract_text_by_page(
102+
pages=list(range(nr_pages + 1, nr_pages_in_file + 1))
103+
)
104+
if "appendi" in text.lower():
105+
return True
106+
return False
107+
108+
def _roman_to_int(self, input_str: str) -> int:
109+
input_str = input_str.lower()
110+
roman = {
111+
"i": 1,
112+
"v": 5,
113+
"x": 10,
114+
"l": 50,
115+
"c": 100,
116+
"d": 500,
117+
"m": 1000,
118+
"iv": 4,
119+
"ix": 9,
120+
"xl": 40,
121+
"xc": 90,
122+
"cd": 400,
123+
"cm": 900,
124+
}
125+
i = 0
126+
num = 0
127+
while i < len(input_str):
128+
if i + 1 < len(input_str) and input_str[i : i + 2] in roman:
129+
num += roman[input_str[i : i + 2]]
130+
i += 2
131+
else:
132+
num += roman[input_str[i]]
133+
i += 1
134+
return num
135+
136+
def _normalize_pages_str(self, *, pages: str) -> str:
137+
pages_str = pages
138+
139+
roman_pages_matched = re.match(ROMAN_PAGES_PATTERN, pages)
140+
if roman_pages_matched:
141+
start_page, end_page = map(
142+
self._roman_to_int, roman_pages_matched.group().split("--")
143+
)
144+
pages_str = f"{start_page}--{end_page}"
145+
146+
roman_page_matched = re.match(ROMAN_PAGE_PATTERN, pages)
147+
if roman_page_matched:
148+
page = roman_page_matched.group()
149+
pages_str = f"{self._roman_to_int(page)}"
150+
151+
return pages_str
152+
153+
def _get_expected_page_count(
154+
self, *, record: colrev.record.record_pdf.PDFRecord
155+
) -> Optional[int]:
138156
try:
139-
nr_pages = get_nr_pages(pages=record.data[Fields.PAGES])
157+
pages_str = self._normalize_pages_str(pages=record.data[Fields.PAGES])
158+
if "--" in pages_str:
159+
start_page, end_page = map(int, pages_str.split("--"))
160+
return end_page - start_page + 1
161+
if "-" in pages_str:
162+
start_page, end_page = map(int, pages_str.split("-"))
163+
return end_page - start_page + 1
164+
return 1
140165
except ValueError:
141166
# e.g., S49--S50
142-
return True
143-
144-
# Special case: if the PDF has more pages than the pages field, it may be complete
145-
if longer_with_appendix(record=record, nr_pages=nr_pages):
146-
return True
147-
148-
# If the PDF has the same number of pages as the pages field, it is complete
149-
return nr_pages == record.data[Fields.NR_PAGES_IN_FILE]
167+
return None
150168

151169

152170
def register(quality_model: colrev.record.qm.quality_model.QualityModel) -> None:

0 commit comments

Comments
 (0)