|
5 | 5 |
|
6 | 6 | import re |
7 | 7 | from pathlib import Path |
| 8 | +from typing import Optional |
8 | 9 |
|
9 | 10 | import colrev.env.utils |
10 | 11 | import colrev.record.qm.quality_model |
@@ -53,100 +54,117 @@ def run(self, *, record: colrev.record.record_pdf.PDFRecord) -> None: |
53 | 54 | record.remove_field_provenance_note(key=Fields.FILE, note=self.msg) |
54 | 55 |
|
55 | 56 | def _pages_match_pdf(self, *, record: colrev.record.record_pdf.PDFRecord) -> bool: |
| 57 | + nr_pages_in_file = self._get_pdf_page_count(record=record) |
| 58 | + if nr_pages_in_file is None: |
| 59 | + return False |
56 | 60 |
|
57 | | - def longer_with_appendix( |
58 | | - *, |
59 | | - record: colrev.record.record_pdf.PDFRecord, |
60 | | - nr_pages: int, |
61 | | - ) -> bool: |
62 | | - if 10 < nr_pages < record.data[Fields.NR_PAGES_IN_FILE]: |
63 | | - text = record.extract_text_by_page( |
64 | | - pages=list( |
65 | | - range(nr_pages + 1, record.data[Fields.NR_PAGES_IN_FILE] + 1) |
66 | | - ) |
67 | | - ) |
68 | | - if "appendi" in text.lower(): |
69 | | - return True |
| 61 | + if self._contains_full_version_purchase_notice(record=record): |
70 | 62 | return False |
71 | 63 |
|
72 | | - def roman_to_int(input_str: str) -> int: |
73 | | - input_str = input_str.lower() |
74 | | - roman = { |
75 | | - "i": 1, |
76 | | - "v": 5, |
77 | | - "x": 10, |
78 | | - "l": 50, |
79 | | - "c": 100, |
80 | | - "d": 500, |
81 | | - "m": 1000, |
82 | | - "iv": 4, |
83 | | - "ix": 9, |
84 | | - "xl": 40, |
85 | | - "xc": 90, |
86 | | - "cd": 400, |
87 | | - "cm": 900, |
88 | | - } |
89 | | - i = 0 |
90 | | - num = 0 |
91 | | - while i < len(input_str): |
92 | | - if i + 1 < len(input_str) and input_str[i : i + 2] in roman: |
93 | | - num += roman[input_str[i : i + 2]] |
94 | | - i += 2 |
95 | | - else: |
96 | | - num += roman[input_str[i]] |
97 | | - i += 1 |
98 | | - return num |
99 | | - |
100 | | - def get_nr_pages(*, pages: str) -> int: |
101 | | - pages_str = pages |
102 | | - |
103 | | - roman_pages_matched = re.match(ROMAN_PAGES_PATTERN, pages) |
104 | | - if roman_pages_matched: |
105 | | - start_page, end_page = map( |
106 | | - roman_to_int, roman_pages_matched.group().split("--") |
107 | | - ) |
108 | | - pages_str = f"{start_page}--{end_page}" |
109 | | - |
110 | | - roman_page_matched = re.match(ROMAN_PAGE_PATTERN, pages) |
111 | | - if roman_page_matched: |
112 | | - page = roman_page_matched.group() |
113 | | - pages_str = f"{roman_to_int(page)}" |
| 64 | + expected_page_count = self._get_expected_page_count(record=record) |
| 65 | + if expected_page_count is None: |
| 66 | + return True |
114 | 67 |
|
115 | | - if "--" in pages_str: |
116 | | - start_page, end_page = map(int, pages_str.split("--")) |
117 | | - nr_pages = end_page - start_page + 1 |
118 | | - elif "-" in pages_str: |
119 | | - start_page, end_page = map(int, pages_str.split("-")) |
120 | | - nr_pages = end_page - start_page + 1 |
121 | | - else: |
122 | | - nr_pages = 1 |
123 | | - return nr_pages |
| 68 | + if self._longer_with_appendix( |
| 69 | + record=record, |
| 70 | + nr_pages=expected_page_count, |
| 71 | + nr_pages_in_file=nr_pages_in_file, |
| 72 | + ): |
| 73 | + return True |
124 | 74 |
|
125 | | - # Get nr pages from PDF (set in quality_model) |
126 | | - if Fields.NR_PAGES_IN_FILE not in record.data: |
127 | | - return False |
| 75 | + return expected_page_count == nr_pages_in_file |
128 | 76 |
|
129 | | - # Not complete if there is a FULL_VERSION_PURCHASE_NOTICE |
130 | | - if any( |
131 | | - FULL_VERSION_PURCHASE_NOTICE |
| 77 | + def _get_pdf_page_count( |
| 78 | + self, *, record: colrev.record.record_pdf.PDFRecord |
| 79 | + ) -> Optional[int]: |
| 80 | + if Fields.NR_PAGES_IN_FILE not in record.data: |
| 81 | + return None |
| 82 | + return record.data[Fields.NR_PAGES_IN_FILE] |
| 83 | + |
| 84 | + def _contains_full_version_purchase_notice( |
| 85 | + self, *, record: colrev.record.record_pdf.PDFRecord |
| 86 | + ) -> bool: |
| 87 | + return any( |
| 88 | + full_version_purchase_notice |
132 | 89 | in record.data[Fields.TEXT_FROM_PDF].lower().replace(" ", "") |
133 | | - for FULL_VERSION_PURCHASE_NOTICE in FULL_VERSION_PURCHASE_NOTICES |
134 | | - ): |
135 | | - return False |
136 | | - |
137 | | - # Get nr pages from pages field |
| 90 | + for full_version_purchase_notice in FULL_VERSION_PURCHASE_NOTICES |
| 91 | + ) |
| 92 | + |
| 93 | + def _longer_with_appendix( |
| 94 | + self, |
| 95 | + *, |
| 96 | + record: colrev.record.record_pdf.PDFRecord, |
| 97 | + nr_pages: int, |
| 98 | + nr_pages_in_file: int, |
| 99 | + ) -> bool: |
| 100 | + if 10 < nr_pages < nr_pages_in_file: |
| 101 | + text = record.extract_text_by_page( |
| 102 | + pages=list(range(nr_pages + 1, nr_pages_in_file + 1)) |
| 103 | + ) |
| 104 | + if "appendi" in text.lower(): |
| 105 | + return True |
| 106 | + return False |
| 107 | + |
| 108 | + def _roman_to_int(self, input_str: str) -> int: |
| 109 | + input_str = input_str.lower() |
| 110 | + roman = { |
| 111 | + "i": 1, |
| 112 | + "v": 5, |
| 113 | + "x": 10, |
| 114 | + "l": 50, |
| 115 | + "c": 100, |
| 116 | + "d": 500, |
| 117 | + "m": 1000, |
| 118 | + "iv": 4, |
| 119 | + "ix": 9, |
| 120 | + "xl": 40, |
| 121 | + "xc": 90, |
| 122 | + "cd": 400, |
| 123 | + "cm": 900, |
| 124 | + } |
| 125 | + i = 0 |
| 126 | + num = 0 |
| 127 | + while i < len(input_str): |
| 128 | + if i + 1 < len(input_str) and input_str[i : i + 2] in roman: |
| 129 | + num += roman[input_str[i : i + 2]] |
| 130 | + i += 2 |
| 131 | + else: |
| 132 | + num += roman[input_str[i]] |
| 133 | + i += 1 |
| 134 | + return num |
| 135 | + |
| 136 | + def _normalize_pages_str(self, *, pages: str) -> str: |
| 137 | + pages_str = pages |
| 138 | + |
| 139 | + roman_pages_matched = re.match(ROMAN_PAGES_PATTERN, pages) |
| 140 | + if roman_pages_matched: |
| 141 | + start_page, end_page = map( |
| 142 | + self._roman_to_int, roman_pages_matched.group().split("--") |
| 143 | + ) |
| 144 | + pages_str = f"{start_page}--{end_page}" |
| 145 | + |
| 146 | + roman_page_matched = re.match(ROMAN_PAGE_PATTERN, pages) |
| 147 | + if roman_page_matched: |
| 148 | + page = roman_page_matched.group() |
| 149 | + pages_str = f"{self._roman_to_int(page)}" |
| 150 | + |
| 151 | + return pages_str |
| 152 | + |
| 153 | + def _get_expected_page_count( |
| 154 | + self, *, record: colrev.record.record_pdf.PDFRecord |
| 155 | + ) -> Optional[int]: |
138 | 156 | try: |
139 | | - nr_pages = get_nr_pages(pages=record.data[Fields.PAGES]) |
| 157 | + pages_str = self._normalize_pages_str(pages=record.data[Fields.PAGES]) |
| 158 | + if "--" in pages_str: |
| 159 | + start_page, end_page = map(int, pages_str.split("--")) |
| 160 | + return end_page - start_page + 1 |
| 161 | + if "-" in pages_str: |
| 162 | + start_page, end_page = map(int, pages_str.split("-")) |
| 163 | + return end_page - start_page + 1 |
| 164 | + return 1 |
140 | 165 | except ValueError: |
141 | 166 | # e.g., S49--S50 |
142 | | - return True |
143 | | - |
144 | | - # Special case: if the PDF has more pages than the pages field, it may be complete |
145 | | - if longer_with_appendix(record=record, nr_pages=nr_pages): |
146 | | - return True |
147 | | - |
148 | | - # If the PDF has the same number of pages as the pages field, it is complete |
149 | | - return nr_pages == record.data[Fields.NR_PAGES_IN_FILE] |
| 167 | + return None |
150 | 168 |
|
151 | 169 |
|
152 | 170 | def register(quality_model: colrev.record.qm.quality_model.QualityModel) -> None: |
|
0 commit comments