Skip to content

Commit 62f27f6

Browse files
committed
Does not process pages that have been unchecked in the editor.
1 parent: 232aa42 · commit: 62f27f6

6 files changed

Lines changed: 99 additions & 15 deletions

File tree

src/bigocrpdf/image_application.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,9 @@ def on_open(
119119
win.open_image(file_path)
120120
logger.info(f"Opened image: {file_path}")
121121
else:
122-
logger.warning(f"Could not open: path={file_path}, has_open_image={hasattr(win, 'open_image')}")
122+
logger.warning(
123+
f"Could not open: path={file_path}, has_open_image={hasattr(win, 'open_image')}"
124+
)
123125
else:
124126
logger.warning(f"on_open: win={win}, files={files}")
125127

src/bigocrpdf/services/rapidocr_service/pdf_extractor.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -270,6 +270,7 @@ def extract(
270270
pdf_path: Path,
271271
output_dir: Path,
272272
page_range: tuple[int, int] | None = None,
273+
skip_pages: set[int] | None = None,
273274
) -> list[Path | None]:
274275
"""Extract native images from PDF ensuring correct page mapping.
275276
@@ -279,6 +280,12 @@ def extract(
279280
280281
For images stored in formats that OpenCV/PIL cannot decode
281282
(JBIG2, CCITT), falls back to pdftoppm page rendering.
283+
284+
Args:
285+
pdf_path: Path to the PDF file.
286+
output_dir: Directory to extract images to.
287+
page_range: Optional (start, end) 1-indexed page range.
288+
skip_pages: Optional set of 1-indexed page numbers to skip entirely.
282289
"""
283290
output_dir.mkdir(parents=True, exist_ok=True)
284291

@@ -320,9 +327,15 @@ def extract(
320327
raise RuntimeError(f"Failed to extract images: {e}") from e
321328

322329
# 4. Process extracted files based on mapping
330+
_skip = skip_pages or set()
323331
fallback_pages: list[int] = []
324332
for i in range(num_pages_to_process):
325333
current_page = start_page + i
334+
335+
# Skip excluded pages entirely (no extraction, no rendering)
336+
if current_page in _skip:
337+
continue
338+
326339
img_indices = image_mapping.get(current_page, [])
327340

328341
if not img_indices:

src/bigocrpdf/services/rapidocr_service/pipeline_chunked_ocr.py

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,14 @@ def _run_chunked_ocr_pipeline(
8787
total_confidence = 0.0
8888
num_chunks = (total_pages + CHUNK_SIZE - 1) // CHUNK_SIZE
8989

90+
# Build set of pages to skip (deleted or excluded from OCR)
91+
skip_pages: set[int] = set()
92+
for rot in page_rotations:
93+
if rot.deleted or not rot.included_for_ocr:
94+
skip_pages.add(rot.page_number)
95+
if skip_pages:
96+
logger.info(f"Skipping {len(skip_pages)} excluded page(s): {sorted(skip_pages)}")
97+
9098
if progress_callback:
9199
progress_callback(5, 100, _("Starting OCR..."))
92100

@@ -103,6 +111,26 @@ def _run_chunked_ocr_pipeline(
103111
chunk_start = chunk_idx * CHUNK_SIZE
104112
chunk_end = min(chunk_start + CHUNK_SIZE, total_pages)
105113

114+
# Check if ALL pages in this chunk are excluded — skip entirely
115+
chunk_page_nums = set(range(chunk_start + 1, chunk_end + 1))
116+
if chunk_page_nums <= skip_pages:
117+
# All pages in chunk are excluded; add blank placeholders
118+
for page_num in sorted(chunk_page_nums):
119+
abs_idx = page_num - 1
120+
if abs_idx < len(all_rotation_dicts):
121+
mb = all_rotation_dicts[abs_idx]["mediabox"]
122+
c.setPageSize((mb[2] - mb[0], mb[3] - mb[1]))
123+
else:
124+
c.setPageSize((595, 842))
125+
c.showPage()
126+
page_standalone_flags.append(False)
127+
stats.pages_processed += 1
128+
logger.info(
129+
f"Chunk {chunk_idx + 1}/{num_chunks} skipped "
130+
f"(all {len(chunk_page_nums)} pages excluded)"
131+
)
132+
continue
133+
106134
for f in images_dir.glob("*"):
107135
try:
108136
f.unlink()
@@ -113,6 +141,7 @@ def _run_chunked_ocr_pipeline(
113141
input_pdf,
114142
output_dir=images_dir,
115143
page_range=(chunk_start + 1, chunk_end),
144+
skip_pages=skip_pages,
116145
)
117146

118147
work_items = self._build_chunk_work_items(
@@ -201,7 +230,7 @@ def _build_chunk_work_items(
201230

202231
if page_num in native_text_pages:
203232
effective_path = None
204-
elif rot.deleted:
233+
elif rot.deleted or not rot.included_for_ocr:
205234
effective_path = None
206235
else:
207236
effective_path = img_path

src/bigocrpdf/services/rapidocr_service/pipeline_mixed_content.py

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,14 @@ def _process_mixed_content_pdf(
133133
try:
134134
self._wait_for_ocr_ready(ocr_proc)
135135
with pikepdf.open(input_pdf, allow_overwriting_input=True) as pdf:
136+
# Build set of excluded pages from editor modifications
137+
excluded_pages: set[int] = set()
138+
if self.config.page_modifications:
139+
for mod in self.config.page_modifications:
140+
pn = mod.get("page_number")
141+
if pn and (mod.get("deleted") or not mod.get("included_for_ocr", True)):
142+
excluded_pages.add(pn)
143+
136144
self._ocr_image_pages(
137145
pdf,
138146
image_positions,
@@ -142,10 +150,21 @@ def _process_mixed_content_pdf(
142150
ocr_texts,
143151
ocr_proc,
144152
progress_callback,
153+
excluded_pages=excluded_pages,
145154
)
155+
156+
# Remove excluded pages before saving
157+
if excluded_pages:
158+
for idx in sorted(excluded_pages, reverse=True):
159+
if 0 < idx <= len(pdf.pages):
160+
del pdf.pages[idx - 1]
161+
logger.info(f"Removed excluded page {idx} from output")
162+
146163
if progress_callback:
147164
progress_callback(90, 100, _("Saving PDF..."))
148-
stats.pages_processed = len(image_positions)
165+
stats.pages_processed = len(image_positions) - len(
166+
excluded_pages & set(image_positions.keys())
167+
)
149168
pdf.save(output_pdf)
150169
finally:
151170
self._stop_ocr_subprocess(ocr_proc)
@@ -170,19 +189,28 @@ def _ocr_image_pages(
170189
ocr_texts: list[str],
171190
ocr_proc: subprocess.Popen,
172191
progress_callback: Callable[[int, int, str], None] | None,
192+
excluded_pages: set[int] | None = None,
173193
) -> None:
174194
"""OCR all image-bearing pages, modifying the PDF in place."""
175195
enhance = getattr(self.config, "enhance_embedded_images", False)
176196
logger.info(f"Mixed content: enhance_embedded_images={enhance}")
177197
processed_images = 0
178198
current_img_idx = 0
199+
_excluded = excluded_pages or set()
179200

180201
for page_num in sorted(image_positions.keys()):
181202
if hasattr(self, "cancel_event") and self.cancel_event.is_set():
182203
logger.info("Processing cancelled by user in mixed content mode")
183204
raise InterruptedError("Processing cancelled by user")
184205

185206
page_imgs = image_positions[page_num]
207+
208+
# Skip excluded pages entirely (no preprocessing, no OCR)
209+
if page_num in _excluded:
210+
logger.info(f"Page {page_num}: excluded from OCR, skipping ({len(page_imgs)} image(s))")
211+
current_img_idx += len(page_imgs)
212+
continue
213+
186214
page = pdf.pages[page_num - 1]
187215
mediabox = page.mediabox
188216
page_height = float(mediabox[3]) - float(mediabox[1])

src/bigocrpdf/services/rapidocr_service/rotation.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ class PageRotation:
3434
original_pdf_rotation: int = 0
3535
editor_rotation: int = 0
3636
deleted: bool = False
37+
included_for_ocr: bool = True
3738
mediabox: list[float] | None = None
3839

3940
@property
@@ -144,6 +145,7 @@ def apply_editor_modifications(
144145
mod = mod_lookup[rot.page_number]
145146
rot.editor_rotation = mod.get("rotation", 0)
146147
rot.deleted = mod.get("deleted", False)
148+
rot.included_for_ocr = mod.get("included_for_ocr", True)
147149

148150
return rotations
149151

@@ -192,10 +194,11 @@ def apply_final_rotation_to_pdf(
192194
f"Page {page_num}: {current_rot}° + {rot_info.editor_rotation}° = {new_rot}°"
193195
)
194196

195-
# Mark for deletion
196-
if rot_info.deleted:
197+
# Mark for deletion (deleted or excluded from OCR)
198+
if rot_info.deleted or not rot_info.included_for_ocr:
197199
pages_to_delete.append(i)
198-
logger.info(f"Page {page_num}: marked for deletion")
200+
reason = "deleted" if rot_info.deleted else "excluded from OCR"
201+
logger.info(f"Page {page_num}: marked for removal ({reason})")
199202

200203
# Delete in reverse order
201204
for idx in reversed(pages_to_delete):

src/bigocrpdf/utils/pdf_utils.py

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -206,10 +206,10 @@ def extract_images_for_odf(
206206

207207

208208
def open_file_with_default_app(file_path: str) -> bool:
209-
"""Open a file using the system's default application via Gtk.FileLauncher.
209+
"""Open a file using the system's default application.
210210
211-
Uses the GTK4 portal-aware launcher instead of raw subprocess calls,
212-
which is non-blocking on the UI thread and Flatpak-compatible.
211+
Uses Gio.AppInfo to directly launch the default handler without
212+
showing an application chooser dialog.
213213
214214
Args:
215215
file_path: Path to the file to open
@@ -224,17 +224,26 @@ def open_file_with_default_app(file_path: str) -> bool:
224224
try:
225225
import gi
226226

227-
gi.require_version("Gtk", "4.0")
228227
gi.require_version("Gio", "2.0")
229-
from gi.repository import Gio, Gtk
228+
from gi.repository import Gio
230229

231230
gfile = Gio.File.new_for_path(file_path)
232-
launcher = Gtk.FileLauncher.new(gfile)
233-
launcher.launch(None, None, None)
231+
uri = gfile.get_uri()
232+
Gio.AppInfo.launch_default_for_uri(uri, None)
234233
return True
235234
except Exception as e:
236-
logger.error(f"Failed to open file {file_path}: {e}")
237-
return False
235+
logger.warning(f"Gio launch failed for {file_path}: {e}, trying xdg-open")
236+
try:
237+
import subprocess
238+
subprocess.Popen(
239+
["xdg-open", file_path],
240+
stdout=subprocess.DEVNULL,
241+
stderr=subprocess.DEVNULL,
242+
)
243+
return True
244+
except Exception as e2:
245+
logger.error(f"Failed to open file {file_path}: {e2}")
246+
return False
238247

239248

240249
def render_pdf_page_to_png(

0 commit comments

Comments
 (0)