Skip pages that have been unchecked in the editor: they are neither extracted nor OCR'd, and they are removed from the output PDF.

bigbruno · bigbruno · commit 62f27f6b89d8 · 2026-02-24T22:12:28.000-03:00
diff --git a/src/bigocrpdf/image_application.py b/src/bigocrpdf/image_application.py
@@ -119,7 +119,9 @@ def on_open(
                 win.open_image(file_path)
                 logger.info(f"Opened image: {file_path}")
             else:
-                logger.warning(f"Could not open: path={file_path}, has_open_image={hasattr(win, 'open_image')}")
+                logger.warning(
+                    f"Could not open: path={file_path}, has_open_image={hasattr(win, 'open_image')}"
+                )
         else:
             logger.warning(f"on_open: win={win}, files={files}")
 
diff --git a/src/bigocrpdf/services/rapidocr_service/pdf_extractor.py b/src/bigocrpdf/services/rapidocr_service/pdf_extractor.py
@@ -270,6 +270,7 @@ def extract(
         pdf_path: Path,
         output_dir: Path,
         page_range: tuple[int, int] | None = None,
+        skip_pages: set[int] | None = None,
     ) -> list[Path | None]:
         """Extract native images from PDF ensuring correct page mapping.
 
@@ -279,6 +280,12 @@ def extract(
 
         For images stored in formats that OpenCV/PIL cannot decode
         (JBIG2, CCITT), falls back to pdftoppm page rendering.
+
+        Args:
+            pdf_path: Path to the PDF file.
+            output_dir: Directory to extract images to.
+            page_range: Optional (start, end) 1-indexed page range.
+            skip_pages: Optional set of 1-indexed page numbers to skip entirely.
         """
         output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -320,9 +327,15 @@ def extract(
             raise RuntimeError(f"Failed to extract images: {e}") from e
 
         # 4. Process extracted files based on mapping
+        _skip = skip_pages or set()
         fallback_pages: list[int] = []
         for i in range(num_pages_to_process):
             current_page = start_page + i
+
+            # Skip excluded pages entirely (no extraction, no rendering)
+            if current_page in _skip:
+                continue
+
             img_indices = image_mapping.get(current_page, [])
 
             if not img_indices:
diff --git a/src/bigocrpdf/services/rapidocr_service/pipeline_chunked_ocr.py b/src/bigocrpdf/services/rapidocr_service/pipeline_chunked_ocr.py
@@ -87,6 +87,14 @@ def _run_chunked_ocr_pipeline(
         total_confidence = 0.0
         num_chunks = (total_pages + CHUNK_SIZE - 1) // CHUNK_SIZE
 
+        # Build set of pages to skip (deleted or excluded from OCR)
+        skip_pages: set[int] = set()
+        for rot in page_rotations:
+            if rot.deleted or not rot.included_for_ocr:
+                skip_pages.add(rot.page_number)
+        if skip_pages:
+            logger.info(f"Skipping {len(skip_pages)} excluded page(s): {sorted(skip_pages)}")
+
         if progress_callback:
             progress_callback(5, 100, _("Starting OCR..."))
 
@@ -103,6 +111,26 @@ def _run_chunked_ocr_pipeline(
                     chunk_start = chunk_idx * CHUNK_SIZE
                     chunk_end = min(chunk_start + CHUNK_SIZE, total_pages)
 
+                    # Check if ALL pages in this chunk are excluded — skip entirely
+                    chunk_page_nums = set(range(chunk_start + 1, chunk_end + 1))
+                    if chunk_page_nums <= skip_pages:
+                        # All pages in chunk are excluded; add blank placeholders
+                        for page_num in sorted(chunk_page_nums):
+                            abs_idx = page_num - 1
+                            if abs_idx < len(all_rotation_dicts):
+                                mb = all_rotation_dicts[abs_idx]["mediabox"]
+                                c.setPageSize((mb[2] - mb[0], mb[3] - mb[1]))
+                            else:
+                                c.setPageSize((595, 842))
+                            c.showPage()
+                            page_standalone_flags.append(False)
+                            stats.pages_processed += 1
+                        logger.info(
+                            f"Chunk {chunk_idx + 1}/{num_chunks} skipped "
+                            f"(all {len(chunk_page_nums)} pages excluded)"
+                        )
+                        continue
+
                     for f in images_dir.glob("*"):
                         try:
                             f.unlink()
@@ -113,6 +141,7 @@ def _run_chunked_ocr_pipeline(
                         input_pdf,
                         output_dir=images_dir,
                         page_range=(chunk_start + 1, chunk_end),
+                        skip_pages=skip_pages,
                     )
 
                     work_items = self._build_chunk_work_items(
@@ -201,7 +230,7 @@ def _build_chunk_work_items(
 
             if page_num in native_text_pages:
                 effective_path = None
-            elif rot.deleted:
+            elif rot.deleted or not rot.included_for_ocr:
                 effective_path = None
             else:
                 effective_path = img_path
diff --git a/src/bigocrpdf/services/rapidocr_service/pipeline_mixed_content.py b/src/bigocrpdf/services/rapidocr_service/pipeline_mixed_content.py
@@ -133,6 +133,14 @@ def _process_mixed_content_pdf(
             try:
                 self._wait_for_ocr_ready(ocr_proc)
                 with pikepdf.open(input_pdf, allow_overwriting_input=True) as pdf:
+                    # Build set of excluded pages from editor modifications
+                    excluded_pages: set[int] = set()
+                    if self.config.page_modifications:
+                        for mod in self.config.page_modifications:
+                            pn = mod.get("page_number")
+                            if pn and (mod.get("deleted") or not mod.get("included_for_ocr", True)):
+                                excluded_pages.add(pn)
+
                     self._ocr_image_pages(
                         pdf,
                         image_positions,
@@ -142,10 +150,21 @@ def _process_mixed_content_pdf(
                         ocr_texts,
                         ocr_proc,
                         progress_callback,
+                        excluded_pages=excluded_pages,
                     )
+
+                    # Remove excluded pages before saving
+                    if excluded_pages:
+                        for idx in sorted(excluded_pages, reverse=True):
+                            if 0 < idx <= len(pdf.pages):
+                                del pdf.pages[idx - 1]
+                                logger.info(f"Removed excluded page {idx} from output")
+
                     if progress_callback:
                         progress_callback(90, 100, _("Saving PDF..."))
-                    stats.pages_processed = len(image_positions)
+                    stats.pages_processed = len(image_positions) - len(
+                        excluded_pages & set(image_positions.keys())
+                    )
                     pdf.save(output_pdf)
             finally:
                 self._stop_ocr_subprocess(ocr_proc)
@@ -170,19 +189,28 @@ def _ocr_image_pages(
         ocr_texts: list[str],
         ocr_proc: subprocess.Popen,
         progress_callback: Callable[[int, int, str], None] | None,
+        excluded_pages: set[int] | None = None,
     ) -> None:
         """OCR all image-bearing pages, modifying the PDF in place."""
         enhance = getattr(self.config, "enhance_embedded_images", False)
         logger.info(f"Mixed content: enhance_embedded_images={enhance}")
         processed_images = 0
         current_img_idx = 0
+        _excluded = excluded_pages or set()
 
         for page_num in sorted(image_positions.keys()):
             if hasattr(self, "cancel_event") and self.cancel_event.is_set():
                 logger.info("Processing cancelled by user in mixed content mode")
                 raise InterruptedError("Processing cancelled by user")
 
             page_imgs = image_positions[page_num]
+
+            # Skip excluded pages entirely (no preprocessing, no OCR)
+            if page_num in _excluded:
+                logger.info(f"Page {page_num}: excluded from OCR, skipping ({len(page_imgs)} image(s))")
+                current_img_idx += len(page_imgs)
+                continue
+
             page = pdf.pages[page_num - 1]
             mediabox = page.mediabox
             page_height = float(mediabox[3]) - float(mediabox[1])
diff --git a/src/bigocrpdf/services/rapidocr_service/rotation.py b/src/bigocrpdf/services/rapidocr_service/rotation.py
@@ -34,6 +34,7 @@ class PageRotation:
     original_pdf_rotation: int = 0
     editor_rotation: int = 0
     deleted: bool = False
+    included_for_ocr: bool = True
     mediabox: list[float] | None = None
 
     @property
@@ -144,6 +145,7 @@ def apply_editor_modifications(
             mod = mod_lookup[rot.page_number]
             rot.editor_rotation = mod.get("rotation", 0)
             rot.deleted = mod.get("deleted", False)
+            rot.included_for_ocr = mod.get("included_for_ocr", True)
 
     return rotations
 
@@ -192,10 +194,11 @@ def apply_final_rotation_to_pdf(
                     f"Page {page_num}: {current_rot}° + {rot_info.editor_rotation}° = {new_rot}°"
                 )
 
-            # Mark for deletion
-            if rot_info.deleted:
+            # Mark for deletion (deleted or excluded from OCR)
+            if rot_info.deleted or not rot_info.included_for_ocr:
                 pages_to_delete.append(i)
-                logger.info(f"Page {page_num}: marked for deletion")
+                reason = "deleted" if rot_info.deleted else "excluded from OCR"
+                logger.info(f"Page {page_num}: marked for removal ({reason})")
 
         # Delete in reverse order
         for idx in reversed(pages_to_delete):
diff --git a/src/bigocrpdf/utils/pdf_utils.py b/src/bigocrpdf/utils/pdf_utils.py
@@ -206,10 +206,10 @@ def extract_images_for_odf(
 
 
 def open_file_with_default_app(file_path: str) -> bool:
-    """Open a file using the system's default application via Gtk.FileLauncher.
+    """Open a file using the system's default application.
 
-    Uses the GTK4 portal-aware launcher instead of raw subprocess calls,
-    which is non-blocking on the UI thread and Flatpak-compatible.
+    Uses Gio.AppInfo to directly launch the default handler without
+    showing an application chooser dialog.
 
     Args:
         file_path: Path to the file to open
@@ -224,17 +224,26 @@ def open_file_with_default_app(file_path: str) -> bool:
     try:
         import gi
 
-        gi.require_version("Gtk", "4.0")
         gi.require_version("Gio", "2.0")
-        from gi.repository import Gio, Gtk
+        from gi.repository import Gio
 
         gfile = Gio.File.new_for_path(file_path)
-        launcher = Gtk.FileLauncher.new(gfile)
-        launcher.launch(None, None, None)
+        uri = gfile.get_uri()
+        Gio.AppInfo.launch_default_for_uri(uri, None)
         return True
     except Exception as e:
-        logger.error(f"Failed to open file {file_path}: {e}")
-        return False
+        logger.warning(f"Gio launch failed for {file_path}: {e}, trying xdg-open")
+        try:
+            import subprocess
+            subprocess.Popen(
+                ["xdg-open", file_path],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+            return True
+        except Exception as e2:
+            logger.error(f"Failed to open file {file_path}: {e2}")
+            return False
 
 
 def render_pdf_page_to_png(