@@ -133,6 +133,14 @@ def _process_mixed_content_pdf(
133133 try :
134134 self ._wait_for_ocr_ready (ocr_proc )
135135 with pikepdf .open (input_pdf , allow_overwriting_input = True ) as pdf :
136+ # Build set of excluded pages from editor modifications
137+ excluded_pages : set [int ] = set ()
138+ if self .config .page_modifications :
139+ for mod in self .config .page_modifications :
140+ pn = mod .get ("page_number" )
141+ if pn and (mod .get ("deleted" ) or not mod .get ("included_for_ocr" , True )):
142+ excluded_pages .add (pn )
143+
136144 self ._ocr_image_pages (
137145 pdf ,
138146 image_positions ,
@@ -142,10 +150,21 @@ def _process_mixed_content_pdf(
142150 ocr_texts ,
143151 ocr_proc ,
144152 progress_callback ,
153+ excluded_pages = excluded_pages ,
145154 )
155+
156+ # Remove excluded pages before saving
157+ if excluded_pages :
158+ for idx in sorted (excluded_pages , reverse = True ):
159+ if 0 < idx <= len (pdf .pages ):
160+ del pdf .pages [idx - 1 ]
161+ logger .info (f"Removed excluded page { idx } from output" )
162+
146163 if progress_callback :
147164 progress_callback (90 , 100 , _ ("Saving PDF..." ))
148- stats .pages_processed = len (image_positions )
165+ stats .pages_processed = len (image_positions ) - len (
166+ excluded_pages & set (image_positions .keys ())
167+ )
149168 pdf .save (output_pdf )
150169 finally :
151170 self ._stop_ocr_subprocess (ocr_proc )
@@ -170,19 +189,28 @@ def _ocr_image_pages(
170189 ocr_texts : list [str ],
171190 ocr_proc : subprocess .Popen ,
172191 progress_callback : Callable [[int , int , str ], None ] | None ,
192+ excluded_pages : set [int ] | None = None ,
173193 ) -> None :
174194 """OCR all image-bearing pages, modifying the PDF in place."""
175195 enhance = getattr (self .config , "enhance_embedded_images" , False )
176196 logger .info (f"Mixed content: enhance_embedded_images={ enhance } " )
177197 processed_images = 0
178198 current_img_idx = 0
199+ _excluded = excluded_pages or set ()
179200
180201 for page_num in sorted (image_positions .keys ()):
181202 if hasattr (self , "cancel_event" ) and self .cancel_event .is_set ():
182203 logger .info ("Processing cancelled by user in mixed content mode" )
183204 raise InterruptedError ("Processing cancelled by user" )
184205
185206 page_imgs = image_positions [page_num ]
207+
208+ # Skip excluded pages entirely (no preprocessing, no OCR)
209+ if page_num in _excluded :
210+ logger .info (f"Page { page_num } : excluded from OCR, skipping ({ len (page_imgs )} image(s))" )
211+ current_img_idx += len (page_imgs )
212+ continue
213+
186214 page = pdf .pages [page_num - 1 ]
187215 mediabox = page .mediabox
188216 page_height = float (mediabox [3 ]) - float (mediabox [1 ])
0 commit comments