2020_FITZ_TIMEOUT = 300 # seconds
2121
2222
23- def _fitz_open_with_timeout (filepath : str , timeout : int = _FITZ_TIMEOUT ):
24- """Open a PDF with fitz, raising TimeoutError if it hangs beyond `timeout` seconds."""
23+ def _fitz_open_with_timeout (filepath : str , timeout : int = _FITZ_TIMEOUT , should_stop = None ):
24+ """Open a PDF with fitz, raising TimeoutError if it hangs beyond `timeout` seconds.
25+
26+ If a `should_stop` callable is provided, it is polled roughly every 0.5 s while
27+ waiting; once it returns True the wait is abandoned and TimeoutError is raised so the caller can exit cleanly.
28+ """
2529 result = [None ]
2630 exc = [None ]
2731
@@ -33,7 +37,14 @@ def _open():
3337
3438 t = threading .Thread (target = _open , daemon = True )
3539 t .start ()
36- t .join (timeout )
40+ deadline = timeout
41+ poll_interval = 0.5 # check stop flag every 500ms
42+ elapsed = 0.0
43+ while t .is_alive () and elapsed < deadline :
44+ t .join (poll_interval )
45+ elapsed += poll_interval
46+ if should_stop and should_stop ():
47+ raise TimeoutError (f"fitz.open() aborted by stop request for { filepath } " )
3748 if t .is_alive ():
3849 raise TimeoutError (f"fitz.open() timed out after { timeout } s for { filepath } " )
3950 if exc [0 ] is not None :
@@ -85,12 +96,12 @@ def guess_category(filepath: str) -> str:
8596 return "core"
8697
8798
88- def generate_thumbnail (filepath : str , output_path : str , size : tuple = (300 , 400 )) -> bool :
99+ def generate_thumbnail (filepath : str , output_path : str , size : tuple = (300 , 400 ), should_stop = None ) -> bool :
89100 """Generate a thumbnail from the first page of a PDF or from an image."""
90101 try :
91102 ext = Path (filepath ).suffix .lower ()
92103 if ext == ".pdf" :
93- doc = _fitz_open_with_timeout (filepath )
104+ doc = _fitz_open_with_timeout (filepath , should_stop = should_stop )
94105 if len (doc ) == 0 :
95106 return False
96107 page = doc [0 ]
@@ -114,11 +125,11 @@ def generate_thumbnail(filepath: str, output_path: str, size: tuple = (300, 400)
114125 return False
115126
116127
117- def extract_text_from_pdf (filepath : str ) -> list [dict ]:
128+ def extract_text_from_pdf (filepath : str , should_stop = None ) -> list [dict ]:
118129 """Extract text from all pages of a PDF. Returns list of {page, content}."""
119130 pages = []
120131 try :
121- doc = _fitz_open_with_timeout (filepath )
132+ doc = _fitz_open_with_timeout (filepath , should_stop = should_stop )
122133 for i , page in enumerate (doc ):
123134 page_text = page .get_text ().strip ()
124135 if page_text :
@@ -140,11 +151,13 @@ def _count_eligible_files(directory: Path, extensions: set) -> int:
140151 return count
141152
142153
143- def scan_library (library_path : str , data_path : str , session : Session , on_progress = None ):
154+ def scan_library (library_path : str , data_path : str , session : Session , on_progress = None , should_stop = None ):
144155 """Scan the library directory and register all files in the database.
145156
146157 on_progress(scanned_books, total_books, scanned_maps, total_maps, scanned_tokens, total_tokens)
147158 is called after each file is processed if provided.
159+
160+ should_stop() is an optional callable that returns True when the scan should abort early.
148161 """
149162 library = Path (library_path )
150163 books_dir = library / "books"
@@ -214,13 +227,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
214227 scanned_tokens ,
215228 total_tokens ,
216229 )
230+ if should_stop and should_stop ():
231+ logger .info ("scan_library: stop requested during books scan." )
232+ return stats
217233
218234 relative_path = os .path .relpath (filepath , library_path )
219235
220236 existing = session .query (Book ).filter_by (filepath = filepath ).first ()
221237 if existing :
238+ logger .debug (f"File scan: already registered, skipping: { filename } " )
222239 continue
223240
241+ logger .debug (f"File scan: new book found: { filename } " )
224242 category = guess_category (relative_path )
225243 title = Path (filename ).stem .replace ("_" , " " ).replace ("-" , " " ).strip ()
226244
@@ -246,12 +264,12 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
246264 "books" ,
247265 f"{ slugify (title )} _{ hashlib .md5 (filepath .encode ()).hexdigest ()[:8 ]} .webp" ,
248266 )
249- if generate_thumbnail (filepath , thumb_path ):
267+ if generate_thumbnail (filepath , thumb_path , should_stop = should_stop ):
250268 book .has_thumbnail = True
251269
252270 if ext == ".pdf" :
253271 try :
254- doc = _fitz_open_with_timeout (filepath )
272+ doc = _fitz_open_with_timeout (filepath , should_stop = should_stop )
255273 book .page_count = len (doc )
256274 doc .close ()
257275 except Exception as e :
@@ -291,13 +309,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
291309 scanned_tokens ,
292310 total_tokens ,
293311 )
312+ if should_stop and should_stop ():
313+ logger .info ("scan_library: stop requested during maps scan." )
314+ return stats
294315
295316 relative_path = os .path .relpath (filepath , library_path )
296317
297318 existing = session .query (GenericMap ).filter_by (filepath = filepath ).first ()
298319 if existing :
320+ logger .debug (f"File scan: already registered, skipping: { filename } " )
299321 continue
300322
323+ logger .debug (f"File scan: new map found: { filename } " )
301324 title = Path (filename ).stem .replace ("_" , " " ).replace ("-" , " " ).strip ()
302325
303326 try :
@@ -318,7 +341,7 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
318341 "maps" ,
319342 f"{ slugify (title )} _{ hashlib .md5 (filepath .encode ()).hexdigest ()[:8 ]} .webp" ,
320343 )
321- if generate_thumbnail (filepath , thumb_path ):
344+ if generate_thumbnail (filepath , thumb_path , should_stop = should_stop ):
322345 gmap .has_thumbnail = True
323346
324347 session .add (gmap )
@@ -353,13 +376,18 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
353376 scanned_tokens ,
354377 total_tokens ,
355378 )
379+ if should_stop and should_stop ():
380+ logger .info ("scan_library: stop requested during tokens scan." )
381+ return stats
356382
357383 relative_path = os .path .relpath (filepath , library_path )
358384
359385 existing = session .query (Token ).filter_by (filepath = filepath ).first ()
360386 if existing :
387+ logger .debug (f"File scan: already registered, skipping: { filename } " )
361388 continue
362389
390+ logger .debug (f"File scan: new token found: { filename } " )
363391 title = Path (filename ).stem .replace ("_" , " " ).replace ("-" , " " ).strip ()
364392
365393 try :
@@ -380,7 +408,7 @@ def scan_library(library_path: str, data_path: str, session: Session, on_progres
380408 "tokens" ,
381409 f"{ slugify (title )} _{ hashlib .md5 (filepath .encode ()).hexdigest ()[:8 ]} .webp" ,
382410 )
383- if generate_thumbnail (filepath , thumb_path , size = (200 , 200 )):
411+ if generate_thumbnail (filepath , thumb_path , size = (200 , 200 ), should_stop = should_stop ):
384412 token .has_thumbnail = True
385413
386414 session .add (token )
@@ -499,12 +527,12 @@ def _apply_tags_from_library(library_path: str, session: Session) -> None:
499527 session .commit ()
500528
501529
502- def index_book_text (book : Book , data_path : str , session : Session ):
530+ def index_book_text (book : Book , data_path : str , session : Session , should_stop = None ):
503531 """Extract and index text from a PDF for full-text search."""
504532 if book .indexed or book .index_failed or book .mime_type != "application/pdf" :
505533 return False
506534
507- pages = extract_text_from_pdf (book .filepath )
535+ pages = extract_text_from_pdf (book .filepath , should_stop = should_stop )
508536 if not pages :
509537 book .index_error = "No text extracted"
510538 book .index_failed = True
@@ -523,5 +551,5 @@ def index_book_text(book: Book, data_path: str, session: Session):
523551 book .indexed = True
524552 book .index_error = ""
525553 session .commit ()
526- logger .info (f"Indexed { len (pages )} pages for: { book .title } " )
554+ logger .info (f"Indexed { len (pages )} pages for: { book .filename } (' { book . title } ') " )
527555 return True
0 commit comments