@@ -338,36 +338,65 @@ def fetch_contents_for_results(results, max_fetch=None,
338338 len (to_fetch ), target_ok , max_chars )
339339 t0 = time .time ()
340340 ok_count = 0
341+ url_timings = [] # list of (url, elapsed, ok, chars, method)
341342 def _do (r ):
342- return r , fetch_page_content (r ['url' ], max_chars = max_chars ,
343+ url = r ['url' ]
344+ fetch_t0 = time .time ()
345+ content = fetch_page_content (url , max_chars = max_chars ,
343346 pdf_max_chars = _lib .FETCH_MAX_CHARS_PDF )
344- with ThreadPoolExecutor (max_workers = 8 ) as pool :
347+ fetch_elapsed = time .time () - fetch_t0
348+ return r , content , fetch_elapsed
349+ with ThreadPoolExecutor (max_workers = 16 ) as pool :
345350 futs = {pool .submit (_do , r ): r for r in to_fetch }
346351 pending = set (futs .keys ())
352+ cancelled_urls = []
347353 try :
348354 for fut in as_completed (futs , timeout = 90 ):
349355 pending .discard (fut )
350356 try :
351- result , content = fut .result ()
352- if content and len (content ) > 50 :
357+ result , content , fetch_elapsed = fut .result ()
358+ url = result ['url' ]
359+ ok = bool (content and len (content ) > 50 )
360+ chars = len (content ) if content else 0
361+ url_timings .append ((url , fetch_elapsed , ok , chars ))
362+ if ok :
353363 result ['full_content' ] = content
354364 ok_count += 1
365+ # Log individual slow fetches (>5s)
366+ if fetch_elapsed > 5 :
367+ logger .info ('[Fetch] ⚠ SLOW url=%.80s %.1fs ok=%s chars=%d' ,
368+ url , fetch_elapsed , ok , chars )
355369 except Exception as e :
356370 logger .warning ('[Fetch] fetch_contents thread error: %s' , e , exc_info = True )
357371 # Race-to-N: once we have enough content, stop waiting
358372 if ok_count >= target_ok and pending :
359373 elapsed_so_far = time .time () - t0
374+ cancelled_urls = [futs [p ]['url' ][:60 ] for p in pending ]
360375 logger .info ('[Fetch] Race-to-N: got %d/%d pages in %.1fs, '
361- 'cancelling %d slow fetches' ,
376+ 'cancelling %d slow fetches: %s ' ,
362377 ok_count , len (to_fetch ), elapsed_so_far ,
363- len (pending ))
378+ len (pending ),
379+ ', ' .join (cancelled_urls [:5 ]))
364380 for p in pending :
365381 p .cancel ()
366382 break
367383 except TimeoutError :
368384 logger .warning ('[Fetch] fetch_contents: as_completed timeout (90s)' , exc_info = True )
369385 elapsed = time .time () - t0
370- logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs' , ok_count , len (to_fetch ), elapsed )
386+
387+ # Summarize URL timing breakdown
388+ if url_timings :
389+ url_timings .sort (key = lambda x : - x [1 ]) # slowest first
390+ slow_summary = ' ' .join (
391+ f'[{ "✓" if ok else "✗" } ]{ url [:50 ]} ={ et :.1f} s'
392+ for url , et , ok , _chars in url_timings [:8 ]
393+ )
394+ logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs '
395+ 'slowest: %s' ,
396+ ok_count , len (to_fetch ), elapsed , slow_summary )
397+ else :
398+ logger .info ('[Fetch] fetch_contents done: %d/%d got content in %.1fs' ,
399+ ok_count , len (to_fetch ), elapsed )
371400 return results
372401
373402
@@ -385,7 +414,7 @@ def _do(u):
385414 pdf_max_chars = pdf_max_chars , timeout = timeout )
386415 # Total deadline = per-request timeout + generous buffer for download + parsing
387416 deadline = max (timeout * 4 , 120 )
388- with ThreadPoolExecutor (max_workers = 4 ) as pool :
417+ with ThreadPoolExecutor (max_workers = 8 ) as pool :
389418 futs = {pool .submit (_do , u ): u for u in urls }
390419 done_count = 0
391420 try :
@@ -418,3 +447,17 @@ def extract_urls_from_text(text):
418447 u = u .rstrip ('.,;:!?' )
419448 if u not in seen and len (u ) > 10 : seen .add (u ); unique .append (u )
420449 return unique [:5 ]
450+
451+
452+
def get_fetch_cache_stats() -> dict:
    """Build a diagnostics snapshot covering every fetch-layer cache.

    Intended for admin endpoints and periodic health checks; the snapshot
    exposes hit rates, eviction counts, and current sizes.

    Returns:
        dict: keys 'fetch_cache', 'html_head_cache', and 'circuit_breaker'
        mapping to each component's stats/status object.
    """
    snapshot = {}
    # Cache objects expose a `.stats` attribute; the circuit breaker
    # reports through its `get_status()` accessor instead.
    snapshot['fetch_cache'] = _fetch_cache.stats
    snapshot['html_head_cache'] = _html_head_cache.stats
    snapshot['circuit_breaker'] = _circuit.get_status()
    return snapshot
0 commit comments