77from itertools import groupby
88from pathlib import Path
99from typing import Any , Dict , Iterable , List , Optional , Tuple , Union
10+ from urllib .parse import urlparse
1011
1112from playwright .async_api import Page
1213from pydantic import BaseModel , Field
1819# UTILITY FUNCTIONS
1920# ============================================================================
2021
22+
2123def get_time () -> str :
2224 """Get the current time as a formatted string.
2325
@@ -224,19 +226,24 @@ async def _detect_page_type(self, page: Page) -> Tuple[str, Optional[str]]:
224226 - ("UNSUPPORTED_PAGE", "download") for download files
225227 """
226228 try :
227- # === Layer 1: URL Extension Check (Fastest, Most Reliable) ===
229+ # === Layer 1: URL Path Extension Check (Fastest, Most Reliable) ===
230+ # IMPORTANT: Only check path component, NOT query params or fragments
231+ # This correctly handles: https://example.com/page?file=test.pdf (NOT a PDF page)
232+ # vs: https://example.com/documents/file.pdf (IS a PDF page)
228233 url = page .url .lower ()
234+ parsed_url = urlparse (url )
235+ url_path = parsed_url .path # Only the path: "/documents/file.pdf"
229236
230- # PDF detection via URL
231- if url .endswith ('.pdf' ):
232- logging .info (f'Detected PDF via URL suffix : { url } ' )
237+ # PDF detection via URL path extension
238+ if url_path .endswith ('.pdf' ):
239+ logging .info (f'Detected PDF via URL path : { url_path } (full URL: { url } ) ' )
233240 return ('UNSUPPORTED_PAGE' , 'pdf' )
234241
235- # Download file detection via URL
236- download_extensions = ['.zip' , '.rar' , '.exe' , '.dmg' , '.pkg' , '.deb' , '.tar' , '.gz' ]
242+ # Download file detection via URL path extension
243+ download_extensions = ['.zip' , '.rar' , '.exe' , '.dmg' , '.pkg' , '.deb' , '.tar' , '.gz' , '.7z' ]
237244 for ext in download_extensions :
238- if url .endswith (ext ):
239- logging .info (f'Detected download file via URL: { url } ' )
245+ if url_path .endswith (ext ):
246+ logging .info (f'Detected download file via URL path : { url_path } ' )
240247 return ('UNSUPPORTED_PAGE' , 'download' )
241248
242249 # === Layer 2: PDF Embed Element Detection (Catches Chromium PDF Viewer) ===
@@ -312,12 +319,9 @@ async def crawl(
312319 # Multi-layer detection of unsupported page types (PDF, plugins, etc.)
313320 page_status , page_type = await self ._detect_page_type (page )
314321 if page_status == 'UNSUPPORTED_PAGE' :
315- logging .warning (f'Detected unsupported page type: { page_type } , skipping crawl' )
316- return CrawlResultModel (
317- flat_element_map = ElementMap (data = {}),
318- element_tree = {},
319- page_status = page_status ,
320- page_type = page_type
322+ logging .info (
323+ f'Detected unsupported page type: { page_type } , '
324+ f'attempting to extract available data in degraded mode'
321325 )
322326
323327 try :
@@ -327,7 +331,9 @@ async def crawl(
327331 _ , merged_id_map = await self .crawl_all_frames (page = page , enable_highlight = highlight )
328332 return CrawlResultModel (
329333 flat_element_map = ElementMap (data = merged_id_map or {}),
330- element_tree = {}
334+ element_tree = {},
335+ page_status = page_status , # Pass detected page status for multi-frame pages
336+ page_type = page_type # Pass detected page type for multi-frame pages
331337 )
332338 except Exception :
333339 pass
@@ -350,7 +356,9 @@ async def crawl(
350356
351357 result = CrawlResultModel (
352358 flat_element_map = ElementMap (data = flat_elements or {}),
353- element_tree = self .element_tree or {}
359+ element_tree = self .element_tree or {},
360+ page_status = page_status , # Pass detected page status
361+ page_type = page_type # Pass detected page type
354362 )
355363
356364 if cache_dom and self .element_tree :
@@ -372,7 +380,10 @@ async def crawl(
372380
373381 except Exception as e :
374382 logging .error (f'JavaScript injection failed during element detection: { e } ' )
375- return CrawlResultModel ()
383+ return CrawlResultModel (
384+ page_status = page_status , # Pass detected page status even on failure
385+ page_type = page_type # Pass detected page type even on failure
386+ )
376387
377388 def extract_interactive_elements (self , get_new_elems : bool = False ) -> Dict :
378389 """Extract interactive elements with comprehensive attribute
@@ -541,7 +552,7 @@ async def _accumulate_iframe_offsets(f):
541552 el = await cur .frame_element ()
542553 rect = await el .evaluate ('(el) => el.getBoundingClientRect()' )
543554 total_left += rect .get ('left' , 0 ) or 0
544- total_top += rect .get ('top' , 0 ) or 0
555+ total_top += rect .get ('top' , 0 ) or 0
545556 except Exception :
546557 pass
547558 cur = parent
@@ -579,7 +590,7 @@ async def _accumulate_iframe_offsets(f):
579590
580591 # Get frame URL for later action execution in correct frame context
581592 frame_url = frame .url
582-
593+
583594 for k , v in (iframe_id_map or {}).items ():
584595 try :
585596 # frame document -> frame viewport
0 commit comments