
Commit ad96aea

Update benchmark
1 parent 3af6a80 commit ad96aea

8 files changed

Lines changed: 239 additions & 188 deletions


README.md

Lines changed: 19 additions & 19 deletions
@@ -120,22 +120,22 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
-| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
-| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
-| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
-| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
-| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
-| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
-| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
-| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
-| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
-| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
-| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
-| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
-| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
-| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
-| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |

docs/benchmark.csv

Lines changed: 19 additions & 19 deletions
@@ -1,20 +1,20 @@
 Model,Mean Similarity,Std. Dev.,Time (s),Cost($)
-gemini-2.0-flash,0.829,0.102,7.41,0.00048
-gemini-2.0-flash-001,0.814,0.176,6.85,0.000421
-gemini-1.5-flash,0.797,0.143,9.54,0.000238
-gemini-2.0-pro-exp,0.764,0.227,11.95,TBA
-AUTO,0.760,0.184,5.14,0.000217
-gemini-2.0-flash-thinking-exp,0.746,0.266,10.46,TBA
-gemini-1.5-pro,0.732,0.265,11.44,0.003332
-accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks),0.687,0.221,8.07,0.000419
-gpt-4o,0.687,0.247,10.16,0.004736
-accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks),0.675,0.184,5.98,0.000226
-gpt-4o-mini,0.642,0.213,9.71,0.000275
-gemma-3-27b-it (via OpenRouter),0.628,0.299,18.79,0.000096
-gemini-1.5-flash-8b,0.551,0.223,3.91,0.000055
-Llama-Vision-Free (via Together AI),0.531,0.198,6.93,0
-Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI),0.524,0.192,3.68,0.00006
-qwen/qwen-2.5-vl-7b-instruct (via OpenRouter),0.482,0.209,11.53,0.000052
-Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI),0.461,0.306,19.26,0.000426
-Llama-3.2-11B-Vision-Instruct (via Hugging Face),0.451,0.257,4.54,0
-microsoft/phi-4-multimodal-instruct (via OpenRouter),0.366,0.287,10.8,0.000019
+AUTO,0.9055321411253106,0.11206803679187702,9.56385833566839,0.0006787363636363636
+gemini-2.5-flash,0.8946347816279391,0.1482605007689307,54.099135637283325,0.010509618181818182
+gemini-1.5-flash,0.8640438576932735,0.19414638131679113,15.467159444635564,0.0004385318181818182
+gemini-2.0-flash,0.8973145898835583,0.1264561140762329,9.905281933871182,0.0007786000000000001
+gemini-2.5-pro,0.8490041811304497,0.29783800660442017,101.94726826927878,0.018589318181818183
+gemini-1.5-pro,0.8677550096512131,0.2834151596237667,15.033603364771063,0.006371590909090909
+meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo,0.5457486292722399,0.23911002078944002,29.259384870529175,0.011025927272727273
+google/gemma-3-27b-it,0.681351785269864,0.3342178855563382,19.41191567074169,0.00026869090909090914
+accounts/fireworks/models/llama4-maverick-instruct-basic,0.7677804033672144,0.23377978676593608,12.122042699293656,0.00150424
+microsoft/phi-4-multimodal-instruct,0.6218563462318033,0.3203799068287538,13.148698221553456,0.0005001727272727273
+qwen/qwen-2.5-vl-7b-instruct,0.5585216467675268,0.347746697400353,17.708573601462625,0.0008551818181818182
+claude-opus-4-20250514,0.7722977101059199,0.23820952402930706,20.034648851914838,0.09206590909090909
+claude-sonnet-4-20250514,0.8042789811996047,0.18950547739236565,19.26913606036793,0.020705454545454548
+claude-3-7-sonnet-20250219,0.6333232182237497,0.3691448669615299,14.244156620719217,0.01763318181818182
+claude-3-5-sonnet-20241022,0.8510506363972368,0.20859878148975006,15.994478832591664,0.01757590909090909
+gpt-4.1,0.6502514117129463,0.3419725663519831,33.72405509515242,0.014434545454545453
+gpt-4.1-mini,0.7228987200296618,0.2692251738340237,20.910677563060414,0.0035145090909090913
+gpt-4o,0.747764892526804,0.2839489784158823,26.799152395941995,0.014775909090909091
+gpt-4o-mini,0.7329809456847031,0.23116793518719836,18.175196842713788,0.006497863636363636
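The CSV keeps every metric at full float precision, while the README table above shows the same numbers rounded and ranked. A minimal sketch of how one might reproduce that ranking locally, assuming pandas is available and that the rows are ordered by descending Mean Similarity (the sort step itself is not part of this diff):

```python
# Sketch: rank the raw CSV rows the way the README table is ordered.
# Assumptions: pandas is installed, and a descending sort on
# "Mean Similarity" produces the ranks; the actual ranking logic is
# not shown in this commit.
import pandas as pd

df = pd.read_csv("docs/benchmark.csv")
ranked = df.sort_values("Mean Similarity", ascending=False).reset_index(drop=True)

top = ranked.iloc[0]
print(top["Model"], round(top["Mean Similarity"], 3), round(top["Cost($)"], 5))
# -> AUTO 0.906 0.00068
```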

docs/benchmark.rst

Lines changed: 95 additions & 95 deletions
@@ -79,117 +79,117 @@ Here are the detailed parsing performance results for various models:
      - Time (s)
      - Cost ($)
    * - 1
-     - gemini-2.0-flash
-     - 0.829
-     - 0.102
-     - 7.41
-     - 0.00048
+     - AUTO
+     - 0.906
+     - 0.112
+     - 9.56
+     - 0.00068
    * - 2
-     - gemini-2.0-flash-001
-     - 0.814
-     - 0.176
-     - 6.85
-     - 0.000421
+     - gemini-2.0-flash
+     - 0.897
+     - 0.126
+     - 9.91
+     - 0.00078
    * - 3
-     - gemini-1.5-flash
-     - 0.797
-     - 0.143
-     - 9.54
-     - 0.000238
+     - gemini-2.5-flash
+     - 0.895
+     - 0.148
+     - 54.10
+     - 0.01051
    * - 4
-     - gemini-2.0-pro-exp
-     - 0.764
-     - 0.227
-     - 11.95
-     - TBA
+     - gemini-1.5-pro
+     - 0.868
+     - 0.283
+     - 15.03
+     - 0.00637
    * - 5
-     - AUTO
-     - 0.76
-     - 0.184
-     - 5.14
-     - 0.000217
+     - gemini-1.5-flash
+     - 0.864
+     - 0.194
+     - 15.47
+     - 0.00044
    * - 6
-     - gemini-2.0-flash-thinking-exp
-     - 0.746
-     - 0.266
-     - 10.46
-     - TBA
+     - claude-3-5-sonnet-20241022
+     - 0.851
+     - 0.209
+     - 15.99
+     - 0.01758
    * - 7
-     - gemini-1.5-pro
-     - 0.732
-     - 0.265
-     - 11.44
-     - 0.003332
+     - gemini-2.5-pro
+     - 0.849
+     - 0.298
+     - 101.95
+     - 0.01859
    * - 8
-     - accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks)
-     - 0.687
-     - 0.221
-     - 8.07
-     - 0.000419
+     - claude-sonnet-4-20250514
+     - 0.804
+     - 0.190
+     - 19.27
+     - 0.02071
    * - 9
-     - gpt-4o
-     - 0.687
-     - 0.247
-     - 10.16
-     - 0.004736
+     - claude-opus-4-20250514
+     - 0.772
+     - 0.238
+     - 20.03
+     - 0.09207
    * - 10
-     - accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks)
-     - 0.675
-     - 0.184
-     - 5.98
-     - 0.000226
+     - accounts/fireworks/models/llama4-maverick-instruct-basic
+     - 0.768
+     - 0.234
+     - 12.12
+     - 0.00150
    * - 11
-     - gpt-4o-mini
-     - 0.642
-     - 0.213
-     - 9.71
-     - 0.000275
+     - gpt-4o
+     - 0.748
+     - 0.284
+     - 26.80
+     - 0.01478
    * - 12
-     - gemma-3-27b-it (via OpenRouter)
-     - 0.628
-     - 0.299
-     - 18.79
-     - 0.000096
+     - gpt-4o-mini
+     - 0.733
+     - 0.231
+     - 18.18
+     - 0.00650
    * - 13
-     - gemini-1.5-flash-8b
-     - 0.551
-     - 0.223
-     - 3.91
-     - 0.000055
+     - gpt-4.1-mini
+     - 0.723
+     - 0.269
+     - 20.91
+     - 0.00351
    * - 14
-     - Llama-Vision-Free (via Together AI)
-     - 0.531
-     - 0.198
-     - 6.93
-     - 0
+     - google/gemma-3-27b-it
+     - 0.681
+     - 0.334
+     - 19.41
+     - 0.00027
    * - 15
-     - Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI)
-     - 0.524
-     - 0.192
-     - 3.68
-     - 0.00006
+     - gpt-4.1
+     - 0.650
+     - 0.342
+     - 33.72
+     - 0.01443
    * - 16
-     - qwen/qwen-2.5-vl-7b-instruct (via OpenRouter)
-     - 0.482
-     - 0.209
-     - 11.53
-     - 0.000052
+     - claude-3-7-sonnet-20250219
+     - 0.633
+     - 0.369
+     - 14.24
+     - 0.01763
    * - 17
-     - Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI)
-     - 0.461
-     - 0.306
-     - 19.26
-     - 0.000426
+     - microsoft/phi-4-multimodal-instruct
+     - 0.622
+     - 0.320
+     - 13.15
+     - 0.00050
    * - 18
-     - Llama-3.2-11B-Vision-Instruct (via Hugging Face)
-     - 0.451
-     - 0.257
-     - 4.54
-     - 0
+     - qwen/qwen-2.5-vl-7b-instruct
+     - 0.559
+     - 0.348
+     - 17.71
+     - 0.00086
    * - 19
-     - microsoft/phi-4-multimodal-instruct (via OpenRouter)
-     - 0.366
-     - 0.287
-     - 10.8
-     - 0.000019
+     - meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+     - 0.546
+     - 0.239
+     - 29.26
+     - 0.01103

docs/update_benchmarks.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def generate_markdown_table(df):
     header = "| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |\n"
     sep = "| --- | --- | --- | --- | --- | --- |\n"
     rows = [
-        f"| {i+1} | {row['Model']} | {row['Mean Similarity']} | {row['Std. Dev.']} | {row['Time (s)']} | {row['Cost($)']} |"
+        f"| {i+1} | {row['Model']} | {row['Mean Similarity']:.3f} | {row['Std. Dev.']:.3f} | {row['Time (s)']:.2f} | {row['Cost($)']:.5f} |"
         for i, row in df.iterrows()
     ]
     return header + sep + "\n".join(rows)
@@ -27,7 +27,7 @@ def generate_markdown_table(df):
 def generate_rst_table(df):
     header = "\n * - Rank\n - Model\n - Mean Similarity\n - Std. Dev.\n - Time (s)\n - Cost ($)"
     rows = [
-        f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']}\n - {row['Std. Dev.']}\n - {row['Time (s)']}\n - {row['Cost($)']}"
+        f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']:.3f}\n - {row['Std. Dev.']:.3f}\n - {row['Time (s)']:.2f}\n - {row['Cost($)']:.5f}"
         for i, row in df.iterrows()
     ]
     return header + "\n" + "\n".join(rows)
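The change to both table generators is the addition of fixed-width format specifiers, so the generated tables stay readable no matter how many decimals the raw CSV carries. A quick illustration of what `:.3f`, `:.2f`, and `:.5f` do to one raw row (values taken from the docs/benchmark.csv diff in this commit):

```python
# Illustration of the format specs added above, applied to the raw
# gemini-2.0-flash row from docs/benchmark.csv.
mean_sim = 0.8973145898835583
std_dev = 0.1264561140762329
time_s = 9.905281933871182
cost = 0.0007786

print(f"| 2 | gemini-2.0-flash | {mean_sim:.3f} | {std_dev:.3f} | {time_s:.2f} | {cost:.5f} |")
# -> | 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
```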

examples/outputs/benchmark.md

Lines changed: 5 additions & 1 deletion
@@ -39,7 +39,9 @@ After horizontal line
 
 Here comes a link: [example-link](https://www.example.com).
 
-Email: <mail@example.com>
+1
+
+Email: mail@example.com
 
 Here comes Python code:
 
@@ -69,6 +71,8 @@ And a second table:
 | **A2** | _data 11_ | _data 12_ |
 | **A3** | _data 21_ | _data 22_ |
 
+2
+
 
 <div style="background-color: #FFFFE0; padding: 10px; font-family: 'Comic Sans MS', 'Chalkboard SE', 'Bradley Hand', cursive, sans-serif; line-height: 1.8; border: 1px solid #E0E0C1;">

examples/outputs/costco_bill.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 <div>
-<h1>Costco</h1>
+<h1>COSTCO</h1>
 <p>WHOLESALE</p>
 <p>
 Irvine #454<br>
@@ -19,7 +19,7 @@ Irvine, CA 92618<br>
 <tr><td>E</td><td>370586</td><td>ORG. DATES</td><td>11.99</td><td></td></tr>
 <tr><td>E</td><td>1280655</td><td>ORG CSR KIT</td><td>8.99</td><td></td></tr>
 <tr><td>E</td><td>1280655</td><td>ORG CSR KIT</td><td>8.99</td><td></td></tr>
-<tr><td>E</td><td>161750</td><td>KS UNS CASHE</td><td>13.99</td><td></td></tr>
+<tr><td>E</td><td>1617507</td><td>KS UNS CASHE</td><td>13.99</td><td></td></tr>
 <tr><td>E</td><td>1308623</td><td>SUJA WELLNES</td><td>15.39</td><td></td></tr>
 <tr><td>E</td><td>1900000000</td><td>CA REDEMP VA</td><td>0.50</td><td></td></tr>
 <tr><td>E</td><td>1308623</td><td>SUJA WELLNES</td><td>15.39</td><td></td></tr>
