
Commit ad96aea

Update benchmark
1 parent 3af6a80 commit ad96aea

8 files changed

Lines changed: 239 additions & 188 deletions


README.md

Lines changed: 19 additions & 19 deletions
@@ -120,22 +120,22 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
-| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
-| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
-| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
-| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
-| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
-| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
-| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
-| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
-| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
-| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
-| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
-| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
-| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
-| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
-| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |

docs/benchmark.csv

Lines changed: 19 additions & 19 deletions
@@ -1,20 +1,20 @@
 Model,Mean Similarity,Std. Dev.,Time (s),Cost($)
-gemini-2.0-flash,0.829,0.102,7.41,0.00048
-gemini-2.0-flash-001,0.814,0.176,6.85,0.000421
-gemini-1.5-flash,0.797,0.143,9.54,0.000238
-gemini-2.0-pro-exp,0.764,0.227,11.95,TBA
-AUTO,0.760,0.184,5.14,0.000217
-gemini-2.0-flash-thinking-exp,0.746,0.266,10.46,TBA
-gemini-1.5-pro,0.732,0.265,11.44,0.003332
-accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks),0.687,0.221,8.07,0.000419
-gpt-4o,0.687,0.247,10.16,0.004736
-accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks),0.675,0.184,5.98,0.000226
-gpt-4o-mini,0.642,0.213,9.71,0.000275
-gemma-3-27b-it (via OpenRouter),0.628,0.299,18.79,0.000096
-gemini-1.5-flash-8b,0.551,0.223,3.91,0.000055
-Llama-Vision-Free (via Together AI),0.531,0.198,6.93,0
-Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI),0.524,0.192,3.68,0.00006
-qwen/qwen-2.5-vl-7b-instruct (via OpenRouter),0.482,0.209,11.53,0.000052
-Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI),0.461,0.306,19.26,0.000426
-Llama-3.2-11B-Vision-Instruct (via Hugging Face),0.451,0.257,4.54,0
-microsoft/phi-4-multimodal-instruct (via OpenRouter),0.366,0.287,10.8,0.000019
+AUTO,0.9055321411253106,0.11206803679187702,9.56385833566839,0.0006787363636363636
+gemini-2.5-flash,0.8946347816279391,0.1482605007689307,54.099135637283325,0.010509618181818182
+gemini-1.5-flash,0.8640438576932735,0.19414638131679113,15.467159444635564,0.0004385318181818182
+gemini-2.0-flash,0.8973145898835583,0.1264561140762329,9.905281933871182,0.0007786000000000001
+gemini-2.5-pro,0.8490041811304497,0.29783800660442017,101.94726826927878,0.018589318181818183
+gemini-1.5-pro,0.8677550096512131,0.2834151596237667,15.033603364771063,0.006371590909090909
+meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo,0.5457486292722399,0.23911002078944002,29.259384870529175,0.011025927272727273
+google/gemma-3-27b-it,0.681351785269864,0.3342178855563382,19.41191567074169,0.00026869090909090914
+accounts/fireworks/models/llama4-maverick-instruct-basic,0.7677804033672144,0.23377978676593608,12.122042699293656,0.00150424
+microsoft/phi-4-multimodal-instruct,0.6218563462318033,0.3203799068287538,13.148698221553456,0.0005001727272727273
+qwen/qwen-2.5-vl-7b-instruct,0.5585216467675268,0.347746697400353,17.708573601462625,0.0008551818181818182
+claude-opus-4-20250514,0.7722977101059199,0.23820952402930706,20.034648851914838,0.09206590909090909
+claude-sonnet-4-20250514,0.8042789811996047,0.18950547739236565,19.26913606036793,0.020705454545454548
+claude-3-7-sonnet-20250219,0.6333232182237497,0.3691448669615299,14.244156620719217,0.01763318181818182
+claude-3-5-sonnet-20241022,0.8510506363972368,0.20859878148975006,15.994478832591664,0.01757590909090909
+gpt-4.1,0.6502514117129463,0.3419725663519831,33.72405509515242,0.014434545454545453
+gpt-4.1-mini,0.7228987200296618,0.2692251738340237,20.910677563060414,0.0035145090909090913
+gpt-4o,0.747764892526804,0.2839489784158823,26.799152395941995,0.014775909090909091
+gpt-4o-mini,0.7329809456847031,0.23116793518719836,18.175196842713788,0.006497863636363636
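The CSV keeps every metric at full float precision, while the README table above shows the same numbers rounded and ranked. A minimal sketch of how one might reproduce that ranking locally, assuming pandas is available and that the rows are ordered by descending Mean Similarity (the sort step itself is not part of this diff):

```python
# Sketch: rank the raw CSV rows the way the README table is ordered.
# Assumptions: pandas is installed, and a descending sort on
# "Mean Similarity" produces the ranks; the actual ranking logic is
# not shown in this commit.
import pandas as pd

df = pd.read_csv("docs/benchmark.csv")
ranked = df.sort_values("Mean Similarity", ascending=False).reset_index(drop=True)

top = ranked.iloc[0]
print(top["Model"], round(top["Mean Similarity"], 3), round(top["Cost($)"], 5))
# -> AUTO 0.906 0.00068
```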

docs/benchmark.rst

Lines changed: 95 additions & 95 deletions
@@ -79,117 +79,117 @@ Here are the detailed parsing performance results for various models:
      - Time (s)
      - Cost ($)
    * - 1
-     - gemini-2.0-flash
-     - 0.829
-     - 0.102
-     - 7.41
-     - 0.00048
+     - AUTO
+     - 0.906
+     - 0.112
+     - 9.56
+     - 0.00068
    * - 2
-     - gemini-2.0-flash-001
-     - 0.814
-     - 0.176
-     - 6.85
-     - 0.000421
+     - gemini-2.0-flash
+     - 0.897
+     - 0.126
+     - 9.91
+     - 0.00078
    * - 3
-     - gemini-1.5-flash
-     - 0.797
-     - 0.143
-     - 9.54
-     - 0.000238
+     - gemini-2.5-flash
+     - 0.895
+     - 0.148
+     - 54.10
+     - 0.01051
    * - 4
-     - gemini-2.0-pro-exp
-     - 0.764
-     - 0.227
-     - 11.95
-     - TBA
+     - gemini-1.5-pro
+     - 0.868
+     - 0.283
+     - 15.03
+     - 0.00637
    * - 5
-     - AUTO
-     - 0.76
-     - 0.184
-     - 5.14
-     - 0.000217
+     - gemini-1.5-flash
+     - 0.864
+     - 0.194
+     - 15.47
+     - 0.00044
    * - 6
-     - gemini-2.0-flash-thinking-exp
-     - 0.746
-     - 0.266
-     - 10.46
-     - TBA
+     - claude-3-5-sonnet-20241022
+     - 0.851
+     - 0.209
+     - 15.99
+     - 0.01758
    * - 7
-     - gemini-1.5-pro
-     - 0.732
-     - 0.265
-     - 11.44
-     - 0.003332
+     - gemini-2.5-pro
+     - 0.849
+     - 0.298
+     - 101.95
+     - 0.01859
    * - 8
-     - accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks)
-     - 0.687
-     - 0.221
-     - 8.07
-     - 0.000419
+     - claude-sonnet-4-20250514
+     - 0.804
+     - 0.190
+     - 19.27
+     - 0.02071
    * - 9
-     - gpt-4o
-     - 0.687
-     - 0.247
-     - 10.16
-     - 0.004736
+     - claude-opus-4-20250514
+     - 0.772
+     - 0.238
+     - 20.03
+     - 0.09207
    * - 10
-     - accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks)
-     - 0.675
-     - 0.184
-     - 5.98
-     - 0.000226
+     - accounts/fireworks/models/llama4-maverick-instruct-basic
+     - 0.768
+     - 0.234
+     - 12.12
+     - 0.00150
    * - 11
-     - gpt-4o-mini
-     - 0.642
-     - 0.213
-     - 9.71
-     - 0.000275
+     - gpt-4o
+     - 0.748
+     - 0.284
+     - 26.80
+     - 0.01478
    * - 12
-     - gemma-3-27b-it (via OpenRouter)
-     - 0.628
-     - 0.299
-     - 18.79
-     - 0.000096
+     - gpt-4o-mini
+     - 0.733
+     - 0.231
+     - 18.18
+     - 0.00650
    * - 13
-     - gemini-1.5-flash-8b
-     - 0.551
-     - 0.223
-     - 3.91
-     - 0.000055
+     - gpt-4.1-mini
+     - 0.723
+     - 0.269
+     - 20.91
+     - 0.00351
    * - 14
-     - Llama-Vision-Free (via Together AI)
-     - 0.531
-     - 0.198
-     - 6.93
-     - 0
+     - google/gemma-3-27b-it
+     - 0.681
+     - 0.334
+     - 19.41
+     - 0.00027
    * - 15
-     - Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI)
-     - 0.524
-     - 0.192
-     - 3.68
-     - 0.00006
+     - gpt-4.1
+     - 0.650
+     - 0.342
+     - 33.72
+     - 0.01443
    * - 16
-     - qwen/qwen-2.5-vl-7b-instruct (via OpenRouter)
-     - 0.482
-     - 0.209
-     - 11.53
-     - 0.000052
+     - claude-3-7-sonnet-20250219
+     - 0.633
+     - 0.369
+     - 14.24
+     - 0.01763
    * - 17
-     - Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI)
-     - 0.461
-     - 0.306
-     - 19.26
-     - 0.000426
+     - microsoft/phi-4-multimodal-instruct
+     - 0.622
+     - 0.320
+     - 13.15
+     - 0.00050
    * - 18
-     - Llama-3.2-11B-Vision-Instruct (via Hugging Face)
-     - 0.451
-     - 0.257
-     - 4.54
-     - 0
+     - qwen/qwen-2.5-vl-7b-instruct
+     - 0.559
+     - 0.348
+     - 17.71
+     - 0.00086
    * - 19
-     - microsoft/phi-4-multimodal-instruct (via OpenRouter)
-     - 0.366
-     - 0.287
-     - 10.8
-     - 0.000019
+     - meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo
+     - 0.546
+     - 0.239
+     - 29.26
+     - 0.01103

docs/update_benchmarks.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def generate_markdown_table(df):
     header = "| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |\n"
     sep = "| --- | --- | --- | --- | --- | --- |\n"
     rows = [
-        f"| {i+1} | {row['Model']} | {row['Mean Similarity']} | {row['Std. Dev.']} | {row['Time (s)']} | {row['Cost($)']} |"
+        f"| {i+1} | {row['Model']} | {row['Mean Similarity']:.3f} | {row['Std. Dev.']:.3f} | {row['Time (s)']:.2f} | {row['Cost($)']:.5f} |"
         for i, row in df.iterrows()
     ]
     return header + sep + "\n".join(rows)
@@ -27,7 +27,7 @@ def generate_markdown_table(df):
 def generate_rst_table(df):
     header = "\n * - Rank\n - Model\n - Mean Similarity\n - Std. Dev.\n - Time (s)\n - Cost ($)"
     rows = [
-        f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']}\n - {row['Std. Dev.']}\n - {row['Time (s)']}\n - {row['Cost($)']}"
+        f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']:.3f}\n - {row['Std. Dev.']:.3f}\n - {row['Time (s)']:.2f}\n - {row['Cost($)']:.5f}"
         for i, row in df.iterrows()
     ]
     return header + "\n" + "\n".join(rows)
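The change to both table generators is the addition of fixed-width format specifiers, so the generated tables stay readable no matter how many decimals the raw CSV carries. A quick illustration of what `:.3f`, `:.2f`, and `:.5f` do to one raw row (values taken from the docs/benchmark.csv diff in this commit):

```python
# Illustration of the format specs added above, applied to the raw
# gemini-2.0-flash row from docs/benchmark.csv.
mean_sim = 0.8973145898835583
std_dev = 0.1264561140762329
time_s = 9.905281933871182
cost = 0.0007786

print(f"| 2 | gemini-2.0-flash | {mean_sim:.3f} | {std_dev:.3f} | {time_s:.2f} | {cost:.5f} |")
# -> | 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
```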

examples/outputs/benchmark.md

Lines changed: 5 additions & 1 deletion
@@ -39,7 +39,9 @@ After horizontal line
 
 Here comes a link: [example-link](https://www.example.com).
 
-Email: <mail@example.com>
+1
+
+Email: mail@example.com
 
 Here comes Python code:
 
@@ -69,6 +71,8 @@ And a second table:
 | **A2** | _data 11_ | _data 12_ |
 | **A3** | _data 21_ | _data 22_ |
 
+2
+
 
 <div style="background-color: #FFFFE0; padding: 10px; font-family: 'Comic Sans MS', 'Chalkboard SE', 'Bradley Hand', cursive, sans-serif; line-height: 1.8; border: 1px solid #E0E0C1;">

examples/outputs/costco_bill.md

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 <div>
-<h1>Costco</h1>
+<h1>COSTCO</h1>
 <p>WHOLESALE</p>
 <p>
 Irvine #454<br>
@@ -19,7 +19,7 @@ Irvine, CA 92618<br>
 <tr><td>E</td><td>370586</td><td>ORG. DATES</td><td>11.99</td><td></td></tr>
 <tr><td>E</td><td>1280655</td><td>ORG CSR KIT</td><td>8.99</td><td></td></tr>
 <tr><td>E</td><td>1280655</td><td>ORG CSR KIT</td><td>8.99</td><td></td></tr>
-<tr><td>E</td><td>161750</td><td>KS UNS CASHE</td><td>13.99</td><td></td></tr>
+<tr><td>E</td><td>1617507</td><td>KS UNS CASHE</td><td>13.99</td><td></td></tr>
 <tr><td>E</td><td>1308623</td><td>SUJA WELLNES</td><td>15.39</td><td></td></tr>
 <tr><td>E</td><td>1900000000</td><td>CA REDEMP VA</td><td>0.50</td><td></td></tr>
 <tr><td>E</td><td>1308623</td><td>SUJA WELLNES</td><td>15.39</td><td></td></tr>
