Commit c957b89: Update benchmark with gemini 3 and gpt 5.2 models
Parent: 608893a | 7 files changed: 112 additions & 57 deletions


README.md (27 additions, 24 deletions)
```diff
@@ -118,30 +118,33 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 
 | Rank | Model | SequenceMatcher Similarity | TFIDF Similarity | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 | AUTO (with auto-selected model) | 0.899 (±0.131) | 0.960 (±0.066) | 21.17 | 0.00066 |
-| 2 | AUTO | 0.895 (±0.112) | 0.973 (±0.046) | 9.29 | 0.00063 |
-| 3 | gemini-2.5-flash | 0.886 (±0.164) | 0.986 (±0.027) | 52.55 | 0.01226 |
-| 4 | mistral-ocr-latest | 0.882 (±0.106) | 0.932 (±0.091) | 5.75 | 0.00121 |
-| 5 | gemini-2.5-pro | 0.876 (±0.195) | 0.976 (±0.049) | 22.65 | 0.02408 |
-| 6 | gemini-2.0-flash | 0.875 (±0.148) | 0.977 (±0.037) | 11.96 | 0.00079 |
-| 7 | claude-3-5-sonnet-20241022 | 0.858 (±0.184) | 0.930 (±0.098) | 17.32 | 0.01804 |
-| 8 | gemini-1.5-flash | 0.842 (±0.214) | 0.969 (±0.037) | 15.58 | 0.00043 |
-| 9 | gpt-5-mini | 0.819 (±0.201) | 0.917 (±0.104) | 52.84 | 0.00811 |
-| 10 | gpt-5 | 0.807 (±0.215) | 0.919 (±0.088) | 98.12 | 0.05505 |
-| 11 | claude-sonnet-4-20250514 | 0.801 (±0.188) | 0.905 (±0.136) | 22.02 | 0.02056 |
-| 12 | claude-opus-4-20250514 | 0.789 (±0.220) | 0.886 (±0.148) | 29.55 | 0.09513 |
-| 13 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.772 (±0.203) | 0.930 (±0.117) | 16.02 | 0.00147 |
-| 14 | gemini-1.5-pro | 0.767 (±0.309) | 0.865 (±0.230) | 24.77 | 0.01139 |
-| 15 | gpt-4.1-mini | 0.754 (±0.249) | 0.803 (±0.193) | 23.28 | 0.00347 |
-| 16 | accounts/fireworks/models/llama4-scout-instruct-basic | 0.754 (±0.243) | 0.942 (±0.063) | 13.36 | 0.00087 |
-| 17 | gpt-4o | 0.752 (±0.269) | 0.896 (±0.123) | 28.87 | 0.01469 |
-| 18 | gpt-4o-mini | 0.728 (±0.241) | 0.850 (±0.128) | 18.96 | 0.00609 |
-| 19 | claude-3-7-sonnet-20250219 | 0.646 (±0.397) | 0.758 (±0.297) | 57.96 | 0.01730 |
-| 20 | gpt-4.1 | 0.637 (±0.301) | 0.787 (±0.185) | 35.37 | 0.01498 |
-| 21 | google/gemma-3-27b-it | 0.604 (±0.342) | 0.788 (±0.297) | 23.16 | 0.00020 |
-| 22 | ds4sd/SmolDocling-256M-preview | 0.603 (±0.292) | 0.705 (±0.262) | 507.74 | 0.00000 |
-| 23 | microsoft/phi-4-multimodal-instruct | 0.589 (±0.273) | 0.820 (±0.197) | 14.00 | 0.00045 |
-| 24 | qwen/qwen-2.5-vl-7b-instruct | 0.498 (±0.378) | 0.630 (±0.445) | 14.73 | 0.00056 |
+| 1 | gemini-3-pro-preview | 0.917 (±0.127) | 0.943 (±0.159) | 46.92 | 0.06288 |
+| 2 | AUTO (with auto-selected model) | 0.899 (±0.131) | 0.960 (±0.066) | 21.17 | 0.00066 |
+| 3 | AUTO | 0.895 (±0.112) | 0.973 (±0.046) | 9.29 | 0.00063 |
+| 4 | gpt-5.2 | 0.890 (±0.193) | 0.975 (±0.036) | 33.32 | 0.03959 |
+| 5 | gemini-2.5-flash | 0.886 (±0.164) | 0.986 (±0.027) | 52.55 | 0.01226 |
+| 6 | mistral-ocr-latest | 0.882 (±0.106) | 0.932 (±0.091) | 5.75 | 0.00121 |
+| 7 | gemini-2.5-pro | 0.876 (±0.195) | 0.976 (±0.049) | 22.65 | 0.02408 |
+| 8 | gemini-2.0-flash | 0.875 (±0.148) | 0.977 (±0.037) | 11.96 | 0.00079 |
+| 9 | claude-3-5-sonnet-20241022 | 0.858 (±0.184) | 0.930 (±0.098) | 17.32 | 0.01804 |
+| 10 | gemini-1.5-flash | 0.842 (±0.214) | 0.969 (±0.037) | 15.58 | 0.00043 |
+| 11 | gpt-5-mini | 0.819 (±0.201) | 0.917 (±0.104) | 52.84 | 0.00811 |
+| 12 | gpt-5 | 0.807 (±0.215) | 0.919 (±0.088) | 98.12 | 0.05505 |
+| 13 | claude-sonnet-4-20250514 | 0.801 (±0.188) | 0.905 (±0.136) | 22.02 | 0.02056 |
+| 14 | claude-opus-4-20250514 | 0.789 (±0.220) | 0.886 (±0.148) | 29.55 | 0.09513 |
+| 15 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.772 (±0.203) | 0.930 (±0.117) | 16.02 | 0.00147 |
+| 16 | gemini-1.5-pro | 0.767 (±0.309) | 0.865 (±0.230) | 24.77 | 0.01139 |
+| 17 | gemini-3-flash-preview | 0.766 (±0.293) | 0.858 (±0.210) | 39.38 | 0.00969 |
+| 18 | gpt-4.1-mini | 0.754 (±0.249) | 0.803 (±0.193) | 23.28 | 0.00347 |
+| 19 | accounts/fireworks/models/llama4-scout-instruct-basic | 0.754 (±0.243) | 0.942 (±0.063) | 13.36 | 0.00087 |
+| 20 | gpt-4o | 0.752 (±0.269) | 0.896 (±0.123) | 28.87 | 0.01469 |
+| 21 | gpt-4o-mini | 0.728 (±0.241) | 0.850 (±0.128) | 18.96 | 0.00609 |
+| 22 | claude-3-7-sonnet-20250219 | 0.646 (±0.397) | 0.758 (±0.297) | 57.96 | 0.01730 |
+| 23 | gpt-4.1 | 0.637 (±0.301) | 0.787 (±0.185) | 35.37 | 0.01498 |
+| 24 | google/gemma-3-27b-it | 0.604 (±0.342) | 0.788 (±0.297) | 23.16 | 0.00020 |
+| 25 | ds4sd/SmolDocling-256M-preview | 0.603 (±0.292) | 0.705 (±0.262) | 507.74 | 0.00000 |
+| 26 | microsoft/phi-4-multimodal-instruct | 0.589 (±0.273) | 0.820 (±0.197) | 14.00 | 0.00045 |
+| 27 | qwen/qwen-2.5-vl-7b-instruct | 0.498 (±0.378) | 0.630 (±0.445) | 14.73 | 0.00056 |
 
 ## Citation
 If you use Lexoid in production or publications, please cite accordingly and acknowledge usage. We appreciate the support 🙏
```
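The "SequenceMatcher Similarity" column in the table above is a character-level similarity score. A minimal sketch of how such a score can be computed with Python's standard-library `difflib` (the exact preprocessing Lexoid applies before scoring may differ):

```python
from difflib import SequenceMatcher

def sequence_similarity(reference: str, parsed: str) -> float:
    """Return a ratio in [0, 1]; 1.0 means the parsed text matches the reference exactly."""
    return SequenceMatcher(None, reference, parsed).ratio()

# Identical strings score 1.0; small edits reduce the ratio proportionally.
print(sequence_similarity("Total: $42.00", "Total: $42.00"))
print(sequence_similarity("Total: $42.00", "Total: $42,00") < 1.0)
```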

docs/benchmark.rst (41 additions, 23 deletions)
```diff
@@ -89,144 +89,162 @@ Here are the detailed parsing performance results for various models, sorted by
      - Time (s)
      - Cost ($)
    * - 1
+     - gemini-3-pro-preview
+     - 0.917 (±0.127)
+     - 0.943 (±0.159)
+     - 46.92
+     - 0.06288
+   * - 2
      - AUTO (with auto-selected model)
      - 0.899 (±0.131)
      - 0.960 (±0.066)
      - 21.17
      - 0.00066
-   * - 2
+   * - 3
      - AUTO
      - 0.895 (±0.112)
      - 0.973 (±0.046)
      - 9.29
      - 0.00063
-   * - 3
+   * - 4
+     - gpt-5.2
+     - 0.890 (±0.193)
+     - 0.975 (±0.036)
+     - 33.32
+     - 0.03959
+   * - 5
      - gemini-2.5-flash
      - 0.886 (±0.164)
      - 0.986 (±0.027)
      - 52.55
      - 0.01226
-   * - 4
+   * - 6
      - mistral-ocr-latest
      - 0.882 (±0.106)
      - 0.932 (±0.091)
      - 5.75
      - 0.00121
-   * - 5
+   * - 7
      - gemini-2.5-pro
      - 0.876 (±0.195)
      - 0.976 (±0.049)
      - 22.65
      - 0.02408
-   * - 6
+   * - 8
      - gemini-2.0-flash
      - 0.875 (±0.148)
      - 0.977 (±0.037)
      - 11.96
      - 0.00079
-   * - 7
+   * - 9
      - claude-3-5-sonnet-20241022
      - 0.858 (±0.184)
      - 0.930 (±0.098)
      - 17.32
      - 0.01804
-   * - 8
+   * - 10
      - gemini-1.5-flash
      - 0.842 (±0.214)
      - 0.969 (±0.037)
      - 15.58
      - 0.00043
-   * - 9
+   * - 11
      - gpt-5-mini
      - 0.819 (±0.201)
      - 0.917 (±0.104)
      - 52.84
      - 0.00811
-   * - 10
+   * - 12
      - gpt-5
      - 0.807 (±0.215)
      - 0.919 (±0.088)
      - 98.12
      - 0.05505
-   * - 11
+   * - 13
      - claude-sonnet-4-20250514
      - 0.801 (±0.188)
      - 0.905 (±0.136)
      - 22.02
      - 0.02056
-   * - 12
+   * - 14
      - claude-opus-4-20250514
      - 0.789 (±0.220)
      - 0.886 (±0.148)
      - 29.55
      - 0.09513
-   * - 13
+   * - 15
      - accounts/fireworks/models/llama4-maverick-instruct-basic
      - 0.772 (±0.203)
      - 0.930 (±0.117)
      - 16.02
      - 0.00147
-   * - 14
+   * - 16
      - gemini-1.5-pro
      - 0.767 (±0.309)
      - 0.865 (±0.230)
      - 24.77
      - 0.01139
-   * - 15
+   * - 17
+     - gemini-3-flash-preview
+     - 0.766 (±0.293)
+     - 0.858 (±0.210)
+     - 39.38
+     - 0.00969
+   * - 18
      - gpt-4.1-mini
      - 0.754 (±0.249)
      - 0.803 (±0.193)
      - 23.28
      - 0.00347
-   * - 16
+   * - 19
      - accounts/fireworks/models/llama4-scout-instruct-basic
      - 0.754 (±0.243)
      - 0.942 (±0.063)
      - 13.36
      - 0.00087
-   * - 17
+   * - 20
      - gpt-4o
      - 0.752 (±0.269)
      - 0.896 (±0.123)
      - 28.87
      - 0.01469
-   * - 18
+   * - 21
      - gpt-4o-mini
      - 0.728 (±0.241)
      - 0.850 (±0.128)
      - 18.96
      - 0.00609
-   * - 19
+   * - 22
      - claude-3-7-sonnet-20250219
      - 0.646 (±0.397)
      - 0.758 (±0.297)
      - 57.96
      - 0.01730
-   * - 20
+   * - 23
      - gpt-4.1
      - 0.637 (±0.301)
      - 0.787 (±0.185)
      - 35.37
      - 0.01498
-   * - 21
+   * - 24
      - google/gemma-3-27b-it
      - 0.604 (±0.342)
      - 0.788 (±0.297)
      - 23.16
      - 0.00020
-   * - 22
+   * - 25
      - ds4sd/SmolDocling-256M-preview
      - 0.603 (±0.292)
      - 0.705 (±0.262)
      - 507.74
      - 0.00000
-   * - 23
+   * - 26
      - microsoft/phi-4-multimodal-instruct
      - 0.589 (±0.273)
      - 0.820 (±0.197)
      - 14.00
      - 0.00045
-   * - 24
+   * - 27
      - qwen/qwen-2.5-vl-7b-instruct
      - 0.498 (±0.378)
      - 0.630 (±0.445)
```

lexoid/core/parse_type/llm_parser.py (3 additions, 3 deletions)
```diff
@@ -450,6 +450,8 @@ def parse_image_with_gemini(
         }
         if kwargs["model"] == "gemini-2.5-pro":
             generation_config["thinkingConfig"] = {"thinkingBudget": 128}
+        elif kwargs["model"].startswith("gemini-3"):
+            generation_config["thinkingConfig"] = {"thinkingLevel": "low"}
 
     payload = {
         "contents": [
@@ -658,11 +660,9 @@ def create_response(
     completion_params = {
         "model": model,
         "messages": messages,
-        "max_tokens": max_tokens,
-        "temperature": temperature,
     }
 
-    if api == "openai" and (model in ["gpt-5", "gpt-5-mini"] or model.startswith("o")):
+    if api == "openai":
         # Unsupported in some models
         del completion_params["max_tokens"]
         del completion_params["temperature"]
```
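The first hunk branches the generation config on the model family. A standalone sketch of that selection logic (the `thinkingConfig` field names are taken from the diff itself, not from the full Gemini API surface):

```python
def build_generation_config(model: str) -> dict:
    """Mirror the branching in parse_image_with_gemini: a token budget for
    gemini-2.5-pro, a thinking level for the gemini-3 preview family,
    and no thinking config for everything else."""
    generation_config: dict = {}
    if model == "gemini-2.5-pro":
        generation_config["thinkingConfig"] = {"thinkingBudget": 128}
    elif model.startswith("gemini-3"):
        generation_config["thinkingConfig"] = {"thinkingLevel": "low"}
    return generation_config

print(build_generation_config("gemini-3-flash-preview"))
```

One observation on the second hunk: the dict literal no longer sets `max_tokens` or `temperature`, yet the `openai` branch still `del`s both keys, which would raise `KeyError` unless those keys are re-inserted elsewhere before this point; `completion_params.pop(key, None)` would be the defensive alternative.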

tests/api_cost_mapping.json (16 additions, 0 deletions)
```diff
@@ -1,4 +1,16 @@
 {
+    "gemini-3-flash-preview": {
+        "input": 0.5,
+        "output": 3
+    },
+    "gemini-3-pro-preview": {
+        "input": 2,
+        "output": 12
+    },
+    "gemini-3-pro-image-preview": {
+        "input": 2,
+        "output": 12
+    },
     "gemini-2.5-flash": {
         "input": 0.3,
         "output": 2.5
@@ -45,6 +57,10 @@
         "input": 0.4,
         "output": 1.6
     },
+    "gpt-5.2": {
+        "input": 1.75,
+        "output": 14
+    },
     "gpt-5": {
         "input": 1.25,
         "output": 10
```

tests/benchmark.py (12 additions, 7 deletions)
```diff
@@ -10,14 +10,17 @@
 from dotenv import load_dotenv
 
 from lexoid.api import parse
-from benchmark_utils import calculate_similarities
+from benchmark_utils import calculate_similarities, METRIC_NAMES
 
 load_dotenv()
 
 config_options = {
     "parser_type": ["LLM_PARSE", "STATIC_PARSE", "AUTO"],
     "model": [
         # # Google models
+        "gemini-3-flash-preview",
+        # "gemini-3-pro-preview",
+        # "gemini-3-pro-image-preview",
         # "gemini-2.5-flash",
         # "gemini-2.5-pro",
         # "gemini-2.0-flash",
@@ -32,6 +35,7 @@
         # "claude-3-7-sonnet-20250219",
         # "claude-3-5-sonnet-20241022",
         # # OpenAI models
+        # "gpt-5.2",
         # "gpt-5",
         # "gpt-5-mini",
         # "gpt-4.1",
@@ -163,14 +167,14 @@ def run_benchmark_config(
             break  # Stop further iterations if an error occurs
 
     mean_similarity = (
-        {metric: mean([s[metric] for s in similarities]) for metric in similarities[0]}
+        {metric: mean([s[metric] for s in similarities]) for metric in METRIC_NAMES}
         if similarities
         else None
     )
     std_similarity = (
-        {metric: stdev([s[metric] for s in similarities]) for metric in similarities[0]}
+        {metric: stdev([s[metric] for s in similarities]) for metric in METRIC_NAMES}
         if len(similarities) > 1
-        else {metric: 0.0 for metric in similarities[0]}
+        else {metric: 0.0 for metric in METRIC_NAMES}
     )
 
     return BenchmarkResult(
@@ -196,15 +200,15 @@ def aggregate_results(results: List[BenchmarkResult]) -> BenchmarkResult:
     all_costs = [c for r in valid_results for c in r.cost]
     avg_similarity = {
         metric: mean([s[metric] for s in all_similarities])
-        for metric in all_similarities[0]
+        for metric in METRIC_NAMES
     }
     std_similarity = (
         {
             metric: stdev([s[metric] for s in all_similarities])
-            for metric in all_similarities[0]
+            for metric in METRIC_NAMES
         }
         if len(all_similarities) > 1
-        else {metric: 0.0 for metric in avg_similarity}
+        else {metric: 0.0 for metric in METRIC_NAMES}
     )
     avg_execution_time = mean(all_execution_times)
     avg_cost = mean(all_costs)
@@ -449,6 +453,7 @@ def main():
 
     # Can be either a single file or directory
     input_path = "examples/inputs"
+    # input_path = "examples/inputs/grocery_bill.jpg"
     output_dir = "examples/outputs"
 
     run_id = "_".join(
```
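The switch from `similarities[0]` to `METRIC_NAMES` makes the aggregation iterate over a fixed, explicit set of metrics instead of whatever keys happen to appear in the first result dict. A minimal self-contained sketch of the fixed aggregation (the `METRIC_NAMES` tuple matches the one added to tests/benchmark_utils.py; `aggregate` is a condensed illustration, not the repo's exact function):

```python
from statistics import mean, stdev

METRIC_NAMES = ("sequence_matcher", "cosine", "jaccard", "precision", "recall", "f1_score")

def aggregate(similarities: list) -> tuple:
    """Per-metric mean and sample std over a list of metric dicts.
    With a single sample, stdev is undefined, so std falls back to 0.0."""
    mean_similarity = {m: mean(s[m] for s in similarities) for m in METRIC_NAMES}
    std_similarity = (
        {m: stdev(s[m] for s in similarities) for m in METRIC_NAMES}
        if len(similarities) > 1
        else {m: 0.0 for m in METRIC_NAMES}
    )
    return mean_similarity, std_similarity

runs = [{m: 0.0 for m in METRIC_NAMES}, {m: 1.0 for m in METRIC_NAMES}]
means, stds = aggregate(runs)
print(means["cosine"])  # 0.5
```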

tests/benchmark_utils.py (10 additions, 0 deletions)
```diff
@@ -112,3 +112,13 @@ def calculate_similarities(
     similarities.update(precision_recall_f1_score(text1, text2))
 
     return similarities
+
+
+METRIC_NAMES = (
+    "sequence_matcher",
+    "cosine",
+    "jaccard",
+    "precision",
+    "recall",
+    "f1_score",
+)
```

tests/results.csv (3 additions, 0 deletions)
```diff
@@ -23,3 +23,6 @@ ds4sd/SmolDocling-256M-preview,0.603 (±0.292),0.705 (±0.262),0.645 (±0.245),0
 mistral-ocr-latest,0.882 (±0.106),0.932 (±0.091),0.904 (±0.104),0.923 (±0.097),0.977 (±0.034),0.946 (±0.061),5.754099847414555,0.001211538461538342
 gpt-5,0.807 (±0.215),0.919 (±0.088),0.855 (±0.131),0.977 (±0.024),0.871 (±0.126),0.917 (±0.078),98.12129604357942,0.05505420673076922
 gpt-5-mini,0.819 (±0.201),0.917 (±0.104),0.857 (±0.150),0.975 (±0.033),0.876 (±0.152),0.916 (±0.093),52.83561164752032,0.008113719551281968
+gemini-3-flash-preview,0.766 (±0.293),0.858 (±0.210),0.825 (±0.237),0.989 (±0.016),0.835 (±0.242),0.883 (±0.175),39.38287312643869,0.00968610714285712
+gemini-3-pro-preview,0.917 (±0.127),0.943 (±0.159),0.925 (±0.126),0.974 (±0.034),0.944 (±0.120),0.956 (±0.081),46.92351686954498,0.06287985714285714
+gpt-5.2,0.890 (±0.193),0.975 (±0.036),0.950 (±0.049),0.966 (±0.035),0.981 (±0.020),0.974 (±0.027),33.3172641311373,0.03959375
```
