Merge pull request #1868 from tisnik/lcore-2493-token-estimator-benchmark-for-files

tisnik · web-flow · commit 5e90d3f0bd73 · 2026-06-08T12:49:50.000+02:00
LCORE-2493: token estimator benchmark for data files and sources
diff --git a/docs/benchmarks/tokenizer/all.svg b/docs/benchmarks/tokenizer/all.svg
diff --git a/docs/benchmarks/tokenizer/all.txt b/docs/benchmarks/tokenizer/all.txt
@@ -0,0 +1,15 @@
+--------------------------------------------------------------------------------------------------------------- benchmark: 10 tests ----------------------------------------------------------------------------------------------------------------
+Name (time in ns)                               Min                       Max                      Mean                  StdDev                    Median                    IQR             Outliers              OPS            Rounds  Iterations
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+test_estimate_empty_string                  73.2101 (1.0)          1,714.0200 (1.0)             85.9750 (1.0)           16.3932 (1.0)             82.1598 (1.0)          14.4902 (1.0)      8632;5289  11,631,282.7649 (1.0)       96834         100
+test_estimate_hello_world                3,069.9885 (41.93)       15,099.9986 (8.81)         5,548.5746 (64.54)      4,372.8983 (266.75)       3,730.9946 (45.41)     2,687.2221 (185.45)         1;1     180,226.4677 (0.02)          7           1
+test_pangram                             6,136.9792 (83.83)       41,826.0170 (24.40)        7,146.9992 (83.13)        813.4865 (49.62)        7,003.9823 (85.25)     1,098.0293 (75.78)     2897;151     139,918.8626 (0.01)      14997           1
+test_python_source_10_lines             24,937.9955 (340.64)     224,443.9966 (130.95)      26,398.0163 (307.04)     2,575.7834 (157.13)      25,807.0068 (314.11)    1,141.9834 (78.81)      625;856      37,881.6343 (0.00)      11169           1
+test_json_file_10_lines                 43,904.0014 (599.70)     122,676.0214 (71.57)       48,299.4729 (561.78)     4,326.5464 (263.92)      46,596.9788 (567.15)    6,118.5056 (422.25)     1654;77      20,704.1597 (0.00)       9923           1
+test_lorem_ipsum                        44,059.9979 (601.83)     122,688.0122 (71.58)       50,853.2198 (591.49)     5,390.6079 (328.83)      49,723.5014 (605.20)    9,600.9899 (662.58)     3231;17      19,664.4382 (0.00)       7440           1
+test_yaml_file_10_lines                 44,452.9869 (607.20)   1,089,611.0034 (635.70)      46,447.0401 (540.24)    10,816.7633 (659.83)      45,379.0126 (552.33)    1,267.0062 (87.44)      49;1253      21,529.8972 (0.00)       9888           1
+test_xml_file_10_lines                 114,177.0044 (>1000.0)    936,313.0084 (546.27)     126,445.2771 (>1000.0)   17,633.7439 (>1000.0)    122,102.4868 (>1000.0)  17,396.0289 (>1000.0)     261;21       7,908.5595 (0.00)       4198           1
+test_lorem_ipsum_times_10_times        405,982.0203 (>1000.0)    851,906.0211 (497.02)     444,199.8123 (>1000.0)   38,077.3221 (>1000.0)    430,426.9969 (>1000.0)  60,275.9974 (>1000.0)      358;2       2,251.2391 (0.00)       2039           1
+test_lorem_ipsum_times_100_times     4,065,133.9805 (>1000.0)  6,779,723.0149 (>1000.0)  4,248,999.0044 (>1000.0)  343,307.8870 (>1000.0)  4,109,136.0035 (>1000.0)  92,952.7669 (>1000.0)      27;44         235.3495 (0.00)        225           1
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
diff --git a/tests/benchmarks/data/js_10_lines.js b/tests/benchmarks/data/js_10_lines.js
@@ -0,0 +1,10 @@
+/*! jQuery UI - v1.11.4 - 2015-03-11
+* http://jqueryui.com
+* Copyright 2015 jQuery Foundation and other contributors; Licensed MIT */
+(function( factory ) {
+	if ( typeof define === "function" && define.amd ) {
+		define([ "jquery" ], factory );
+	} else {
+		factory( jQuery );
+	}
+}(function( $ ) {
diff --git a/tests/benchmarks/data/json_10_lines.json b/tests/benchmarks/data/json_10_lines.json
@@ -0,0 +1,10 @@
+{
+  "totals": {
+    "covered_lines": 5175,
+    "num_statements": 10097,
+    "percent_covered": 51.252847380410024,
+    "percent_covered_display": "51",
+    "percent_statements_covered": 51.252847380410024,
+    "percent_statements_covered_display": "51"
+  }
+}
diff --git a/tests/benchmarks/data/python_10_lines.py b/tests/benchmarks/data/python_10_lines.py
@@ -0,0 +1,10 @@
+"""Source to be tokenized."""
+
+from llama_stack_client import LlamaStackClient
+
+client = LlamaStackClient(base_url="http://localhost:8321")
+
+models = client.models.list()
+
+for model in models:
+    print(model)
diff --git a/tests/benchmarks/data/xml_10_lines.xml b/tests/benchmarks/data/xml_10_lines.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<testsuites name="pytest tests">
+  <testsuite name="pytest" errors="0" failures="0" skipped="1" tests="2550" time="36.936" timestamp="2026-05-31T18:56:11.947715+02:00" hostname="ptisnovs-thinkpadt14gen3.brq.csb">
+    <testcase classname="tests.unit.utils.test_vector_search.TestGetCrossEncoder" name="test_handles_import_error" time="11.055"/>
+    <testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_sorting_by_boosted_scores" time="0.001"/>
+    <testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_default_boost_factor" time="0.001"/>
+    <testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_none_scores_handled" time="0.001"/>
+    <testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_preserves_chunk_attributes" time="0.049"/>
+  </testsuite>
+</testsuites>
diff --git a/tests/benchmarks/data/yaml_10_lines.yml b/tests/benchmarks/data/yaml_10_lines.yml
@@ -0,0 +1,10 @@
+meta:
+  format: 3
+  version: 7.14.0
+  timestamp: '2026-05-18T16:57:39.462108'
+  branch_coverage: false
+  show_contexts: false
+totals:
+  excluded_lines: 0
+  percent_statements_covered: 51.252847380410024
+  percent_statements_covered_display: '51'
diff --git a/tests/benchmarks/test_token_estimator.py b/tests/benchmarks/test_token_estimator.py
@@ -56,3 +56,31 @@ def _test_lorem_ipsum_times_1000_times(benchmark: BenchmarkFixture) -> None:
     """The lorem ipsum tokenizes to the known cl100k_base count."""
     input_string = LOREM_IPSUM * 1000
     benchmark(estimate_tokens, input_string)
+
+
+def benchmark_file_tokenization(benchmark: BenchmarkFixture, filename: str) -> None:
+    """Read the given file and tokenize it."""
+    with open("tests/benchmarks/data/" + filename, encoding="utf-8") as fin:
+        input_string = fin.read()
+        # tokenize the file content
+        benchmark(estimate_tokens, input_string)
+
+
+def test_xml_file_10_lines(benchmark: BenchmarkFixture) -> None:
+    """Test tokenizing XML file containing just 10 lines."""
+    benchmark_file_tokenization(benchmark, "xml_10_lines.xml")
+
+
+def test_yaml_file_10_lines(benchmark: BenchmarkFixture) -> None:
+    """Test tokenizing YAML file containing just 10 lines."""
+    benchmark_file_tokenization(benchmark, "yaml_10_lines.yml")
+
+
+def test_json_file_10_lines(benchmark: BenchmarkFixture) -> None:
+    """Test tokenizing JSON file containing just 10 lines."""
+    benchmark_file_tokenization(benchmark, "json_10_lines.json")
+
+
+def test_python_source_10_lines(benchmark: BenchmarkFixture) -> None:
+    """Test tokenizing Python script containing just 10 lines."""
+    benchmark_file_tokenization(benchmark, "python_10_lines.py")