Skip to content

Commit 5e90d3f

Browse files
authored
Merge pull request #1868 from tisnik/lcore-2493-token-estimator-benchmark-for-files
LCORE-2493: token estimator benchmark for data files and sources
2 parents 2a71a89 + f0ac9be commit 5e90d3f

8 files changed

Lines changed: 828 additions & 0 deletions

File tree

docs/benchmarks/tokenizer/all.svg

Lines changed: 735 additions & 0 deletions
Loading

docs/benchmarks/tokenizer/all.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
--------------------------------------------------------------------------------------------------------------- benchmark: 10 tests ----------------------------------------------------------------------------------------------------------------
2+
Name (time in ns) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations
3+
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4+
test_estimate_empty_string 73.2101 (1.0) 1,714.0200 (1.0) 85.9750 (1.0) 16.3932 (1.0) 82.1598 (1.0) 14.4902 (1.0) 8632;5289 11,631,282.7649 (1.0) 96834 100
5+
test_estimate_hello_world 3,069.9885 (41.93) 15,099.9986 (8.81) 5,548.5746 (64.54) 4,372.8983 (266.75) 3,730.9946 (45.41) 2,687.2221 (185.45) 1;1 180,226.4677 (0.02) 7 1
6+
test_pangram 6,136.9792 (83.83) 41,826.0170 (24.40) 7,146.9992 (83.13) 813.4865 (49.62) 7,003.9823 (85.25) 1,098.0293 (75.78) 2897;151 139,918.8626 (0.01) 14997 1
7+
test_python_source_10_lines 24,937.9955 (340.64) 224,443.9966 (130.95) 26,398.0163 (307.04) 2,575.7834 (157.13) 25,807.0068 (314.11) 1,141.9834 (78.81) 625;856 37,881.6343 (0.00) 11169 1
8+
test_json_file_10_lines 43,904.0014 (599.70) 122,676.0214 (71.57) 48,299.4729 (561.78) 4,326.5464 (263.92) 46,596.9788 (567.15) 6,118.5056 (422.25) 1654;77 20,704.1597 (0.00) 9923 1
9+
test_lorem_ipsum 44,059.9979 (601.83) 122,688.0122 (71.58) 50,853.2198 (591.49) 5,390.6079 (328.83) 49,723.5014 (605.20) 9,600.9899 (662.58) 3231;17 19,664.4382 (0.00) 7440 1
10+
test_yaml_file_10_lines 44,452.9869 (607.20) 1,089,611.0034 (635.70) 46,447.0401 (540.24) 10,816.7633 (659.83) 45,379.0126 (552.33) 1,267.0062 (87.44) 49;1253 21,529.8972 (0.00) 9888 1
11+
test_xml_file_10_lines 114,177.0044 (>1000.0) 936,313.0084 (546.27) 126,445.2771 (>1000.0) 17,633.7439 (>1000.0) 122,102.4868 (>1000.0) 17,396.0289 (>1000.0) 261;21 7,908.5595 (0.00) 4198 1
12+
test_lorem_ipsum_times_10_times 405,982.0203 (>1000.0) 851,906.0211 (497.02) 444,199.8123 (>1000.0) 38,077.3221 (>1000.0) 430,426.9969 (>1000.0) 60,275.9974 (>1000.0) 358;2 2,251.2391 (0.00) 2039 1
13+
test_lorem_ipsum_times_100_times 4,065,133.9805 (>1000.0) 6,779,723.0149 (>1000.0) 4,248,999.0044 (>1000.0) 343,307.8870 (>1000.0) 4,109,136.0035 (>1000.0) 92,952.7669 (>1000.0) 27;44 235.3495 (0.00) 225 1
14+
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
15+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
/*! jQuery UI - v1.11.4 - 2015-03-11
2+
* http://jqueryui.com
3+
* Copyright 2015 jQuery Foundation and other contributors; Licensed MIT */
4+
(function( factory ) {
5+
if ( typeof define === "function" && define.amd ) {
6+
define([ "jquery" ], factory );
7+
} else {
8+
factory( jQuery );
9+
}
10+
}(function( $ ) {
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"totals": {
3+
"covered_lines": 5175,
4+
"num_statements": 10097,
5+
"percent_covered": 51.252847380410024,
6+
"percent_covered_display": "51",
7+
"percent_statements_covered": 51.252847380410024,
8+
"percent_statements_covered_display": "51"
9+
}
10+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Source to be tokenized."""
2+
3+
from llama_stack_client import LlamaStackClient
4+
5+
client = LlamaStackClient(base_url="http://localhost:8321")
6+
7+
models = client.models.list()
8+
9+
for model in models:
10+
print(model)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<testsuites name="pytest tests">
3+
<testsuite name="pytest" errors="0" failures="0" skipped="1" tests="2550" time="36.936" timestamp="2026-05-31T18:56:11.947715+02:00" hostname="ptisnovs-thinkpadt14gen3.brq.csb">
4+
<testcase classname="tests.unit.utils.test_vector_search.TestGetCrossEncoder" name="test_handles_import_error" time="11.055"/>
5+
<testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_sorting_by_boosted_scores" time="0.001"/>
6+
<testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_default_boost_factor" time="0.001"/>
7+
<testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_none_scores_handled" time="0.001"/>
8+
<testcase classname="tests.unit.utils.test_vector_search.TestApplyByokRerankBoost" name="test_preserves_chunk_attributes" time="0.049"/>
9+
</testsuite>
10+
</testsuites>
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
meta:
2+
format: 3
3+
version: 7.14.0
4+
timestamp: '2026-05-18T16:57:39.462108'
5+
branch_coverage: false
6+
show_contexts: false
7+
totals:
8+
excluded_lines: 0
9+
percent_statements_covered: 51.252847380410024
10+
percent_statements_covered_display: '51'

tests/benchmarks/test_token_estimator.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,31 @@ def _test_lorem_ipsum_times_1000_times(benchmark: BenchmarkFixture) -> None:
5656
"""The lorem ipsum tokenizes to the known cl100k_base count."""
5757
input_string = LOREM_IPSUM * 1000
5858
benchmark(estimate_tokens, input_string)
59+
60+
61+
def benchmark_file_tokenization(benchmark: BenchmarkFixture, filename: str) -> None:
    """Benchmark token estimation on the content of one data file.

    The file is read before the benchmark call, so file I/O is not part of
    the benchmarked call; only ``estimate_tokens`` is passed to ``benchmark``.

    Args:
        benchmark: Benchmark fixture that invokes ``estimate_tokens``.
        filename: Name of a file in the ``data`` directory located next to
            this test module.

    Raises:
        OSError: If the data file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Function-scope import keeps this helper self-contained.
    from pathlib import Path

    # Resolve the data directory against this module instead of the current
    # working directory, so the benchmark works regardless of where pytest
    # is started from.
    data_file = Path(__file__).resolve().parent / "data" / filename
    input_string = data_file.read_text(encoding="utf-8")
    # tokenize the file content
    benchmark(estimate_tokens, input_string)
67+
68+
69+
def test_xml_file_10_lines(benchmark: BenchmarkFixture) -> None:
    """Benchmark token estimation for the 10-line XML data file."""
    data_file_name = "xml_10_lines.xml"
    benchmark_file_tokenization(benchmark, data_file_name)
72+
73+
74+
def test_yaml_file_10_lines(benchmark: BenchmarkFixture) -> None:
    """Benchmark token estimation for the 10-line YAML data file."""
    data_file_name = "yaml_10_lines.yml"
    benchmark_file_tokenization(benchmark, data_file_name)
77+
78+
79+
def test_json_file_10_lines(benchmark: BenchmarkFixture) -> None:
    """Benchmark token estimation for the 10-line JSON data file."""
    data_file_name = "json_10_lines.json"
    benchmark_file_tokenization(benchmark, data_file_name)
82+
83+
84+
def test_python_source_10_lines(benchmark: BenchmarkFixture) -> None:
    """Benchmark token estimation for the 10-line Python source data file."""
    data_file_name = "python_10_lines.py"
    benchmark_file_tokenization(benchmark, data_file_name)

0 commit comments

Comments
 (0)