
Commit 79eb799

Bench upgrade

1 parent cb18236

20 files changed: 815 additions & 238 deletions

bench/ctable/bench_persistency.py

Lines changed: 2 additions & 2 deletions
```
@@ -182,10 +182,10 @@ def bench_append_file():
 t_ro = blosc2.CTable.open(path, mode="r")
 
 def bench_read_mem(t=t_mem_table):
-    _ = t["id"].to_numpy()
+    _ = t["id"][:]
 
 def bench_read_file(t=t_ro):
-    _ = t["id"].to_numpy()
+    _ = t["id"][:]
 
 t_m = tmin(bench_read_mem)
 t_f = tmin(bench_read_file)
```
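Both read benchmarks switch from `.to_numpy()` to slice-based column access. A minimal sketch of the new idiom, reusing the `CTable`/`field` API visible in `extend_vs_apend.py` later in this commit (the one-column schema here is illustrative, not the benchmark's real one):

```
from dataclasses import dataclass

import blosc2


@dataclass
class Row:
    id: int = blosc2.field(blosc2.int64(ge=0))


# Build a tiny in-memory table, then read the whole column back.
ct = blosc2.CTable(Row, expected_size=4)
ct.extend([[0], [1], [2], [3]])

col = ct["id"][:]  # full-column read; replaces ct["id"].to_numpy()
```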

bench/ctable/compact.py

Lines changed: 1 addition & 1 deletion
```
@@ -9,7 +9,7 @@
 # of varying fractions of the table.
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter as time
 
 import numpy as np
 
```

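This one-line swap recurs across the benchmarks below: `time.time()` is a wall clock that can jump when the system clock is adjusted and has coarse resolution on some platforms, while `time.perf_counter()` is monotonic and high-resolution, which is the right clock for short timing loops. Aliasing it as `time` keeps every call site unchanged. A minimal sketch (`do_work` is a hypothetical stand-in for the benchmarked call):

```
from time import perf_counter as time  # monotonic, high-resolution clock


def do_work():  # hypothetical placeholder for a benchmarked operation
    sum(range(100_000))


t0 = time()
do_work()
elapsed = time() - t0  # unaffected by wall-clock adjustments, unlike time.time()
print(f"{elapsed:.6f} s")
```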
bench/ctable/ctable_v_pandas.py

Lines changed: 2 additions & 2 deletions
```
@@ -12,7 +12,7 @@
 # 4. Row iteration
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter as time
 
 import numpy as np
 import pandas as pd
@@ -75,7 +75,7 @@ class Row:
 
 # 2.5 Column access (full column)
 t0 = time()
-arr = ct["score"].to_numpy()
+arr = ct["score"][:]
 t_ct_col = time() - t0
 
 t0 = time()
```

bench/ctable/delete.py

Lines changed: 1 addition & 1 deletion
```
@@ -9,7 +9,7 @@
 # int, slice, and list — with varying sizes.
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter as time
 
 import numpy as np
 
```

bench/ctable/expected_size.py

Lines changed: 1 addition & 1 deletion
```
@@ -9,7 +9,7 @@
 # is too small (M rows) vs correctly sized (N rows) during extend().
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter as time
 
 import numpy as np
 
```

bench/ctable/extend.py

Lines changed: 1 addition & 1 deletion
```
@@ -11,7 +11,7 @@
 # 3. An existing CTable (previously created from Python lists, 1M rows)
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter as time
 
 import numpy as np
 
```

bench/ctable/extend_vs_apend.py

Lines changed: 35 additions & 52 deletions
```
@@ -5,72 +5,55 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #######################################################################
 
-# Benchmark for comparing append() (row by row) vs extend() (bulk),
-# to find the crossover point where extend() becomes worth it.
+# Benchmark: append() row-by-row vs extend() bulk insert.
+#
+# Compares three strategies at increasing N to find where extend() wins:
+# 1. append() x N — one call per row, Pydantic path
+# 2. extend() x N — extend([row]) per row, one at a time
+# 3. extend() x 1 — single bulk call with all N rows
 
 from dataclasses import dataclass
-from time import time
+from time import perf_counter
 
 import blosc2
 
 
 @dataclass
 class Row:
-    id: int = blosc2.field(blosc2.int64(ge=0))
-    c_val: complex = blosc2.field(blosc2.complex128(), default=0j)
-    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
-    active: bool = blosc2.field(blosc2.bool(), default=True)
+    id: int = blosc2.field(blosc2.int64(ge=0))
+    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
+    active: bool = blosc2.field(blosc2.bool(), default=True)
 
 
-# Parameter — change N to test different crossover points
-N = 2
-print("append() vs extend() benchmark")
-for i in range(6):
-    print("\n")
-    print("%" * 100)
+SIZES = [10, 100, 1_000, 10_000, 100_000]
 
+print(f"append() vs extend() | sizes: {SIZES}")
+print()
+print(f"{'N':>10} {'append×N (s)':>14} {'extend×N (s)':>14} {'extend×1 (s)':>14} {'speedup bulk':>13}")
+print(f"{'─'*10} {'─'*14} {'─'*14} {'─'*14} {'─'*13}")
 
-    # Base data generation
-    data_list = [
-        [i, complex(i * 0.1, i * 0.01), 10.0 + (i % 100) * 0.4, i % 3 == 0] for i in range(N)
-    ]
+for N in SIZES:
+    data = [[i, float(i % 100), i % 2 == 0] for i in range(N)]
 
-    # 1. N individual append() calls
-    print(f"{N} individual append() calls")
-    ct_append = blosc2.CTable(Row, expected_size=N)
-    t0 = time()
-    for row in data_list:
-        ct_append.append(row)
-    t_append = time() - t0
-    print(f" Time: {t_append:.6f} s")
-    print(f" Rows: {len(ct_append):,}")
+    ct = blosc2.CTable(Row, expected_size=N)
+    t0 = perf_counter()
+    for row in data:
+        ct.append(row)
+    t_append = perf_counter() - t0
 
-    # 2. N individual extend() calls (one row at a time)
-    print(f"{N} individual extend() calls (one row at a time)")
-    ct_extend_one = blosc2.CTable(Row, expected_size=N)
-    t0 = time()
-    for row in data_list:
-        ct_extend_one.extend([row])
-    t_extend_one = time() - t0
-    print(f" Time: {t_extend_one:.6f} s")
-    print(f" Rows: {len(ct_extend_one):,}")
+    ct = blosc2.CTable(Row, expected_size=N)
+    t0 = perf_counter()
+    for row in data:
+        ct.extend([row])
+    t_extend_one = perf_counter() - t0
 
-    # 3. Single extend() call with all N rows at once
-    print(f"Single extend() call with all {N} rows at once")
-    ct_extend_bulk = blosc2.CTable(Row, expected_size=N)
-    t0 = time()
-    ct_extend_bulk.extend(data_list)
-    t_extend_bulk = time() - t0
-    print(f" Time: {t_extend_bulk:.6f} s")
-    print(f" Rows: {len(ct_extend_bulk):,}")
+    ct = blosc2.CTable(Row, expected_size=N)
+    t0 = perf_counter()
+    ct.extend(data)
+    t_extend_bulk = perf_counter() - t0
 
-    # Summary
-    print("=" * 70)
-    print(f"{'METHOD':<35} {'TIME (s)':>12} {'SPEEDUP vs append':>20}")
-    print("-" * 70)
-    print(f"{'append() x N':<35} {t_append:>12.6f} {'1.00x':>20}")
-    print(f"{'extend() x N (one row each)':<35} {t_extend_one:>12.6f} {t_append / t_extend_one:>19.2f}x")
-    print(f"{'extend() x 1 (all at once)':<35} {t_extend_bulk:>12.6f} {t_append / t_extend_bulk:>19.2f}x")
-    print("-" * 70)
+    speedup = t_append / t_extend_bulk if t_extend_bulk > 0 else float("inf")
+    print(f"{N:>10,} {t_append:>14.6f} {t_extend_one:>14.6f} {t_extend_bulk:>14.6f} {speedup:>12.1f}×")
 
-    N=N*2
+print()
+print("speedup bulk = append×N time / extend×1 time (higher is better for extend)")
```

bench/ctable/indexin.md

Lines changed: 191 additions & 0 deletions
# CTable Index Benchmark | N=1,000,000 REPS=5

> Random data: sensor_id uniform random in [0, 100,000)
> Sorted data: sensor_id = 0,0,…,1,1,…,2,2,… (clustered, ~10 rows/value)


## BUCKET

> Stores min/max per chunk. Can skip chunks whose range doesn't overlap the
> query. Only effective when data is sorted/clustered. Useless on random data.
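A minimal sketch of the chunk-pruning idea in plain NumPy; this is illustrative only, not the actual CTable index internals (all names are invented for the sketch):

```
import numpy as np


def range_query(chunks, lo, hi):
    # BUCKET-style pruning: consult each chunk's (min, max) and skip chunks
    # whose range cannot intersect [lo, hi]; scan only the survivors.
    hits = []
    for chunk in chunks:
        cmin, cmax = chunk.min(), chunk.max()  # a real index stores these
        if cmax < lo or cmin > hi:
            continue  # whole chunk skipped without touching its rows
        hits.append(chunk[(chunk >= lo) & (chunk <= hi)])
    return np.concatenate(hits) if hits else np.empty(0, dtype=np.int64)


rng = np.random.default_rng(0)
sorted_ids = np.sort(rng.integers(0, 100_000, 1_000_000))
chunks = np.array_split(sorted_ids, 100)
print(len(range_query(chunks, 10_000, 10_999)))
```

On sorted data most chunks fail the overlap test and are skipped outright; on random data every chunk's (min, max) spans nearly the whole domain, so nothing is pruned, which matches the ~1.0× results in the random-data table below.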
### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — BUCKET index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       922      12.8      10.1     1.3×
            1%     9,879      13.7      14.7     0.9×
            5%    49,991      17.1      17.9     1.0×
           10%    99,775      19.8      21.0     0.9×
           25%   249,376      24.0      25.0     1.0×
           50%   499,826      24.0      27.2     0.9× (slower)
           75%   749,665      23.2      27.5     0.8× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — BUCKET index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       990      11.9       2.5     4.8× ←
            1%     9,990      11.9       2.2     5.5× ←
            5%    49,990      12.0       3.1     3.9× ←
           10%    99,990      12.1       5.1     2.4× ←
           25%   249,990      11.7       9.3     1.3×
           50%   499,990      12.3      19.0     0.6× (slower)
           75%   749,990      11.9      35.9     0.3× (slower)
──────────────────────────────────────────────────────────────────────
```


## PARTIAL

> Stores exact row positions. Works on any data layout.
> Smaller index than FULL; slightly less overhead to build.
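A minimal sketch of the positional-index idea, again in plain Python rather than the real CTable structures (illustrative only):

```
import numpy as np

rng = np.random.default_rng(0)
sensor_id = rng.integers(0, 100_000, 1_000_000)

# Build: map each value to the row positions where it occurs.
positions = {}
for row, val in enumerate(sensor_id):
    positions.setdefault(int(val), []).append(row)

# An equality query is now a lookup instead of a 1M-row scan.
rows = positions.get(50_000, [])
print(len(rows))
```

Because the index stores row positions directly, data layout doesn't matter, hence the near-identical random and sorted numbers below. It also shows why wide ranges lose: a 75% range query has to materialize and gather ~750,000 positions, which costs more than one sequential scan.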
### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — PARTIAL index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       922      12.4       1.9     6.4× ←
            1%     9,879      14.4       2.5     5.8× ←
            5%    49,991      17.3       5.3     3.3× ←
           10%    99,775      20.1       8.8     2.3× ←
           25%   249,376      23.6      21.4     1.1×
           50%   499,826      26.2      46.4     0.6× (slower)
           75%   749,665      22.8      75.2     0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — PARTIAL index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       990      13.2       2.4     5.5× ←
            1%     9,990      12.8       2.0     6.4× ←
            5%    49,990      12.5       2.6     4.9× ←
           10%    99,990      12.7       4.0     3.1× ←
           25%   249,990      12.0       8.1     1.5×
           50%   499,990      11.9      18.5     0.6× (slower)
           75%   749,990      13.1      33.4     0.4× (slower)
──────────────────────────────────────────────────────────────────────
```

### Equality query — random data
```
       VALUE   ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
──────────── ────── ───────── ───────── ────────
         ==0     12      12.6       2.0     6.3× ←
    ==25,000     13      14.2       1.9     7.5× ←
    ==50,000      9      12.6       1.9     6.7× ←
    ==99,999      4      12.4       1.9     6.7× ←
```

### Equality query — sorted data
```
       VALUE   ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
──────────── ────── ───────── ───────── ────────
         ==0     10      11.8       1.9     6.3× ←
    ==25,000     10      11.7       1.8     6.7× ←
    ==50,000     10      12.0       1.7     7.0× ←
    ==99,999     10      12.1       1.7     7.1× ←
```


## FULL

> Stores exact row positions with full chunk coverage.
> Best query performance; larger index than PARTIAL.
### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — FULL index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       922      13.2       2.1     6.4× ←
            1%     9,879      15.3       2.8     5.5× ←
            5%    49,991      18.1       5.1     3.5× ←
           10%    99,775      20.5      11.0     1.9×
           25%   249,376      23.5      21.5     1.1×
           50%   499,826      25.4      46.1     0.6× (slower)
           75%   749,665      23.2      86.9     0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — FULL index
──────────────────────────────────────────────────────────────────────
   SELECTIVITY      ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
────────────── ───────── ───────── ───────── ────────
          0.1%       990      12.0       1.9     6.4× ←
            1%     9,990      12.0       2.0     6.1× ←
            5%    49,990      11.5       2.8     4.1× ←
           10%    99,990      12.0       4.2     2.9× ←
           25%   249,990      11.9       7.8     1.5×
           50%   499,990      11.8      18.5     0.6× (slower)
           75%   749,990      11.5      44.5     0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Equality query — random data
```
       VALUE   ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
──────────── ────── ───────── ───────── ────────
         ==0     12      12.1       2.5     4.8× ←
    ==25,000     13      12.0       2.0     6.1× ←
    ==50,000      9      12.4       2.0     6.2× ←
    ==99,999      4      12.6       2.0     6.4× ←
```

### Equality query — sorted data
```
       VALUE   ROWS  SCAN(ms)   IDX(ms)  SPEEDUP
──────────── ────── ───────── ───────── ────────
         ==0     10      11.7       1.8     6.5× ←
    ==25,000     10      11.5       1.7     6.6× ←
    ==50,000     10      12.4       1.7     7.1× ←
    ==99,999     10      12.3       1.8     7.0× ←
```

### Cardinality comparison — sorted data, FULL index

> Shows how repetition level affects speedup (data always sorted).
```
CARDINALITY           0.1% sel    1% sel    5% sel   10% sel
──────────────────────────────────────────────────────────────────────
High rep (10 uniq)        9.1×      9.6×      8.9×     10.1×
Med rep  (1k uniq)        8.5×      6.2×      4.3×      3.5×
Low rep  (1M uniq)        6.4×      5.9×      4.2×      3.2×
──────────────────────────────────────────────────────────────────────
(speedup — higher is better)
```

### Compound filter — sorted data, FULL index

> sensor_id > X AND region == Y | region in [0,8) → ~12.5% per value
```
────────────────────────────────────────────────────────────────────────────────
         QUERY     ROWS    NO IDX   IDX:sid   IDX:reg     2 IDX         BEST
────────────── ──────── ───────── ───────── ───────── ───────── ────────────
    0.1%+12.5%      127    14.6ms     2.6ms    15.0ms    14.4ms    sid(5.6×)
      1%+12.5%    1,297    14.7ms     2.6ms    15.2ms    17.2ms    sid(5.7×)
      5%+12.5%    6,268    16.2ms     4.5ms    16.8ms    20.3ms    sid(3.6×)
     10%+12.5%   12,377    19.5ms     6.2ms    19.6ms    21.0ms    sid(3.2×)
────────────────────────────────────────────────────────────────────────────────
```
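The BEST column reflects a simple plan: consult only the selective sensor_id index, then verify region on the few surviving rows. Using the unselective region index alone, or intersecting two large position lists ("2 IDX"), does more work than it saves. A hedged sketch of that residual-filter plan with NumPy stand-ins (not the actual CTable query planner):

```
import numpy as np

rng = np.random.default_rng(0)
sensor_id = np.sort(rng.integers(0, 100_000, 1_000_000))  # sorted, as in the table
region = rng.integers(0, 8, 1_000_000)

# Step 1: selective predicate first. On sorted data, searchsorted stands in
# for the sensor_id index lookup: sensor_id > 99,900 selects ~0.1% of rows.
start = np.searchsorted(sensor_id, 99_900, side="right")
candidates = np.arange(start, len(sensor_id))

# Step 2: residual filter. Evaluate region == 3 only on the candidates.
hits = candidates[region[candidates] == 3]
print(len(hits))

# A "2 IDX" plan would materialize ~125,000 positions for region == 3 and
# intersect them with the candidates, which is more work than the residual
# check when the first predicate is this selective.
```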
