Skip to content

Commit 9d542ec

Browse files
committed
Merge branch 'main' into ctable-varlen-cols
2 parents f205a1e + a2ad048 commit 9d542ec

21 files changed

Lines changed: 3097 additions & 714 deletions

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ else()
137137
include(FetchContent)
138138
FetchContent_Declare(blosc2
139139
GIT_REPOSITORY https://github.com/Blosc/c-blosc2
140-
GIT_TAG fac9c5a476d813355311ff76aeab5e0439a2e486
140+
GIT_TAG 96cf6006c5ade15a1743de73ebbaa08050c5085f
141141
# SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2
142142
)
143143
FetchContent_MakeAvailable(blosc2)

ROADMAP-TO-4.0.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ The constructor for the `Table` object should take some parameters to specify pr
2222
* `.where()`: an iterator for querying with conditions that are evaluated with the internal compute engine.
2323
* `.index()` for indexing a column and getting better performance in queries (desirable, but optional for 4.0).
2424

25-
In particular, it should try to mimic much of the functionality of data-querying libraries such as ``pandas`` (see [this blog](https://datapythonista.me/blog/whats-new-in-pandas-3) for much of the followin). Hence, one should be able to filter rows of the `Table` via querying on multiple columns (accessed via `.` or perhaps ``__getitem__``), with conditions to select rows implemented via `.index`, `.where` like so
25+
In particular, it should try to mimic much of the functionality of data-querying libraries such as ``pandas`` (see [this blog](https://datapythonista.me/blog/whats-new-in-pandas-3) for much of the following). Hence, one should be able to filter rows of the `Table` via querying on multiple columns (accessed via `.` or perhaps ``__getitem__``), with conditions to select rows implemented via `.index`, `.where` like so
2626

2727
```
2828
tbl.where((tbl.property_type == "hotel") & (tbl.country == "us"))

bench/indexing/duckdb_query_bench.py

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from __future__ import annotations
99

1010
import argparse
11+
import gc
1112
import math
1213
import os
1314
import re
@@ -271,6 +272,9 @@ def build_duckdb_file(
271272
*,
272273
layout: str,
273274
batch_size: int,
275+
duckdb_threads: int,
276+
duckdb_memory_limit: str | None,
277+
preserve_insertion_order: bool,
274278
) -> float:
275279
path.parent.mkdir(parents=True, exist_ok=True)
276280
_remove_duckdb_path(path)
@@ -283,7 +287,10 @@ def build_duckdb_file(
283287
start_time = time.perf_counter()
284288
con = duckdb.connect(str(path))
285289
try:
286-
con.execute("PRAGMA threads=8")
290+
con.execute(f"PRAGMA threads={int(duckdb_threads)}")
291+
con.execute(f"SET preserve_insertion_order={'true' if preserve_insertion_order else 'false'}")
292+
if duckdb_memory_limit is not None:
293+
con.execute(f"SET memory_limit = {duckdb_memory_limit!r}")
287294
con.execute(f"CREATE TABLE data (id {id_type}, payload FLOAT)")
288295
for start in range(0, size, batch_size):
289296
stop = min(start + batch_size, size)
@@ -305,6 +312,16 @@ def build_duckdb_file(
305312
con.execute("INSERT INTO data SELECT * FROM batch_arrow")
306313
con.unregister("batch_arrow")
307314

315+
# For random distributions, random_ids can be very large (e.g. 8 GB
316+
# for 1G float64 rows). Release it, and the final Arrow batch objects,
317+
# before building DuckDB's ART index; otherwise both the source payload
318+
# and DuckDB's index builder compete for memory.
319+
random_ids = None
320+
ids = None
321+
payload = None
322+
batch = None
323+
gc.collect()
324+
308325
if layout == "art-index":
309326
con.execute("CREATE INDEX data_id_idx ON data(id)")
310327
elif layout != "zonemap":
@@ -324,10 +341,23 @@ def _open_or_build_duckdb_file(
324341
*,
325342
layout: str,
326343
batch_size: int,
344+
duckdb_threads: int,
345+
duckdb_memory_limit: str | None,
346+
preserve_insertion_order: bool,
327347
) -> float:
328348
if _valid_duckdb_file(path, layout):
329349
return 0.0
330-
return build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size)
350+
return build_duckdb_file(
351+
size,
352+
dist,
353+
id_dtype,
354+
path,
355+
layout=layout,
356+
batch_size=batch_size,
357+
duckdb_threads=duckdb_threads,
358+
duckdb_memory_limit=duckdb_memory_limit,
359+
preserve_insertion_order=preserve_insertion_order,
360+
)
331361

332362

333363
def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]:
@@ -421,9 +451,22 @@ def benchmark_layout(
421451
batch_size: int,
422452
repeats: int,
423453
exact_query: bool,
454+
duckdb_threads: int,
455+
duckdb_memory_limit: str | None,
456+
preserve_insertion_order: bool,
424457
) -> dict:
425458
path = duckdb_path(outdir, size, dist, id_dtype, layout, batch_size)
426-
create_s = _open_or_build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size)
459+
create_s = _open_or_build_duckdb_file(
460+
size,
461+
dist,
462+
id_dtype,
463+
path,
464+
layout=layout,
465+
batch_size=batch_size,
466+
duckdb_threads=duckdb_threads,
467+
duckdb_memory_limit=duckdb_memory_limit,
468+
preserve_insertion_order=preserve_insertion_order,
469+
)
427470
lo, hi = _query_bounds(size, query_width, id_dtype)
428471

429472
cold_scan_elapsed, warm_scan_elapsed, third_scan_elapsed, scan_rows = benchmark_scan_once(
@@ -469,7 +512,7 @@ def benchmark_layout(
469512

470513
def parse_human_int(value: str) -> int:
471514
value = value.strip().lower().replace("_", "")
472-
multipliers = {"k": 1_000, "m": 1_000_000}
515+
multipliers = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000}
473516
if value[-1:] in multipliers:
474517
return int(float(value[:-1]) * multipliers[value[-1]])
475518
return int(value)
@@ -484,11 +527,16 @@ def print_results(
484527
query_width: int,
485528
id_dtype: np.dtype,
486529
exact_query: bool,
530+
duckdb_threads: int,
531+
duckdb_memory_limit: str | None,
532+
preserve_insertion_order: bool,
487533
) -> None:
488534
print("DuckDB range-query benchmark via SQL filtered reads")
489535
print(
490536
f"batch_size={batch_size:,}, repeats={repeats}, dist={dist}, query_width={query_width:,}, "
491-
f"dtype={id_dtype.name}, query_single_value={exact_query}"
537+
f"dtype={id_dtype.name}, query_single_value={exact_query}, duckdb_threads={duckdb_threads}, "
538+
f"duckdb_memory_limit={'auto' if duckdb_memory_limit is None else duckdb_memory_limit}, "
539+
f"preserve_insertion_order={preserve_insertion_order}"
492540
)
493541
print("Note: 'zonemap' is DuckDB's default table layout with automatic min/max pruning.")
494542
print(" 'art-index' adds an explicit secondary index on id.")
@@ -561,10 +609,29 @@ def main() -> None:
561609
help="Batch size used while loading the table. Default: 1.25M.",
562610
)
563611
parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Benchmark repeats. Default: 3.")
612+
parser.add_argument(
613+
"--duckdb-threads",
614+
type=int,
615+
default=8,
616+
help="DuckDB thread count used while building tables/indexes. Default: 8.",
617+
)
618+
parser.add_argument(
619+
"--duckdb-memory-limit",
620+
default=None,
621+
help="Optional DuckDB memory_limit setting, e.g. '12GB'. Default: DuckDB auto limit.",
622+
)
623+
parser.add_argument(
624+
"--preserve-insertion-order",
625+
action=argparse.BooleanOptionalAction,
626+
default=False,
627+
help="DuckDB preserve_insertion_order setting during build. Default: false to reduce memory.",
628+
)
564629
args = parser.parse_args()
565630

566631
if args.query_single_value and args.query_width != 1:
567632
raise ValueError("--query-single-value requires --query-width 1")
633+
if args.duckdb_threads <= 0:
634+
raise ValueError("--duckdb-threads must be positive")
568635

569636
id_dtype = np.dtype(args.dtype)
570637
sizes = SIZES if args.size == "all" else (parse_human_int(args.size),)
@@ -586,6 +653,9 @@ def main() -> None:
586653
args.batch_size,
587654
args.repeats,
588655
args.query_single_value,
656+
args.duckdb_threads,
657+
args.duckdb_memory_limit,
658+
args.preserve_insertion_order,
589659
)
590660
)
591661

@@ -597,6 +667,9 @@ def main() -> None:
597667
query_width=args.query_width,
598668
id_dtype=id_dtype,
599669
exact_query=args.query_single_value,
670+
duckdb_threads=args.duckdb_threads,
671+
duckdb_memory_limit=args.duckdb_memory_limit,
672+
preserve_insertion_order=args.preserve_insertion_order,
600673
)
601674

602675

0 commit comments

Comments
 (0)