88from __future__ import annotations
99
1010import argparse
11+ import gc
1112import math
1213import os
1314import re
@@ -271,6 +272,9 @@ def build_duckdb_file(
271272 * ,
272273 layout : str ,
273274 batch_size : int ,
275+ duckdb_threads : int ,
276+ duckdb_memory_limit : str | None ,
277+ preserve_insertion_order : bool ,
274278) -> float :
275279 path .parent .mkdir (parents = True , exist_ok = True )
276280 _remove_duckdb_path (path )
@@ -283,7 +287,10 @@ def build_duckdb_file(
283287 start_time = time .perf_counter ()
284288 con = duckdb .connect (str (path ))
285289 try :
286- con .execute ("PRAGMA threads=8" )
290+ con .execute (f"PRAGMA threads={ int (duckdb_threads )} " )
291+ con .execute (f"SET preserve_insertion_order={ 'true' if preserve_insertion_order else 'false' } " )
292+ if duckdb_memory_limit is not None :
293+ con .execute (f"SET memory_limit = { duckdb_memory_limit !r} " )
287294 con .execute (f"CREATE TABLE data (id { id_type } , payload FLOAT)" )
288295 for start in range (0 , size , batch_size ):
289296 stop = min (start + batch_size , size )
@@ -305,6 +312,16 @@ def build_duckdb_file(
305312 con .execute ("INSERT INTO data SELECT * FROM batch_arrow" )
306313 con .unregister ("batch_arrow" )
307314
315+ # For random distributions, random_ids can be very large (e.g. 8 GB
316+ # for 1G float64 rows). Release it, and the final Arrow batch objects,
317+ # before building DuckDB's ART index; otherwise both the source payload
318+ # and DuckDB's index builder compete for memory.
319+ random_ids = None
320+ ids = None
321+ payload = None
322+ batch = None
323+ gc .collect ()
324+
308325 if layout == "art-index" :
309326 con .execute ("CREATE INDEX data_id_idx ON data(id)" )
310327 elif layout != "zonemap" :
@@ -324,10 +341,23 @@ def _open_or_build_duckdb_file(
324341 * ,
325342 layout : str ,
326343 batch_size : int ,
344+ duckdb_threads : int ,
345+ duckdb_memory_limit : str | None ,
346+ preserve_insertion_order : bool ,
327347) -> float :
328348 if _valid_duckdb_file (path , layout ):
329349 return 0.0
330- return build_duckdb_file (size , dist , id_dtype , path , layout = layout , batch_size = batch_size )
350+ return build_duckdb_file (
351+ size ,
352+ dist ,
353+ id_dtype ,
354+ path ,
355+ layout = layout ,
356+ batch_size = batch_size ,
357+ duckdb_threads = duckdb_threads ,
358+ duckdb_memory_limit = duckdb_memory_limit ,
359+ preserve_insertion_order = preserve_insertion_order ,
360+ )
331361
332362
333363def _query_bounds (size : int , query_width : int , dtype : np .dtype ) -> tuple [object , object ]:
@@ -421,9 +451,22 @@ def benchmark_layout(
421451 batch_size : int ,
422452 repeats : int ,
423453 exact_query : bool ,
454+ duckdb_threads : int ,
455+ duckdb_memory_limit : str | None ,
456+ preserve_insertion_order : bool ,
424457) -> dict :
425458 path = duckdb_path (outdir , size , dist , id_dtype , layout , batch_size )
426- create_s = _open_or_build_duckdb_file (size , dist , id_dtype , path , layout = layout , batch_size = batch_size )
459+ create_s = _open_or_build_duckdb_file (
460+ size ,
461+ dist ,
462+ id_dtype ,
463+ path ,
464+ layout = layout ,
465+ batch_size = batch_size ,
466+ duckdb_threads = duckdb_threads ,
467+ duckdb_memory_limit = duckdb_memory_limit ,
468+ preserve_insertion_order = preserve_insertion_order ,
469+ )
427470 lo , hi = _query_bounds (size , query_width , id_dtype )
428471
429472 cold_scan_elapsed , warm_scan_elapsed , third_scan_elapsed , scan_rows = benchmark_scan_once (
@@ -469,7 +512,7 @@ def benchmark_layout(
469512
470513def parse_human_int (value : str ) -> int :
471514 value = value .strip ().lower ().replace ("_" , "" )
472- multipliers = {"k" : 1_000 , "m" : 1_000_000 }
515+ multipliers = {"k" : 1_000 , "m" : 1_000_000 , "g" : 1_000_000_000 }
473516 if value [- 1 :] in multipliers :
474517 return int (float (value [:- 1 ]) * multipliers [value [- 1 ]])
475518 return int (value )
@@ -484,11 +527,16 @@ def print_results(
484527 query_width : int ,
485528 id_dtype : np .dtype ,
486529 exact_query : bool ,
530+ duckdb_threads : int ,
531+ duckdb_memory_limit : str | None ,
532+ preserve_insertion_order : bool ,
487533) -> None :
488534 print ("DuckDB range-query benchmark via SQL filtered reads" )
489535 print (
490536 f"batch_size={ batch_size :,} , repeats={ repeats } , dist={ dist } , query_width={ query_width :,} , "
491- f"dtype={ id_dtype .name } , query_single_value={ exact_query } "
537+ f"dtype={ id_dtype .name } , query_single_value={ exact_query } , duckdb_threads={ duckdb_threads } , "
538+ f"duckdb_memory_limit={ 'auto' if duckdb_memory_limit is None else duckdb_memory_limit } , "
539+ f"preserve_insertion_order={ preserve_insertion_order } "
492540 )
493541 print ("Note: 'zonemap' is DuckDB's default table layout with automatic min/max pruning." )
494542 print (" 'art-index' adds an explicit secondary index on id." )
@@ -561,10 +609,29 @@ def main() -> None:
561609 help = "Batch size used while loading the table. Default: 1.25M." ,
562610 )
563611 parser .add_argument ("--repeats" , type = int , default = DEFAULT_REPEATS , help = "Benchmark repeats. Default: 3." )
612+ parser .add_argument (
613+ "--duckdb-threads" ,
614+ type = int ,
615+ default = 8 ,
616+ help = "DuckDB thread count used while building tables/indexes. Default: 8." ,
617+ )
618+ parser .add_argument (
619+ "--duckdb-memory-limit" ,
620+ default = None ,
621+ help = "Optional DuckDB memory_limit setting, e.g. '12GB'. Default: DuckDB auto limit." ,
622+ )
623+ parser .add_argument (
624+ "--preserve-insertion-order" ,
625+ action = argparse .BooleanOptionalAction ,
626+ default = False ,
627+ help = "DuckDB preserve_insertion_order setting during build. Default: false to reduce memory." ,
628+ )
564629 args = parser .parse_args ()
565630
566631 if args .query_single_value and args .query_width != 1 :
567632 raise ValueError ("--query-single-value requires --query-width 1" )
633+ if args .duckdb_threads <= 0 :
634+ raise ValueError ("--duckdb-threads must be positive" )
568635
569636 id_dtype = np .dtype (args .dtype )
570637 sizes = SIZES if args .size == "all" else (parse_human_int (args .size ),)
@@ -586,6 +653,9 @@ def main() -> None:
586653 args .batch_size ,
587654 args .repeats ,
588655 args .query_single_value ,
656+ args .duckdb_threads ,
657+ args .duckdb_memory_limit ,
658+ args .preserve_insertion_order ,
589659 )
590660 )
591661
@@ -597,6 +667,9 @@ def main() -> None:
597667 query_width = args .query_width ,
598668 id_dtype = id_dtype ,
599669 exact_query = args .query_single_value ,
670+ duckdb_threads = args .duckdb_threads ,
671+ duckdb_memory_limit = args .duckdb_memory_limit ,
672+ preserve_insertion_order = args .preserve_insertion_order ,
600673 )
601674
602675
0 commit comments