3737THREADS = os .environ .get ("DEGLIB_THREADS" , "8" )
3838
3939# --- per-dataset parameter profiles ------------------------------------------
40- # The graph is built ONCE per profile; max_dist is swept to yield several
41- # operating points (build/recall trade-off) from that single build. Unknown
40+ # Each dataset maps to a LIST of configs; every config is one binary invocation
41+ # (one graph build) that emits one or more operating points via its max_dist (and,
42+ # for task 2, eps_search) sweep. Configs that share build params are grouped into a
43+ # single sweep; configs with distinct build params build a separate graph. Unknown
4244# datasets fail fast (see _profile_or_die) rather than silently using bad params.
45+ #
46+ # Task 1 config keys: mode, non_zeros, k_graph, k_ext, eps_ext, prune_worst, evpK,
47+ # max_dist (list). The submission slots are the tuned (mode4 + mode7) configs from
48+ # the old python/submission_task1_*.py — 15 operating points per submission dataset (the gooaq-small smoke test has 3).
4349TASK1_PROFILES = {
44- # 1024-dim BGE-M3 (normalised, inner product)
45- "wikipedia-small" : dict (mode = "mode4" , non_zeros = 576 , k_graph = 22 , k_ext = 29 ,
46- eps_ext = 0.001 , prune_worst = 10 , evpK = 113 ,
47- max_dist = [200 , 300 , 400 , 500 , 700 , 900 ]),
48- "wikipedia" : dict (mode = "mode4" , non_zeros = 608 , k_graph = 26 , k_ext = 32 ,
49- eps_ext = 0.002 , prune_worst = 9 , evpK = 50 ,
50- max_dist = [500 , 700 , 900 , 1200 , 1400 ]),
50+ # 1024-dim BGE-M3 (normalized, inner product) — 200K dev set; 15 tuned slots.
51+ "wikipedia-small" : [
52+ dict (mode = "mode4" , non_zeros = 768 , k_graph = 16 , k_ext = 19 , eps_ext = 0.001 , prune_worst = 2 , evpK = 28 , max_dist = [576 ]), # slot 1 ~0.757
53+ dict (mode = "mode4" , non_zeros = 704 , k_graph = 14 , k_ext = 29 , eps_ext = 0.001 , prune_worst = 3 , evpK = 31 , max_dist = [707 ]), # slot 2 ~0.787
54+ dict (mode = "mode4" , non_zeros = 704 , k_graph = 18 , k_ext = 26 , eps_ext = 0.001 , prune_worst = 4 , evpK = 37 , max_dist = [404 ]), # slot 3 ~0.806
55+ dict (mode = "mode4" , non_zeros = 768 , k_graph = 14 , k_ext = 38 , eps_ext = 0.001 , prune_worst = 3 , evpK = 37 , max_dist = [666 ]), # slot 4 ~0.815
56+ dict (mode = "mode4" , non_zeros = 640 , k_graph = 16 , k_ext = 37 , eps_ext = 0.001 , prune_worst = 8 , evpK = 38 , max_dist = [620 ]), # slot 5 ~0.826
57+ dict (mode = "mode4" , non_zeros = 704 , k_graph = 16 , k_ext = 31 , eps_ext = 0.001 , prune_worst = 3 , evpK = 86 , max_dist = [431 ]), # slot 6 ~0.831
58+ dict (mode = "mode4" , non_zeros = 384 , k_graph = 12 , k_ext = 16 , eps_ext = 0.001 , prune_worst = 6 , evpK = 95 , max_dist = [742 ]), # slot 7 ~0.838
59+ dict (mode = "mode4" , non_zeros = 768 , k_graph = 18 , k_ext = 41 , eps_ext = 0.001 , prune_worst = 9 , evpK = 78 , max_dist = [586 ]), # slot 8 ~0.881
60+ dict (mode = "mode4" , non_zeros = 576 , k_graph = 16 , k_ext = 74 , eps_ext = 0.001 , prune_worst = 6 , evpK = 48 , max_dist = [561 ]), # slot 9 ~0.888
61+ dict (mode = "mode4" , non_zeros = 576 , k_graph = 22 , k_ext = 29 , eps_ext = 0.001 , prune_worst = 10 , evpK = 113 , max_dist = [533 ]), # slot 10 ~0.894
62+ dict (mode = "mode7" , non_zeros = 768 , k_graph = 20 , k_ext = 18 , eps_ext = 0.001 , prune_worst = 10 , evpK = 41 , max_dist = [220 ]), # slot 11 ~0.776
63+ dict (mode = "mode7" , non_zeros = 704 , k_graph = 22 , k_ext = 20 , eps_ext = 0.001 , prune_worst = 11 , evpK = 35 , max_dist = [192 ]), # slot 12 ~0.783
64+ dict (mode = "mode7" , non_zeros = 640 , k_graph = 18 , k_ext = 48 , eps_ext = 0.001 , prune_worst = 9 , evpK = 33 , max_dist = [221 ]), # slot 13 ~0.815
65+ dict (mode = "mode7" , non_zeros = 640 , k_graph = 26 , k_ext = 27 , eps_ext = 0.001 , prune_worst = 12 , evpK = 33 , max_dist = [226 ]), # slot 14 ~0.840
66+ dict (mode = "mode7" , non_zeros = 768 , k_graph = 20 , k_ext = 39 , eps_ext = 0.001 , prune_worst = 10 , evpK = 77 , max_dist = [384 ]), # slot 15 ~0.859
67+ ],
68+ # 6.35M BGE-M3 — submission slots (eps_ext=0.002). Slots 1-8 share one build and
69+ # slots 12-14 share another (max_dist sweeps); slots 9, 10, 11 and 15 each get their own build.
70+ "wikipedia" : [
71+ dict (mode = "mode4" , non_zeros = 608 , k_graph = 26 , k_ext = 32 , eps_ext = 0.002 , prune_worst = 9 , evpK = 50 , max_dist = [500 , 600 , 700 , 800 , 900 , 1000 , 1200 , 1400 ]), # slots 1-8
72+ dict (mode = "mode4" , non_zeros = 512 , k_graph = 32 , k_ext = 24 , eps_ext = 0.002 , prune_worst = 11 , evpK = 50 , max_dist = [900 ]), # slot 9
73+ dict (mode = "mode4" , non_zeros = 512 , k_graph = 32 , k_ext = 24 , eps_ext = 0.002 , prune_worst = 11 , evpK = 100 , max_dist = [800 ]), # slot 10
74+ dict (mode = "mode7" , non_zeros = 576 , k_graph = 28 , k_ext = 34 , eps_ext = 0.002 , prune_worst = 10 , evpK = 50 , max_dist = [400 ]), # slot 11
75+ dict (mode = "mode7" , non_zeros = 512 , k_graph = 32 , k_ext = 24 , eps_ext = 0.002 , prune_worst = 11 , evpK = 50 , max_dist = [400 , 500 , 600 ]), # slots 12-14
76+ dict (mode = "mode7" , non_zeros = 576 , k_graph = 28 , k_ext = 34 , eps_ext = 0.002 , prune_worst = 10 , evpK = 75 , max_dist = [800 ]), # slot 15
77+ ],
5178 # 384-dim gooaq spot-check (different family; smoke test only, non_zeros<dim)
52- "gooaq-small" : dict ( mode = "mode4" , non_zeros = 300 , k_graph = 24 , k_ext = 24 ,
53- eps_ext = 0.001 , prune_worst = 8 , evpK = 50 ,
54- max_dist = [ 200 , 400 , 800 ]) ,
79+ "gooaq-small" : [
80+ dict ( mode = "mode4" , non_zeros = 300 , k_graph = 24 , k_ext = 24 , eps_ext = 0.001 , prune_worst = 8 , evpK = 50 , max_dist = [ 200 , 400 , 800 ]) ,
81+ ] ,
5582}
5683
57- # Task 2 (MIPS): graph built once (single-threaded per the rules), then a
58- # (eps_search x max_dist) sweep yields the operating points. mode5 = L2-build +
59- # FP16 inner-product search; FLAS pre-sort improves the build.
84+ # Task 2 (MIPS): each config builds the graph once (single-threaded per the rules),
85+ # then sweeps (eps_search x max_dist). The submission candidates are mode5 (L2-build +
86+ # FP16 inner-product search) and mode7 (L2 d+2 build + FP16 L2 search), both with FLAS.
87+ # Task 2 config keys: mode, k_graph, k_ext, eps_ext, build_threads, use_flas, num_runs,
88+ # max_dist (list), eps_search (list).
6089TASK2_PROFILES = {
61- # 128-dim Llama-3 attention (unnormalised inner product)
62- "llama-dev" : dict (mode = "mode5" , k_graph = 32 , k_ext = 64 , eps_ext = 0.001 , build_threads = 1 ,
63- use_flas = True , num_runs = 3 ,
64- max_dist = [5000 , 6000 , 7000 , 8000 ], eps_search = [0.18 , 0.19 , 0.2 ]),
90+ # 128-dim Llama-3 attention (unnormalized inner product) — submission candidates.
91+ "llama-dev" : [
92+ dict (mode = "mode5" , k_graph = 32 , k_ext = 64 , eps_ext = 0.001 , build_threads = 1 , use_flas = True ,
93+ num_runs = 10 , max_dist = [5000 , 6000 , 7000 , 8000 ], eps_search = [0.18 ]),
94+ dict (mode = "mode7" , k_graph = 32 , k_ext = 64 , eps_ext = 0.001 , build_threads = 1 , use_flas = True ,
95+ num_runs = 10 , max_dist = [5000 , 5500 , 6000 , 6200 , 6300 , 6500 , 7000 ], eps_search = [0.007 ]),
96+ ],
6597 # spot-check (14k vectors); smoke test only
66- "llama-small" : dict (mode = "mode5" , k_graph = 32 , k_ext = 64 , eps_ext = 0.001 , build_threads = 1 ,
67- use_flas = True , num_runs = 1 ,
68- max_dist = [2000 , 4000 , 8000 ], eps_search = [0.2 , 0.3 ]),
98+ "llama-small" : [
99+ dict (mode = "mode5" , k_graph = 32 , k_ext = 64 , eps_ext = 0.001 , build_threads = 1 , use_flas = True ,
100+ num_runs = 1 , max_dist = [2000 , 4000 , 8000 ], eps_search = [0.2 , 0.3 ]),
101+ ],
69102}
70103
71104
@@ -190,61 +223,66 @@ def _require_binary():
190223def run_task1 (input_path , cfg , output_dir ):
191224 dataset = cfg ["dataset_name" ]
192225 k = int (cfg .get ("k" , 15 ))
193- profile = _profile_or_die (TASK1_PROFILES , dataset , "task1" )
226+ configs = _profile_or_die (TASK1_PROFILES , dataset , "task1" )
194227 _require_binary ()
195- print (f"[task1] dataset={ dataset } mode={ profile ['mode' ]} "
196- f"non_zeros={ profile ['non_zeros' ]} max_dist={ profile ['max_dist' ]} " )
228+ print (f"[task1] dataset={ dataset } : { len (configs )} config(s) / build(s)" )
197229
198230 bin_input , tmp = maybe_decompress (input_path , ["train" ])
199- op_dir = None
231+ op_root = None
200232 try :
201- op_dir = tempfile .mkdtemp (prefix = "deglib_op_" )
202- cmd = [
203- DEGLIB_BIN , "task1" , bin_input , profile ["mode" ],
204- "--threads" , THREADS , "--k-top" , str (k ),
205- "--non-zeros" , str (profile ["non_zeros" ]),
206- "--k-graph" , str (profile ["k_graph" ]),
207- "--k-ext" , str (profile ["k_ext" ]),
208- "--eps-ext" , str (profile ["eps_ext" ]),
209- "--evpK" , str (profile ["evpK" ]),
210- "--max-dist" , "," .join (str (m ) for m in profile ["max_dist" ]),
211- "--prune-worst" , str (profile ["prune_worst" ]),
212- "--no-recall" , "--output" , op_dir ,
213- ]
214- print ("[task1] running:" , " " .join (cmd ))
215- subprocess .run (cmd , check = True )
216-
217- ops = sorted (Path (op_dir ).glob ("op_*.bin" ))
218- if not ops :
219- sys .exit ("Error: deglib produced no operating-point files for task1." )
233+ op_root = tempfile .mkdtemp (prefix = "deglib_op_" )
220234 sentinel = np .iinfo (np .uint32 ).max
221- for op in ops :
222- n , kk , t_build , t_explore , ids , dists = read_op_file (op )
223- ids = ids .astype (np .int64 )
224- # Padding slots (fewer than k candidates) -> map to the node's own
225- # 0-based id; after +1 that is a harmless duplicate of the self column,
226- # keeping every id a valid 1-based label (and never overflowing int32).
227- row0 = np .arange (n , dtype = np .int64 )[:, None ]
228- ids = np .where (ids == sentinel , row0 , ids )
229- # 0-based ids -> 1-based, then prepend the self-reference at column 0
230- # (k+1 columns) to match the ground-truth layout the evaluator uses.
231- self_ids = np .arange (1 , n + 1 , dtype = np .int64 )[:, None ]
232- knns = np .concatenate ([self_ids , ids + 1 ], axis = 1 ).astype (np .int32 )
233- self_d = np .zeros ((n , 1 ), dtype = np .float32 )
234- out_d = np .concatenate ([self_d , dists ], axis = 1 )
235- mobj = re .search (r"op_evpK(\d+)_md(\d+)" , op .name )
236- evpK , md = mobj .group (1 ), mobj .group (2 )
237- params = (f"mode={ profile ['mode' ]} ,non_zeros={ profile ['non_zeros' ]} ,"
238- f"k_graph={ profile ['k_graph' ]} ,k_ext={ profile ['k_ext' ]} ,"
239- f"prune_worst={ profile ['prune_worst' ]} ,evpK={ evpK } ,max_dist={ md } " )
240- # Task 1 is scored on construction time: buildtime = build + explore.
241- buildtime = t_build + t_explore
242- fn = os .path .join (output_dir , f"deglib_evpK{ evpK } _md{ md } .h5" )
243- store_results (fn , ALGO , dataset , "task1" , out_d , knns , buildtime , 0.0 , params )
244- print (f" wrote { fn } buildtime={ buildtime :.3f} s knns={ knns .shape } " )
235+ for ci , c in enumerate (configs ):
236+ # One binary invocation per config = one graph build + its own sweep.
237+ op_dir = os .path .join (op_root , f"c{ ci } " )
238+ os .makedirs (op_dir , exist_ok = True )
239+ cmd = [
240+ DEGLIB_BIN , "task1" , bin_input , c ["mode" ],
241+ "--threads" , THREADS , "--k-top" , str (k ),
242+ "--non-zeros" , str (c ["non_zeros" ]),
243+ "--k-graph" , str (c ["k_graph" ]),
244+ "--k-ext" , str (c ["k_ext" ]),
245+ "--eps-ext" , str (c ["eps_ext" ]),
246+ "--evpK" , str (c ["evpK" ]),
247+ "--max-dist" , "," .join (str (m ) for m in c ["max_dist" ]),
248+ "--prune-worst" , str (c ["prune_worst" ]),
249+ "--no-recall" , "--output" , op_dir ,
250+ ]
251+ print (f"[task1] config { ci + 1 } /{ len (configs )} ({ c ['mode' ]} ):" , " " .join (cmd ))
252+ subprocess .run (cmd , check = True )
253+
254+ ops = sorted (Path (op_dir ).glob ("op_*.bin" ))
255+ if not ops :
256+ sys .exit (f"Error: deglib produced no operating-point files for task1 config { ci } ." )
257+ for op in ops :
258+ n , kk , t_build , t_explore , ids , dists = read_op_file (op )
259+ ids = ids .astype (np .int64 )
260+ # Padding slots (fewer than k candidates) -> map to the node's own
261+ # 0-based id; after +1 that is a harmless duplicate of the self column,
262+ # keeping every id a valid 1-based label (and never overflowing int32).
263+ row0 = np .arange (n , dtype = np .int64 )[:, None ]
264+ ids = np .where (ids == sentinel , row0 , ids )
265+ # 0-based ids -> 1-based, then prepend the self-reference at column 0
266+ # (k+1 columns) to match the ground-truth layout the evaluator uses.
267+ self_ids = np .arange (1 , n + 1 , dtype = np .int64 )[:, None ]
268+ knns = np .concatenate ([self_ids , ids + 1 ], axis = 1 ).astype (np .int32 )
269+ self_d = np .zeros ((n , 1 ), dtype = np .float32 )
270+ out_d = np .concatenate ([self_d , dists ], axis = 1 )
271+ mobj = re .search (r"op_evpK(\d+)_md(\d+)" , op .name )
272+ evpK , md = mobj .group (1 ), mobj .group (2 )
273+ params = (f"mode={ c ['mode' ]} ,non_zeros={ c ['non_zeros' ]} ,"
274+ f"k_graph={ c ['k_graph' ]} ,k_ext={ c ['k_ext' ]} ,"
275+ f"prune_worst={ c ['prune_worst' ]} ,evpK={ evpK } ,max_dist={ md } " )
276+ # Task 1 is scored on construction time: buildtime = build + explore.
277+ buildtime = t_build + t_explore
278+ # Config index keeps filenames unique across builds that happen to
279+ # share an (evpK, max_dist) pair (e.g. mode4 vs mode7 at the same md).
280+ fn = os .path .join (output_dir , f"deglib_c{ ci } _evpK{ evpK } _md{ md } .h5" )
281+ store_results (fn , ALGO , dataset , "task1" , out_d , knns , buildtime , 0.0 , params )
282+ print (f" wrote { fn } buildtime={ buildtime :.3f} s knns={ knns .shape } " )
245283 finally :
246- if op_dir is not None :
247- shutil .rmtree (op_dir , ignore_errors = True )
284+ if op_root is not None :
285+ shutil .rmtree (op_root , ignore_errors = True )
248286 if tmp :
249287 try :
250288 os .unlink (tmp )
@@ -256,55 +294,59 @@ def run_task2(input_path, cfg, output_dir):
256294 dataset = cfg ["dataset_name" ]
257295 k = int (cfg .get ("k" , 30 ))
258296 queries_key = cfg .get ("queries" , "test/queries" )
259- profile = _profile_or_die (TASK2_PROFILES , dataset , "task2" )
297+ configs = _profile_or_die (TASK2_PROFILES , dataset , "task2" )
260298 _require_binary ()
261- print (f"[task2] dataset={ dataset } mode={ profile ['mode' ]} flas={ profile .get ('use_flas' )} "
262- f"eps_search={ profile ['eps_search' ]} max_dist={ profile ['max_dist' ]} " )
299+ print (f"[task2] dataset={ dataset } : { len (configs )} config(s) / build(s)" )
263300
264301 bin_input , tmp = maybe_decompress (input_path , ["train" , queries_key ])
265- op_dir = None
302+ op_root = None
266303 try :
267- op_dir = tempfile .mkdtemp (prefix = "deglib_op_" )
268- cmd = [
269- DEGLIB_BIN , "task2" , bin_input , profile ["mode" ],
270- "--threads" , THREADS , "--build-threads" , str (profile ["build_threads" ]),
271- "--k-top" , str (k ), "--k-graph" , str (profile ["k_graph" ]),
272- "--k-ext" , str (profile ["k_ext" ]), "--eps-ext" , str (profile ["eps_ext" ]),
273- "--max-dist" , "," .join (str (m ) for m in profile ["max_dist" ]),
274- "--eps-search" , "," .join (str (e ) for e in profile ["eps_search" ]),
275- "--num-runs" , str (profile ["num_runs" ]),
276- "--no-recall" , "--output" , op_dir ,
277- ]
278- if profile .get ("use_flas" ):
279- cmd .append ("--flas" )
280- print ("[task2] running:" , " " .join (cmd ))
281- subprocess .run (cmd , check = True )
282-
283- ops = sorted (Path (op_dir ).glob ("op_*.bin" ))
284- if not ops :
285- sys .exit ("Error: deglib produced no operating-point files for task2." )
304+ op_root = tempfile .mkdtemp (prefix = "deglib_op_" )
286305 sentinel = np .iinfo (np .uint32 ).max
287- for op in ops :
288- n , kk , t_build , t_search , ids , dists = read_op_file (op )
289- # task2 ids are ALREADY 1-based (the binary adds +1 to match test/knns).
290- # No self column (queries are separate from the database). Padding slots
291- # -> 0, the baseline's "missing" marker (never matches a 1-based id).
292- ids = ids .astype (np .int64 )
293- ids = np .where (ids == sentinel , 0 , ids )
294- knns = ids .astype (np .int32 )
295- mobj = re .search (r"op_eps(\d+)_md(\d+)" , op .name )
296- eps_i , md = mobj .group (1 ), mobj .group (2 )
297- eps = int (eps_i ) / 1000.0
298- params = (f"mode={ profile ['mode' ]} ,k_graph={ profile ['k_graph' ]} ,k_ext={ profile ['k_ext' ]} ,"
299- f"flas={ int (bool (profile .get ('use_flas' )))} ,num_runs={ profile ['num_runs' ]} ,"
300- f"eps_search={ eps } ,max_dist={ md } " )
301- # Task 2 is scored on search time: querytime = search, buildtime = one-time build.
302- fn = os .path .join (output_dir , f"deglib_eps{ eps_i } _md{ md } .h5" )
303- store_results (fn , ALGO , dataset , "task2" , dists , knns , t_build , t_search , params )
304- print (f" wrote { fn } buildtime={ t_build :.3f} s querytime={ t_search :.4f} s knns={ knns .shape } " )
306+ for ci , c in enumerate (configs ):
307+ # One binary invocation per config = one graph build + its own sweep.
308+ op_dir = os .path .join (op_root , f"c{ ci } " )
309+ os .makedirs (op_dir , exist_ok = True )
310+ cmd = [
311+ DEGLIB_BIN , "task2" , bin_input , c ["mode" ],
312+ "--threads" , THREADS , "--build-threads" , str (c ["build_threads" ]),
313+ "--k-top" , str (k ), "--k-graph" , str (c ["k_graph" ]),
314+ "--k-ext" , str (c ["k_ext" ]), "--eps-ext" , str (c ["eps_ext" ]),
315+ "--max-dist" , "," .join (str (m ) for m in c ["max_dist" ]),
316+ "--eps-search" , "," .join (str (e ) for e in c ["eps_search" ]),
317+ "--num-runs" , str (c ["num_runs" ]),
318+ "--no-recall" , "--output" , op_dir ,
319+ ]
320+ if c .get ("use_flas" ):
321+ cmd .append ("--flas" )
322+ print (f"[task2] config { ci + 1 } /{ len (configs )} ({ c ['mode' ]} ):" , " " .join (cmd ))
323+ subprocess .run (cmd , check = True )
324+
325+ ops = sorted (Path (op_dir ).glob ("op_*.bin" ))
326+ if not ops :
327+ sys .exit (f"Error: deglib produced no operating-point files for task2 config { ci } ." )
328+ for op in ops :
329+ n , kk , t_build , t_search , ids , dists = read_op_file (op )
330+ # task2 ids are ALREADY 1-based (the binary adds +1 to match test/knns).
331+ # No self column (queries are separate from the database). Padding slots
332+ # -> 0, the baseline's "missing" marker (never matches a 1-based id).
333+ ids = ids .astype (np .int64 )
334+ ids = np .where (ids == sentinel , 0 , ids )
335+ knns = ids .astype (np .int32 )
336+ mobj = re .search (r"op_eps(\d+)_md(\d+)" , op .name )
337+ eps_i , md = mobj .group (1 ), mobj .group (2 )
338+ eps = int (eps_i ) / 1000.0
339+ params = (f"mode={ c ['mode' ]} ,k_graph={ c ['k_graph' ]} ,k_ext={ c ['k_ext' ]} ,"
340+ f"flas={ int (bool (c .get ('use_flas' )))} ,num_runs={ c ['num_runs' ]} ,"
341+ f"eps_search={ eps } ,max_dist={ md } " )
342+ # Task 2 is scored on search time: querytime = search, buildtime = one-time build.
343+ # Config index keeps filenames unique across builds (e.g. mode5 vs mode7).
344+ fn = os .path .join (output_dir , f"deglib_c{ ci } _eps{ eps_i } _md{ md } .h5" )
345+ store_results (fn , ALGO , dataset , "task2" , dists , knns , t_build , t_search , params )
346+ print (f" wrote { fn } buildtime={ t_build :.3f} s querytime={ t_search :.4f} s knns={ knns .shape } " )
305347 finally :
306- if op_dir is not None :
307- shutil .rmtree (op_dir , ignore_errors = True )
348+ if op_root is not None :
349+ shutil .rmtree (op_root , ignore_errors = True )
308350 if tmp :
309351 try :
310352 os .unlink (tmp )
0 commit comments