101101 ],
102102}
103103
# Fallback tables: vector dimensionality -> profile key.
#
# The final leaderboard runs on an UNDISCLOSED dataset: same model and similar
# size as the dev set, but a different name and more queries (see the challenge
# page). An exact dataset_name match is therefore NOT guaranteed at evaluation
# time. When the name is unknown, the tuned profile is picked by vector
# dimensionality instead, which uniquely identifies the data family
# (BGE-M3 = 1024-d, Llama-3 = 128-d).
#
# The 384-d gooaq spot-check intentionally has no entry: CI always supplies its
# config name, so an unknown 384-d set still fails fast.
TASK1_DIM_FALLBACK = {
    1024: "wikipedia",  # BGE-M3 family; eval ~ 6.4M vectors
}
TASK2_DIM_FALLBACK = {
    128: "llama-dev",  # Llama-3 family
}
113+
104114
105115def load_task_config (task_description_path ):
106116 """Load task configuration from a config.json file."""
@@ -204,14 +214,31 @@ def read_op_file(path):
204214 return n , k , float (bt ), float (et ), ids , dists
205215
206216
207- def _profile_or_die (profiles , dataset , task ):
208- profile = profiles .get (dataset )
209- if profile is None :
210- sys .exit (
211- f"Error: no deglib parameter profile for { task } dataset { dataset !r} . "
212- f"Known: { sorted (profiles )} . Add a profile before submitting."
213- )
214- return profile
def _train_dims(input_path, data_key):
    """Return the dimensionality of the database vectors in an HDF5 file.

    Opens ``input_path`` read-only, resolves ``data_key`` through
    ``get_h5_item`` and returns the second entry of that item's ``shape`` as
    an ``int``. Only ``.shape`` is consulted, never the vector data itself, so
    (assuming the item is an h5py dataset) this is a metadata-only lookup that
    stays instant even on a 14 GB compressed/chunked input.
    """
    with h5py.File(input_path, "r") as h5_file:
        shape = get_h5_item(h5_file, data_key).shape
        # Index 1 is taken as the per-vector width; this presumes a
        # (num_vectors, dim) layout and raises IndexError for 1-D items.
        n_dims = int(shape[1])
    return n_dims
222+
223+
def _resolve_profile(profiles, dim_fallback, dataset, task, dims):
    """Pick the tuned parameter profile for a dataset, or exit with an error.

    Resolution order:

    1. An exact ``dataset`` name match in ``profiles`` (known dev and
       spot-check sets).
    2. Otherwise the profile named by ``dim_fallback[dims]``, so that the
       undisclosed evaluation dataset (same family, different name) still
       maps to the right tuned profile.

    Args:
        profiles: Mapping of profile name -> profile; the matching value is
            returned unchanged.
        dim_fallback: Mapping of vector dimensionality -> profile name.
        dataset: Dataset name to look up.
        task: Task label, used only in the log and error messages.
        dims: Vector dimensionality of the input data.

    Returns:
        The selected entry of ``profiles``.

    Raises:
        SystemExit: If neither the name nor the dimensionality resolves to a
            profile. Failing fast is preferred over guessing bad parameters.
            A fallback entry that names a profile missing from ``profiles``
            is reported explicitly, since that is a table misconfiguration
            rather than an unknown dataset.
    """
    if dataset in profiles:
        print(f"[{task}] profile: exact match for dataset {dataset!r} (dim={dims})")
        return profiles[dataset]

    key = dim_fallback.get(dims)
    if key is not None and key in profiles:
        print(
            f"[{task}] profile: dataset {dataset!r} is unknown (likely the "
            f"undisclosed eval set) — falling back by dim={dims} to the {key!r} profile"
        )
        return profiles[key]

    if key is not None:
        # The fallback table has an entry for this dimensionality, but it
        # points at a profile that does not exist. Say so, instead of listing
        # this very dimensionality as an available fallback.
        sys.exit(
            f"Error: no deglib profile for {task} dataset {dataset!r} (dim={dims}): "
            f"the dim fallback names profile {key!r}, which is not defined. "
            f"Known names: {sorted(profiles)}. "
            f"Fix the fallback table or add the profile before submitting."
        )

    sys.exit(
        f"Error: no deglib profile for {task} dataset {dataset!r} (dim={dims}). "
        f"Known names: {sorted(profiles)}; dim fallbacks: {sorted(dim_fallback)}. "
        f"Add a profile before submitting."
    )
215242
216243
217244def _require_binary ():
@@ -223,7 +250,8 @@ def _require_binary():
223250def run_task1 (input_path , cfg , output_dir ):
224251 dataset = cfg ["dataset_name" ]
225252 k = int (cfg .get ("k" , 15 ))
226- configs = _profile_or_die (TASK1_PROFILES , dataset , "task1" )
253+ dims = _train_dims (input_path , cfg .get ("data" , "train" ))
254+ configs = _resolve_profile (TASK1_PROFILES , TASK1_DIM_FALLBACK , dataset , "task1" , dims )
227255 _require_binary ()
228256 print (f"[task1] dataset={ dataset } : { len (configs )} config(s) / build(s)" )
229257
@@ -294,7 +322,8 @@ def run_task2(input_path, cfg, output_dir):
294322 dataset = cfg ["dataset_name" ]
295323 k = int (cfg .get ("k" , 30 ))
296324 queries_key = cfg .get ("queries" , "test/queries" )
297- configs = _profile_or_die (TASK2_PROFILES , dataset , "task2" )
325+ dims = _train_dims (input_path , cfg .get ("data" , "train" ))
326+ configs = _resolve_profile (TASK2_PROFILES , TASK2_DIM_FALLBACK , dataset , "task2" , dims )
298327 _require_binary ()
299328 print (f"[task2] dataset={ dataset } : { len (configs )} config(s) / build(s)" )
300329
0 commit comments