@@ -260,8 +260,14 @@ def _build_instanexus_command(
260260
261261def main ():
262262 parser = argparse .ArgumentParser (description = "Hyperparameter Grid Search for InstaNexus." )
263- parser .add_argument ("--input-csv" , required = True , help = "Path to input cleaned CSV." )
264- parser .add_argument ("--metadata-json" , required = True , help = "Path to sample_metadata.json." )
263+ parser .add_argument (
264+ "--input-csv" ,
265+ required = True ,
266+ help = "Path to the raw input CSV file. Preprocessing runs automatically if no cleaned.csv exists in --output-dir." ,
267+ )
268+ parser .add_argument (
269+ "--metadata-json" , required = True , help = "Path to sample_metadata.json (required for reference protein lookup)."
270+ )
265271 parser .add_argument ("--grid-json" , required = True , help = "Path to gridsearch_params.json." )
266272 parser .add_argument ("--mode" , required = True , help = "Assembly mode (greedy, dbg_weighted, etc.)" )
267273 parser .add_argument ("--output-dir" , default = "outputs/_grid_search" , help = "Directory to save results." )
@@ -271,16 +277,63 @@ def main():
271277 parser .add_argument ("--eval-min-identity" , type = float , default = 0.8 , help = "Min identity for evaluation mapping." )
272278 parser .add_argument ("--eval-max-mismatches" , type = int , default = 100 , help = "Max mismatches for evaluation mapping." )
273279
280+ # Preprocessing knobs — applied once when cleaned.csv does not yet exist
281+ parser .add_argument (
282+ "--conf" ,
283+ type = float ,
284+ default = None ,
285+ help = "Confidence threshold for the preprocessing step (optional). Applied once when building cleaned.csv." ,
286+ )
287+ parser .add_argument (
288+ "--fdr" ,
289+ type = float ,
290+ default = None ,
291+ help = (
292+ "FDR threshold for the preprocessing step (optional). Note: the grid search also sweeps 'fdr' as a "
293+ "hyperparameter; setting this pre-filters the dataset before the sweep begins."
294+ ),
295+ )
296+ parser .add_argument (
297+ "--contaminants-fasta" ,
298+ type = str ,
299+ default = None ,
300+ help = "Path to contaminants.fasta for the preprocessing step (optional)." ,
301+ )
302+
274303 args = parser .parse_args ()
275304
276305 input_path = Path (args .input_csv )
277306 output_dir = Path (args .output_dir )
278307 output_dir .mkdir (parents = True , exist_ok = True )
279308
280- logger .info (f"Loading data from { input_path } ..." )
281- run_name = input_path .stem .replace ("_cleaned" , "" )
309+ # ------------------------------------------------------------------
310+ # Preprocessing — runs once; skipped when cleaned.csv already exists
311+ # ------------------------------------------------------------------
312+ cleaned_csv_path = output_dir / "cleaned.csv"
313+
314+ if not cleaned_csv_path .exists ():
315+ logger .info (f"No cleaned.csv found in { output_dir } . Running preprocessing on { input_path } ..." )
316+ try :
317+ preprocessing .main (
318+ input_csv = str (input_path ),
319+ metadata_json = args .metadata_json ,
320+ contaminants_fasta = args .contaminants_fasta ,
321+ chain = args .chain ,
322+ reference = False ,
323+ conf = args .conf ,
324+ fdr = args .fdr ,
325+ output_csv_path = str (cleaned_csv_path ),
326+ )
327+ except Exception as e :
328+ logger .error (f"Preprocessing failed: { e } " )
329+ return
330+ else :
331+ logger .info (f"Found existing cleaned.csv at { cleaned_csv_path } . Skipping preprocessing." )
332+
333+ run_name = input_path .stem
282334
283- df = pd .read_csv (input_path )
335+ logger .info (f"Loading data from { cleaned_csv_path } ..." )
336+ df = pd .read_csv (cleaned_csv_path )
284337
285338 try :
286339 meta = helpers .get_sample_metadata (run_name , chain = args .chain , json_path = args .metadata_json )
0 commit comments