Skip to content

Commit 1ac4e55

Browse files
fix: grid search runs from raw file and preprocessed file
1 parent 8c551a6 commit 1ac4e55

2 files changed

Lines changed: 1099 additions & 911 deletions

File tree

scripts/optimization/grid_search.py

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,14 @@ def _build_instanexus_command(
260260

261261
def main():
262262
parser = argparse.ArgumentParser(description="Hyperparameter Grid Search for InstaNexus.")
263-
parser.add_argument("--input-csv", required=True, help="Path to input cleaned CSV.")
264-
parser.add_argument("--metadata-json", required=True, help="Path to sample_metadata.json.")
263+
parser.add_argument(
264+
"--input-csv",
265+
required=True,
266+
help="Path to the raw input CSV file. Preprocessing runs automatically if no cleaned.csv exists in --output-dir.",
267+
)
268+
parser.add_argument(
269+
"--metadata-json", required=True, help="Path to sample_metadata.json (required for reference protein lookup)."
270+
)
265271
parser.add_argument("--grid-json", required=True, help="Path to gridsearch_params.json.")
266272
parser.add_argument("--mode", required=True, help="Assembly mode (greedy, dbg_weighted, etc.)")
267273
parser.add_argument("--output-dir", default="outputs/_grid_search", help="Directory to save results.")
@@ -271,16 +277,63 @@ def main():
271277
parser.add_argument("--eval-min-identity", type=float, default=0.8, help="Min identity for evaluation mapping.")
272278
parser.add_argument("--eval-max-mismatches", type=int, default=100, help="Max mismatches for evaluation mapping.")
273279

280+
# Preprocessing options — forwarded to preprocessing.main() only when cleaned.csv does not yet exist in --output-dir; ignored otherwise
281+
parser.add_argument(
282+
"--conf",
283+
type=float,
284+
default=None,
285+
help="Confidence threshold for the preprocessing step (optional). Applied once when building cleaned.csv.",
286+
)
287+
parser.add_argument(
288+
"--fdr",
289+
type=float,
290+
default=None,
291+
help=(
292+
"FDR threshold for the preprocessing step (optional). Note: the grid search also sweeps 'fdr' as a "
293+
"hyperparameter; setting this pre-filters the dataset before the sweep begins."
294+
),
295+
)
296+
parser.add_argument(
297+
"--contaminants-fasta",
298+
type=str,
299+
default=None,
300+
help="Path to contaminants.fasta for the preprocessing step (optional).",
301+
)
302+
274303
args = parser.parse_args()
275304

276305
input_path = Path(args.input_csv)
277306
output_dir = Path(args.output_dir)
278307
output_dir.mkdir(parents=True, exist_ok=True)
279308

280-
logger.info(f"Loading data from {input_path}...")
281-
run_name = input_path.stem.replace("_cleaned", "")
309+
# ------------------------------------------------------------------
310+
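# Preprocessing — runs only when cleaned.csv is absent from output_dir; an existing file is reused as-is, even if --input-csv, --conf or --fdr differ from the run that created it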
311+
# ------------------------------------------------------------------
312+
cleaned_csv_path = output_dir / "cleaned.csv"
313+
314+
if not cleaned_csv_path.exists():
315+
logger.info(f"No cleaned.csv found in {output_dir}. Running preprocessing on {input_path} ...")
316+
try:
317+
preprocessing.main(
318+
input_csv=str(input_path),
319+
metadata_json=args.metadata_json,
320+
contaminants_fasta=args.contaminants_fasta,
321+
chain=args.chain,
322+
reference=False,
323+
conf=args.conf,
324+
fdr=args.fdr,
325+
output_csv_path=str(cleaned_csv_path),
326+
)
327+
except Exception as e:
328+
logger.error(f"Preprocessing failed: {e}")
329+
return
330+
else:
331+
logger.info(f"Found existing cleaned.csv at {cleaned_csv_path}. Skipping preprocessing.")
332+
333+
run_name = input_path.stem
282334

283-
df = pd.read_csv(input_path)
335+
logger.info(f"Loading data from {cleaned_csv_path}...")
336+
df = pd.read_csv(cleaned_csv_path)
284337

285338
try:
286339
meta = helpers.get_sample_metadata(run_name, chain=args.chain, json_path=args.metadata_json)

0 commit comments

Comments
 (0)