fix: recognize 'prediction' column in main() preprocessing entry point

BioGeek · claude · BioGeek · commit b78829cabacf · 2026-04-11T17:43:43.000+02:00
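The V2 handler clean_winnow_rescored() already checks multiple column
name candidates, but main() only checked 'preds' and
'prediction_untokenised'. Winnow outputs use 'prediction', which was
missed, so inputs with only that column fell through to the
"No suitable column found" ValueError. Unified the column detection to
match the V2 candidates list: main() now uses the first of 'preds',
'prediction_untokenised', 'prediction', 'Peptide', 'sequence' that is
present in the DataFrame.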

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/instanexus/preprocessing.py b/src/instanexus/preprocessing.py
@@ -358,10 +358,10 @@ def main(
     if metadata_json is not None and "experiment_name" in df.columns:
         df["protease"] = df["experiment_name"].apply(lambda name: extract_protease(name, proteases))
 
-    if "preds" in df.columns:
-        df["cleaned_preds"] = df["preds"].apply(remove_modifications)
-    elif "prediction_untokenised" in df.columns:
-        df["cleaned_preds"] = df["prediction_untokenised"].apply(remove_modifications)
+    seq_candidates = ["preds", "prediction_untokenised", "prediction", "Peptide", "sequence"]
+    seq_col = next((c for c in seq_candidates if c in df.columns), None)
+    if seq_col is not None:
+        df["cleaned_preds"] = df[seq_col].apply(remove_modifications)
     else:
         raise ValueError("No suitable column found for peptide sequences.")