Merge pull request #26 from Multiomics-Analytics-Group/25-feature-make---metadata-json-path-optional-analogous-to---reference-flag

marcoreverenna · web-flow · commit a6dc65e6932a · 2026-04-06T12:40:31.000+02:00
Feature #25: make `--metadata-json-path` optional, analogous to the `--reference` flag
diff --git a/.github/workflows/cdci.yml b/.github/workflows/cdci.yml
@@ -7,14 +7,8 @@ permissions:
   contents: read
 
 jobs:
-  format:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: psf/black@stable
-      - uses: isort/isort-action@v1
   lint:
-    name: Lint with ruff
+    name: Lint and format with ruff
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -25,7 +19,9 @@ jobs:
       - name: Install ruff
         run: |
           pip install ruff
+      - name: Check formatting with ruff
+        run: |
+          ruff format --check .
       - name: Lint with ruff
         run: |
-          # stop the build if there are Python syntax errors or undefined names
           ruff check .
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -15,4 +15,35 @@
 ## Common Commands
 - Test: `uv run pytest`
 - Lint: `uv run ruff check .`
+- Format: `uv run ruff format .`
 - Type Check: `uv run mypy --config-file pyproject.toml`
+
+> **Do NOT use Black.** The project formatter is Ruff exclusively.
+
+## Architecture & Design Principles
+
+### Optional flags gate optional features
+`--metadata-json-path` is intentionally optional, following the same pattern as `--reference`.
+Neither flag is required to run the core assembly pipeline.
+
+| Flag | Without it | With it |
+|---|---|---|
+| `--metadata-json-path` | Minimal mode: all peptides treated as a single pool, no protease splitting, no chain filtering, no contaminant removal | Full mode: protease assignment, chain filtering, contaminant removal |
+| `--reference` | No reference mapping or coverage statistics | Reference-based validation and assembly quality metrics |
+
+**Do NOT make `--metadata-json-path` required.** This design exists to support users
+starting from any de novo sequencing tool output, not just InstaNovo.
+
+### CSV input assumptions in minimal mode
+When `--metadata-json-path` is absent, **do not assume any InstaNovo-specific columns**
+(including `experiment_name`). The only guaranteed columns are the peptide sequence
+and a confidence score. All protease-based and chain-based logic must be gated
+behind a metadata presence check.
+
+### Where to find the gating pattern
+The `--reference` flag handling in `src/instanexus/main.py` is the canonical example
+of how optional features should be gated. Use the same `if metadata is not None:`
+pattern when implementing metadata-optional behavior. Do not refactor unrelated code.
+```
+
+---
diff --git a/environment.linux.yml b/environment.linux.yml
@@ -0,0 +1,45 @@
+name: instanexus
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  # core runtime dependencies
+  - python=3.11
+  - scikit-learn
+  - biopython=1.85
+  - pandas=2.3.1
+  - upsetplot
+  - tqdm=4.67.1
+  - seaborn=0.13.2
+  - matplotlib-base
+  - plotly=6.2.0
+  - logomaker=0.8
+  - networkx
+  - mmseqs2
+  - sbl::clustalomega
+  - gawk
+  - wget
+
+  # notebook
+  - ipykernel
+  - nbformat
+  - notebook
+
+  # development tools
+  - black
+  - pytest
+  - coverage
+  - mypy
+  - pre-commit
+
+  # packaging
+  - pip
+  - pip:
+      - build
+      - twine
+      - hatchling
+      - wheel
+      - sphinx
+      - kaleido==0.2.1
diff --git a/environment.osx-arm64.yaml b/environment.osx-arm64.yaml
@@ -0,0 +1,46 @@
+name: instanexus
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  # core runtime dependencies
+  - python=3.11
+  - biopython=1.85
+  - pandas=2.3.1
+  - scikit-learn
+  - upsetplot
+  - tqdm=4.67.1
+  - seaborn=0.13.2
+  - matplotlib-base
+  - plotly=6.2.0
+  - logomaker=0.8
+  - networkx
+  - mmseqs2
+  - sbl::clustalomega
+  - gawk
+  - wget
+
+  # notebook
+  - ipykernel
+  - nbformat
+  - notebook
+
+  # development tools
+  - black
+  - ruff
+  - pytest
+  - coverage
+  - mypy
+  - pre-commit
+
+  # packaging
+  - pip
+  - pip:
+      - build
+      - isort
+      - twine
+      - hatchling
+      - wheel
+      - sphinx
diff --git a/src/instanexus/assembly.py b/src/instanexus/assembly.py
@@ -1286,7 +1286,7 @@ def run(self, sequences: List[str], df_full: Optional[pd.DataFrame] = None):
 def main(
     input_csv_path: str,
     output_scaffolds_path: str,
-    metadata_json_path: str,
+    metadata_json_path: Optional[str],
     assembly_mode: str,
     kmer_size: int,
     min_overlap: int,
diff --git a/src/instanexus/main.py b/src/instanexus/main.py
@@ -57,14 +57,14 @@ def cli():
     parser.add_argument(
         "--metadata-json-path",
         type=str,
-        required=True,
-        help="Path to the sample_metadata.json file (required by preprocessing and assembly).",
+        default=None,
+        help="Path to the sample_metadata.json file (optional; enables protease splitting, chain filtering, and contaminant removal).",
     )
     parser.add_argument(
         "--contaminants-fasta-path",
         type=str,
-        required=True,
-        help="Path to the contaminants.fasta file (required by preprocessing).",
+        default=None,
+        help="Path to the contaminants.fasta file (optional; enables contaminant filtering when metadata is also provided).",
     )
     parser.add_argument(
         "--chain",
diff --git a/src/instanexus/preprocessing.py b/src/instanexus/preprocessing.py
@@ -25,6 +25,7 @@
 import logging
 import re
 from pathlib import Path
+from typing import Optional
 
 import numpy as np
 import pandas as pd
@@ -328,8 +329,8 @@ def add_quantification_data(df_main, run_name, inputs_folder="inputs"):
 
 def main(
     input_csv: str,
-    metadata_json: str,
-    contaminants_fasta: str,
+    metadata_json: Optional[str],
+    contaminants_fasta: Optional[str],
     chain: str,
     reference: bool,
     conf: float,
@@ -341,21 +342,20 @@ def main(
 
     run = input_csv_path.stem  # stem gives the filename without suffix
 
-    # load metadata
-    if chain:
-        meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
-    else:
-        meta = get_sample_metadata(run, json_path=metadata_json)
-
-    proteases = meta["proteases"]
-
-    if reference:
-        protein = meta["protein"]
-        protein_norm = normalize_sequence(protein)
+    protein_norm = None
+    if metadata_json is not None:
+        if chain:
+            meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
+        else:
+            meta = get_sample_metadata(run, json_path=metadata_json)
+        proteases = meta["proteases"]
+        if reference:
+            protein = meta["protein"]
+            protein_norm = normalize_sequence(protein)
 
     df = pd.read_csv(input_csv_path)
 
-    if "experiment_name" in df.columns:
+    if metadata_json is not None and "experiment_name" in df.columns:
         df["protease"] = df["experiment_name"].apply(lambda name: extract_protease(name, proteases))
 
     if "preds" in df.columns:
@@ -367,12 +367,12 @@ def main(
 
     df = clean_dataframe(df)
 
-    # filtering contaminants
-    cleaned_psms = df["cleaned_preds"].tolist()
-    filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
-    df = df[df["cleaned_preds"].isin(filtered_psms)]
+    if contaminants_fasta is not None:
+        cleaned_psms = df["cleaned_preds"].tolist()
+        filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
+        df = df[df["cleaned_preds"].isin(filtered_psms)]
 
-    if reference:
+    if reference and protein_norm is not None:
         df["mapped"] = df["cleaned_preds"].apply(lambda x: True if x in protein_norm else False)
 
     if conf is not None:
@@ -440,14 +440,14 @@ def cli():
     parser.add_argument(
         "--metadata-json",
         type=str,
-        required=True,
-        help="Path to the sample_metadata.json file.",
+        default=None,
+        help="Path to the sample_metadata.json file (optional).",
     )
     parser.add_argument(
         "--contaminants-fasta",
         type=str,
-        required=True,
-        help="Path to the contaminants.fasta file.",
+        default=None,
+        help="Path to the contaminants.fasta file (optional).",
     )
     parser.add_argument(
         "--output-csv-path",
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -1,7 +1,10 @@
 #!/usr/bin/env python3
 
+from pathlib import Path
 
-from instanexus.preprocessing import remove_modifications
+import pandas as pd
+
+from instanexus.preprocessing import remove_modifications, main as preprocessing_main
 
 
 def test_remove_modifications():
@@ -15,3 +18,36 @@ def test_remove_modifications():
     assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
     assert remove_modifications("AI BCD") == "AL BCD"
     assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
+
+
+def test_preprocessing_no_metadata(tmp_path: Path) -> None:
+    """Preprocessing must succeed when metadata_json and contaminants_fasta are both None."""
+    input_csv = tmp_path / "sample.csv"
+    output_csv = tmp_path / "out" / "cleaned.csv"
+
+    df = pd.DataFrame(
+        {
+            "preds": ["ACDEF", "GHIKL", "MNPQR"],
+            "log_probs": [-0.1, -0.2, -0.3],
+        }
+    )
+    df.to_csv(input_csv, index=False)
+
+    preprocessing_main(
+        input_csv=str(input_csv),
+        metadata_json=None,
+        contaminants_fasta=None,
+        chain="",
+        reference=False,
+        conf=None,
+        fdr=None,
+        output_csv_path=str(output_csv),
+    )
+
+    assert output_csv.exists()
+    result = pd.read_csv(output_csv)
+    assert "cleaned_preds" in result.columns
+    # I is normalized to L during remove_modifications
+    assert set(result["cleaned_preds"]) == {"ACDEF", "GHLKL", "MNPQR"}
+    assert "protease" not in result.columns
+    assert "mapped" not in result.columns