Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions .github/workflows/cdci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,8 @@ permissions:
contents: read

jobs:
format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: psf/black@stable
- uses: isort/isort-action@v1
lint:
name: Lint with ruff
name: Lint and format with ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand All @@ -25,7 +19,9 @@ jobs:
- name: Install ruff
run: |
pip install ruff
- name: Check formatting with ruff
run: |
ruff format --check .
- name: Lint with ruff
run: |
# fail the build on any violation of the configured ruff lint rules
ruff check .
31 changes: 31 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,35 @@
## Common Commands
- Test: `uv run pytest`
- Lint: `uv run ruff check .`
- Format: `uv run ruff format .`
- Type Check: `uv run mypy --config-file pyproject.toml`

> **Do NOT use Black.** The project formatter is Ruff exclusively.

## Architecture & Design Principles

### Optional flags gate optional features
`--metadata-json-path` is intentionally optional, following the same pattern as `--reference`.
Neither flag is required to run the core assembly pipeline.

| Flag | Without it | With it |
|---|---|---|
| `--metadata-json-path` | Minimal mode: all peptides treated as a single pool, no protease splitting, no chain filtering, no contaminant removal | Full mode: protease assignment, chain filtering, contaminant removal |
| `--reference` | No reference mapping or coverage statistics | Reference-based validation and assembly quality metrics |

**Do NOT make `--metadata-json-path` required.** This design exists to support users
starting from any de novo sequencing tool output, not just InstaNovo.

### CSV input assumptions in minimal mode
When `--metadata-json-path` is absent, **do not assume any InstaNovo-specific columns**
(including `experiment_name`). The only guaranteed columns are the peptide sequence
and a confidence score. All protease-based and chain-based logic must be gated
behind a metadata presence check.

### Where to find the gating pattern
The `--reference` flag handling in `src/instanexus/main.py` is the canonical example
of how optional features should be gated. Use the same `if metadata is not None:`
pattern when implementing metadata-optional behavior. Do not refactor unrelated code.

---
45 changes: 45 additions & 0 deletions environment.linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
name: instanexus
channels:
- conda-forge
- bioconda
- defaults

dependencies:
# core runtime dependencies
- python=3.11
- scikit-learn
- biopython=1.85
- pandas=2.3.1
- upsetplot
- tqdm=4.67.1
- seaborn=0.13.2
- matplotlib-base
- plotly=6.2.0
- logomaker=0.8
- networkx
- mmseqs2
- sbl::clustalomega
- gawk
- wget

# notebook
- ipykernel
- nbformat
- notebook

# development tools
- black
- pytest
- coverage
- mypy
- pre-commit

# packaging
- pip
- pip:
- build
- twine
- hatchling
- wheel
- sphinx
- kaleido==0.2.1
46 changes: 46 additions & 0 deletions environment.osx-arm64.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
name: instanexus
channels:
- conda-forge
- bioconda
- defaults

dependencies:
# core runtime dependencies
- python=3.11
- biopython=1.85
- pandas=2.3.1
- scikit-learn
- upsetplot
- tqdm=4.67.1
- seaborn=0.13.2
- matplotlib-base
- plotly=6.2.0
- logomaker=0.8
- networkx
- mmseqs2
- sbl::clustalomega
- gawk
- wget

# notebook
- ipykernel
- nbformat
- notebook

# development tools
- black
- ruff
- pytest
- coverage
- mypy
- pre-commit

# packaging
- pip
- pip:
- build
- isort
- twine
- hatchling
- wheel
- sphinx
2 changes: 1 addition & 1 deletion src/instanexus/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ def run(self, sequences: List[str], df_full: Optional[pd.DataFrame] = None):
def main(
input_csv_path: str,
output_scaffolds_path: str,
metadata_json_path: str,
metadata_json_path: Optional[str],
assembly_mode: str,
kmer_size: int,
min_overlap: int,
Expand Down
8 changes: 4 additions & 4 deletions src/instanexus/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ def cli():
parser.add_argument(
"--metadata-json-path",
type=str,
required=True,
help="Path to the sample_metadata.json file (required by preprocessing and assembly).",
default=None,
help="Path to the sample_metadata.json file (optional; enables protease splitting, chain filtering, and contaminant removal).",
)
parser.add_argument(
"--contaminants-fasta-path",
type=str,
required=True,
help="Path to the contaminants.fasta file (required by preprocessing).",
default=None,
help="Path to the contaminants.fasta file (optional; enables contaminant filtering when metadata is also provided).",
)
parser.add_argument(
"--chain",
Expand Down
46 changes: 23 additions & 23 deletions src/instanexus/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import logging
import re
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -328,8 +329,8 @@ def add_quantification_data(df_main, run_name, inputs_folder="inputs"):

def main(
input_csv: str,
metadata_json: str,
contaminants_fasta: str,
metadata_json: Optional[str],
contaminants_fasta: Optional[str],
chain: str,
reference: bool,
conf: float,
Expand All @@ -341,21 +342,20 @@ def main(

run = input_csv_path.stem # stem gives the filename without suffix

# load metadata
if chain:
meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
else:
meta = get_sample_metadata(run, json_path=metadata_json)

proteases = meta["proteases"]

if reference:
protein = meta["protein"]
protein_norm = normalize_sequence(protein)
protein_norm = None
if metadata_json is not None:
if chain:
meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
else:
meta = get_sample_metadata(run, json_path=metadata_json)
proteases = meta["proteases"]
if reference:
protein = meta["protein"]
protein_norm = normalize_sequence(protein)

df = pd.read_csv(input_csv_path)

if "experiment_name" in df.columns:
if metadata_json is not None and "experiment_name" in df.columns:
df["protease"] = df["experiment_name"].apply(lambda name: extract_protease(name, proteases))

if "preds" in df.columns:
Expand All @@ -367,12 +367,12 @@ def main(

df = clean_dataframe(df)

# filtering contaminants
cleaned_psms = df["cleaned_preds"].tolist()
filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
df = df[df["cleaned_preds"].isin(filtered_psms)]
if contaminants_fasta is not None:
cleaned_psms = df["cleaned_preds"].tolist()
filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
df = df[df["cleaned_preds"].isin(filtered_psms)]

if reference:
if reference and protein_norm is not None:
df["mapped"] = df["cleaned_preds"].apply(lambda x: True if x in protein_norm else False)

if conf is not None:
Expand Down Expand Up @@ -440,14 +440,14 @@ def cli():
parser.add_argument(
"--metadata-json",
type=str,
required=True,
help="Path to the sample_metadata.json file.",
default=None,
help="Path to the sample_metadata.json file (optional).",
)
parser.add_argument(
"--contaminants-fasta",
type=str,
required=True,
help="Path to the contaminants.fasta file.",
default=None,
help="Path to the contaminants.fasta file (optional).",
)
parser.add_argument(
"--output-csv-path",
Expand Down
38 changes: 37 additions & 1 deletion tests/test_preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#!/usr/bin/env python3

from pathlib import Path

from instanexus.preprocessing import remove_modifications
import pandas as pd

from instanexus.preprocessing import remove_modifications, main as preprocessing_main


def test_remove_modifications():
Expand All @@ -15,3 +18,36 @@ def test_remove_modifications():
assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
assert remove_modifications("AI BCD") == "AL BCD"
assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"


def test_preprocessing_no_metadata(tmp_path: Path) -> None:
    """Check that preprocessing runs with neither metadata nor a contaminants file.

    A minimal CSV holding only ``preds`` and ``log_probs`` is written to a
    temporary directory and passed to ``preprocessing_main`` with
    ``metadata_json`` and ``contaminants_fasta`` set to ``None``. The output
    file must exist, contain the cleaned peptides, and carry neither a
    ``protease`` nor a ``mapped`` column.
    """
    source_csv = tmp_path / "sample.csv"
    cleaned_csv = tmp_path / "out" / "cleaned.csv"

    # Only a peptide column and a score column: no InstaNovo-specific fields.
    pd.DataFrame(
        {
            "preds": ["ACDEF", "GHIKL", "MNPQR"],
            "log_probs": [-0.1, -0.2, -0.3],
        }
    ).to_csv(source_csv, index=False)

    preprocessing_main(
        input_csv=str(source_csv),
        metadata_json=None,
        contaminants_fasta=None,
        chain="",
        reference=False,
        conf=None,
        fdr=None,
        output_csv_path=str(cleaned_csv),
    )

    assert cleaned_csv.exists()

    written = pd.read_csv(cleaned_csv)
    written_columns = written.columns
    assert "cleaned_preds" in written_columns
    # I is normalized to L during remove_modifications
    assert set(written["cleaned_preds"]) == {"ACDEF", "GHLKL", "MNPQR"}
    # Columns that depend on metadata or a reference must be absent.
    assert "protease" not in written_columns
    assert "mapped" not in written_columns
Loading