Skip to content

Commit a6dc65e

Browse files
Merge pull request #26 from Multiomics-Analytics-Group/25-feature-make---metadata-json-path-optional-analogous-to---reference-flag
25 feature make metadata json path optional analogous to reference flag
2 parents ede9318 + b06124b commit a6dc65e

8 files changed

Lines changed: 191 additions & 37 deletions

File tree

.github/workflows/cdci.yml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,8 @@ permissions:
77
contents: read
88

99
jobs:
10-
format:
11-
runs-on: ubuntu-latest
12-
steps:
13-
- uses: actions/checkout@v4
14-
- uses: psf/black@stable
15-
- uses: isort/isort-action@v1
1610
lint:
17-
name: Lint with ruff
11+
name: Lint and format with ruff
1812
runs-on: ubuntu-latest
1913
steps:
2014
- uses: actions/checkout@v4
@@ -25,7 +19,9 @@ jobs:
2519
- name: Install ruff
2620
run: |
2721
pip install ruff
22+
- name: Check formatting with ruff
23+
run: |
24+
ruff format --check .
2825
- name: Lint with ruff
2926
run: |
30-
# stop the build if there are Python syntax errors or undefined names
3127
ruff check .

CLAUDE.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,35 @@
1515
## Common Commands
1616
- Test: `uv run pytest`
1717
- Lint: `uv run ruff check .`
18+
- Format: `uv run ruff format .`
1819
- Type Check: `uv run mypy --config-file pyproject.toml`
20+
21+
> **Do NOT use Black.** The project formatter is Ruff exclusively.
22+
23+
## Architecture & Design Principles
24+
25+
### Optional flags gate optional features
26+
`--metadata-json-path` is intentionally optional, following the same pattern as `--reference`.
27+
Neither flag is required to run the core assembly pipeline.
28+
29+
| Flag | Without it | With it |
30+
|---|---|---|
31+
| `--metadata-json-path` | Minimal mode: all peptides treated as a single pool, no protease splitting, no chain filtering, no contaminant removal | Full mode: protease assignment, chain filtering, and contaminant removal (the latter also requires `--contaminants-fasta-path`) |
32+
| `--reference` | No reference mapping or coverage statistics | Reference-based validation and assembly quality metrics |
33+
34+
**Do NOT make `--metadata-json-path` required.** This design exists to support users
35+
starting from any de novo sequencing tool output, not just InstaNovo.
36+
37+
### CSV input assumptions in minimal mode
38+
When `--metadata-json-path` is absent, **do not assume any InstaNovo-specific columns**
39+
(including `experiment_name`). The only guaranteed columns are the peptide sequence
40+
and a confidence score. All protease-based and chain-based logic must be gated
41+
behind a metadata presence check.
42+
43+
### Where to find the gating pattern
44+
The `--reference` flag handling in `src/instanexus/main.py` is the canonical example
45+
of how optional features should be gated. Use the same `if metadata_json is not None:`
46+
pattern when implementing metadata-optional behavior. Do not refactor unrelated code.
47+
48+
49+
---

environment.linux.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
name: instanexus
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
7+
dependencies:
8+
# core runtime dependencies
9+
- python=3.11
10+
- scikit-learn
11+
- biopython=1.85
12+
- pandas=2.3.1
13+
- upsetplot
14+
- tqdm=4.67.1
15+
- seaborn=0.13.2
16+
- matplotlib-base
17+
- plotly=6.2.0
18+
- logomaker=0.8
19+
- networkx
20+
- mmseqs2
21+
- sbl::clustalomega
22+
- gawk
23+
- wget
24+
25+
# notebook
26+
- ipykernel
27+
- nbformat
28+
- notebook
29+
30+
# development tools
31+
- black
32+
- pytest
33+
- coverage
34+
- mypy
35+
- pre-commit
36+
37+
# packaging
38+
- pip
39+
- pip:
40+
- build
41+
- twine
42+
- hatchling
43+
- wheel
44+
- sphinx
45+
- kaleido==0.2.1

environment.osx-arm64.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
name: instanexus
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
7+
dependencies:
8+
# core runtime dependencies
9+
- python=3.11
10+
- biopython=1.85
11+
- pandas=2.3.1
12+
- scikit-learn
13+
- upsetplot
14+
- tqdm=4.67.1
15+
- seaborn=0.13.2
16+
- matplotlib-base
17+
- plotly=6.2.0
18+
- logomaker=0.8
19+
- networkx
20+
- mmseqs2
21+
- sbl::clustalomega
22+
- gawk
23+
- wget
24+
25+
# notebook
26+
- ipykernel
27+
- nbformat
28+
- notebook
29+
30+
# development tools
31+
- black
32+
- ruff
33+
- pytest
34+
- coverage
35+
- mypy
36+
- pre-commit
37+
38+
# packaging
39+
- pip
40+
- pip:
41+
- build
42+
- isort
43+
- twine
44+
- hatchling
45+
- wheel
46+
- sphinx

src/instanexus/assembly.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1286,7 +1286,7 @@ def run(self, sequences: List[str], df_full: Optional[pd.DataFrame] = None):
12861286
def main(
12871287
input_csv_path: str,
12881288
output_scaffolds_path: str,
1289-
metadata_json_path: str,
1289+
metadata_json_path: Optional[str],
12901290
assembly_mode: str,
12911291
kmer_size: int,
12921292
min_overlap: int,

src/instanexus/main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,14 @@ def cli():
5757
parser.add_argument(
5858
"--metadata-json-path",
5959
type=str,
60-
required=True,
61-
help="Path to the sample_metadata.json file (required by preprocessing and assembly).",
60+
default=None,
61+
help="Path to the sample_metadata.json file (optional; enables protease splitting, chain filtering, and contaminant removal).",
6262
)
6363
parser.add_argument(
6464
"--contaminants-fasta-path",
6565
type=str,
66-
required=True,
67-
help="Path to the contaminants.fasta file (required by preprocessing).",
66+
default=None,
67+
help="Path to the contaminants.fasta file (optional; enables contaminant filtering when metadata is also provided).",
6868
)
6969
parser.add_argument(
7070
"--chain",

src/instanexus/preprocessing.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import logging
2626
import re
2727
from pathlib import Path
28+
from typing import Optional
2829

2930
import numpy as np
3031
import pandas as pd
@@ -328,8 +329,8 @@ def add_quantification_data(df_main, run_name, inputs_folder="inputs"):
328329

329330
def main(
330331
input_csv: str,
331-
metadata_json: str,
332-
contaminants_fasta: str,
332+
metadata_json: Optional[str],
333+
contaminants_fasta: Optional[str],
333334
chain: str,
334335
reference: bool,
335336
conf: float,
@@ -341,21 +342,20 @@ def main(
341342

342343
run = input_csv_path.stem # stem gives the filename without suffix
343344

344-
# load metadata
345-
if chain:
346-
meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
347-
else:
348-
meta = get_sample_metadata(run, json_path=metadata_json)
349-
350-
proteases = meta["proteases"]
351-
352-
if reference:
353-
protein = meta["protein"]
354-
protein_norm = normalize_sequence(protein)
345+
protein_norm = None
346+
if metadata_json is not None:
347+
if chain:
348+
meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
349+
else:
350+
meta = get_sample_metadata(run, json_path=metadata_json)
351+
proteases = meta["proteases"]
352+
if reference:
353+
protein = meta["protein"]
354+
protein_norm = normalize_sequence(protein)
355355

356356
df = pd.read_csv(input_csv_path)
357357

358-
if "experiment_name" in df.columns:
358+
if metadata_json is not None and "experiment_name" in df.columns:
359359
df["protease"] = df["experiment_name"].apply(lambda name: extract_protease(name, proteases))
360360

361361
if "preds" in df.columns:
@@ -367,12 +367,12 @@ def main(
367367

368368
df = clean_dataframe(df)
369369

370-
# filtering contaminants
371-
cleaned_psms = df["cleaned_preds"].tolist()
372-
filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
373-
df = df[df["cleaned_preds"].isin(filtered_psms)]
370+
if contaminants_fasta is not None:
371+
cleaned_psms = df["cleaned_preds"].tolist()
372+
filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)
373+
df = df[df["cleaned_preds"].isin(filtered_psms)]
374374

375-
if reference:
375+
if reference and protein_norm is not None:
376376
df["mapped"] = df["cleaned_preds"].apply(lambda x: True if x in protein_norm else False)
377377

378378
if conf is not None:
@@ -440,14 +440,14 @@ def cli():
440440
parser.add_argument(
441441
"--metadata-json",
442442
type=str,
443-
required=True,
444-
help="Path to the sample_metadata.json file.",
443+
default=None,
444+
help="Path to the sample_metadata.json file (optional).",
445445
)
446446
parser.add_argument(
447447
"--contaminants-fasta",
448448
type=str,
449-
required=True,
450-
help="Path to the contaminants.fasta file.",
449+
default=None,
450+
help="Path to the contaminants.fasta file (optional).",
451451
)
452452
parser.add_argument(
453453
"--output-csv-path",

tests/test_preprocessing.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#!/usr/bin/env python3
22

3+
from pathlib import Path
34

4-
from instanexus.preprocessing import remove_modifications
5+
import pandas as pd
6+
7+
from instanexus.preprocessing import remove_modifications, main as preprocessing_main
58

69

710
def test_remove_modifications():
@@ -15,3 +18,36 @@ def test_remove_modifications():
1518
assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
1619
assert remove_modifications("AI BCD") == "AL BCD"
1720
assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
21+
22+
23+
def test_preprocessing_no_metadata(tmp_path: Path) -> None:
24+
"""Preprocessing must succeed when metadata_json and contaminants_fasta are both None."""
25+
input_csv = tmp_path / "sample.csv"
26+
output_csv = tmp_path / "out" / "cleaned.csv"
27+
28+
df = pd.DataFrame(
29+
{
30+
"preds": ["ACDEF", "GHIKL", "MNPQR"],
31+
"log_probs": [-0.1, -0.2, -0.3],
32+
}
33+
)
34+
df.to_csv(input_csv, index=False)
35+
36+
preprocessing_main(
37+
input_csv=str(input_csv),
38+
metadata_json=None,
39+
contaminants_fasta=None,
40+
chain="",
41+
reference=False,
42+
conf=None,
43+
fdr=None,
44+
output_csv_path=str(output_csv),
45+
)
46+
47+
assert output_csv.exists()
48+
result = pd.read_csv(output_csv)
49+
assert "cleaned_preds" in result.columns
50+
# I is normalized to L during remove_modifications
51+
assert set(result["cleaned_preds"]) == {"ACDEF", "GHLKL", "MNPQR"}
52+
assert "protease" not in result.columns
53+
assert "mapped" not in result.columns

0 commit comments

Comments
 (0)