Skip to content

Commit 8c551a6

Browse files
Merge pull request #31 from Multiomics-Analytics-Group/feat-batch-friendly-cli
feat: add --output-dir, --skip-plots, and summary.tsv for pipeline integration
2 parents dc06b69 + b78829c commit 8c551a6

4 files changed

Lines changed: 93 additions & 41 deletions

File tree

src/instanexus/consensus.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,10 @@ def plot_logo2(pssm_df, output_file):
141141
plt.close(fig)
142142

143143

144-
def run_consensus_generation(align_folder: str, output_folder: str, run_id: str = ""):
144+
def run_consensus_generation(align_folder: str, output_folder: str, run_id: str = "", skip_plots: bool = False):
145145
"""
146146
Core logic: Process all .afa files from alignment folder.
147-
Generate consensus sequences, heatmaps, and logos.
147+
Generate consensus sequences and, optionally, heatmaps and logos.
148148
"""
149149
align_path = Path(align_folder)
150150
output_path = Path(output_folder)
@@ -155,12 +155,13 @@ def run_consensus_generation(align_folder: str, output_folder: str, run_id: str
155155
raise FileNotFoundError(f"Alignment folder not found: {align_path}")
156156

157157
consensus_fasta_dir = output_path / "consensus_fasta"
158-
heatmap_dir = output_path / "heatmap"
159-
logo_dir = output_path / "logo"
160-
161158
consensus_fasta_dir.mkdir(exist_ok=True)
162-
heatmap_dir.mkdir(exist_ok=True)
163-
logo_dir.mkdir(exist_ok=True)
159+
160+
if not skip_plots:
161+
heatmap_dir = output_path / "heatmap"
162+
logo_dir = output_path / "logo"
163+
heatmap_dir.mkdir(exist_ok=True)
164+
logo_dir.mkdir(exist_ok=True)
164165

165166
alignment_files = [f for f in sorted(os.listdir(align_path)) if f.endswith(".afa")]
166167

@@ -190,11 +191,12 @@ def run_consensus_generation(align_folder: str, output_folder: str, run_id: str
190191
consensus_fasta_path = consensus_fasta_dir / f"{base_filename}_consensus.fasta"
191192
Bio.SeqIO.write([consensus_record], consensus_fasta_path, "fasta")
192193

193-
heatmap_path = heatmap_dir / f"{base_filename}_heatmap.svg"
194-
plot_heatmap2(pssm_df, heatmap_path)
194+
if not skip_plots:
195+
heatmap_path = heatmap_dir / f"{base_filename}_heatmap.svg"
196+
plot_heatmap2(pssm_df, heatmap_path)
195197

196-
logo_path = logo_dir / f"{base_filename}_logo.svg"
197-
plot_logo2(pssm_df, logo_path)
198+
logo_path = logo_dir / f"{base_filename}_logo.svg"
199+
plot_logo2(pssm_df, logo_path)
198200

199201
logger.info("All consensus tasks completed.")
200202
return consensus_fasta_dir
@@ -248,7 +250,7 @@ def generate_consensus_stats(consensus_base_folder):
248250
logger.info(f"Consensus statistics saved to: {stats_path}")
249251

250252

251-
def main(input_alignment_folder: str, output_consensus_folder: str, run_id: str = ""):
253+
def main(input_alignment_folder: str, output_consensus_folder: str, run_id: str = "", skip_plots: bool = False):
252254
"""
253255
Main function to run the consensus generation script.
254256
"""
@@ -259,13 +261,16 @@ def main(input_alignment_folder: str, output_consensus_folder: str, run_id: str
259261

260262
logger.info(f"Alignment Folder (Input): {align_folder_in}")
261263
logger.info(f"Consensus Folder (Output): {consensus_folder_out}")
264+
if skip_plots:
265+
logger.info("Skipping heatmap and logo generation (--skip-plots)")
262266

263-
# --- Step 1: Generate consensus, heatmaps, and logos ---
267+
# --- Step 1: Generate consensus and, optionally, heatmaps and logos ---
264268
logger.info("Running consensus generation from alignment files...")
265269
consensus_fasta_dir = run_consensus_generation(
266270
align_folder=str(align_folder_in),
267271
output_folder=str(consensus_folder_out),
268272
run_id=run_id,
273+
skip_plots=skip_plots,
269274
)
270275

271276
# --- Step 2: Generate statistics on the consensus files ---
@@ -300,13 +305,19 @@ def cli():
300305
default="",
301306
help="Optional ID to display in the progress bar.",
302307
)
308+
parser.add_argument(
309+
"--skip-plots",
310+
action="store_true",
311+
help="Skip generating heatmap and logo SVG plots.",
312+
)
303313

304314
args = parser.parse_args()
305315

306316
main(
307317
input_alignment_folder=args.input_folder,
308318
output_consensus_folder=args.output_folder,
309319
run_id=args.run_id,
320+
skip_plots=args.skip_plots,
310321
)
311322

312323

src/instanexus/main.py

Lines changed: 61 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,19 @@ def cli():
143143
action="store_true",
144144
help="Enables iterative refinement (Overlap Graph) to merge assembled contigs.",
145145
)
146+
parser.add_argument(
147+
"--output-dir",
148+
type=str,
149+
default=None,
150+
help="Explicit output directory. When set, overrides the auto-generated path (folder-outputs/run_name/params). "
151+
"Useful for pipeline/batch execution where deterministic output paths are required.",
152+
)
153+
parser.add_argument(
154+
"--skip-plots",
155+
action="store_true",
156+
help="Skip generating heatmap and logo plots in the consensus step. "
157+
"Useful for headless/batch execution where visualizations are not needed.",
158+
)
146159

147160
args = parser.parse_args()
148161

@@ -160,30 +173,28 @@ def run_pipeline(args):
160173
logger.info("--- InstaNexus Pipeline started ---")
161174

162175
run_name = Path(args.input_csv).stem
163-
base_output_folder = Path(args.folder_outputs) / run_name # e.g., 'outputs/bsa'
164-
165-
# Build the experiment folder name based on parameters
166-
folder_name_parts = [f"{args.assembly_mode}"]
167176

168-
if args.chain:
169-
folder_name_parts.append(f"{args.chain}")
170-
171-
if args.fdr is not None:
172-
folder_name_parts.append(f"fdr{args.fdr}")
173-
elif args.conf is not None:
174-
folder_name_parts.append(f"c{args.conf}")
175-
176-
if "dbg" in args.assembly_mode:
177-
folder_name_parts.append(f"ks{args.kmer_size}")
178-
179-
# folder_name_parts.append(f"mo{args.min_overlap}")
180-
# folder_name_parts.append(f"ts{args.size_threshold}")
181-
182-
# if args.reference:
183-
# folder_name_parts.extend([f"mi{args.min_identity}", f"mm{args.max_mismatches}"])
184-
185-
run_folder_name = "_".join(folder_name_parts)
186-
experiment_folder = base_output_folder / run_folder_name # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10'
177+
# Determine experiment output folder
178+
if args.output_dir:
179+
# Explicit output directory — deterministic path for pipeline/batch use
180+
experiment_folder = Path(args.output_dir)
181+
run_folder_name = experiment_folder.name
182+
else:
183+
# Auto-generated path from input name + parameters (interactive use)
184+
base_output_folder = Path(args.folder_outputs) / run_name
185+
186+
folder_name_parts = [f"{args.assembly_mode}"]
187+
if args.chain:
188+
folder_name_parts.append(f"{args.chain}")
189+
if args.fdr is not None:
190+
folder_name_parts.append(f"fdr{args.fdr}")
191+
elif args.conf is not None:
192+
folder_name_parts.append(f"c{args.conf}")
193+
if "dbg" in args.assembly_mode:
194+
folder_name_parts.append(f"ks{args.kmer_size}")
195+
196+
run_folder_name = "_".join(folder_name_parts)
197+
experiment_folder = base_output_folder / run_folder_name
187198

188199
cleaned_csv_path = experiment_folder / "cleaned.csv"
189200

@@ -300,12 +311,38 @@ def run_pipeline(args):
300311
consensus.main(
301312
input_alignment_folder=str(alignment_folder),
302313
output_consensus_folder=str(consensus_folder),
303-
run_id=run_id_str, # Pass ID for logs
314+
run_id=run_id_str,
315+
skip_plots=getattr(args, "skip_plots", False),
304316
)
305317
except Exception as e:
306318
logger.error(f"Consensus failed: {e}")
307319
return
308320

321+
# Write a stable summary file at a predictable path for pipeline integration
322+
summary_path = experiment_folder / "summary.tsv"
323+
try:
324+
summary_data = {
325+
"run_name": run_name,
326+
"assembly_mode": args.assembly_mode,
327+
"output_dir": str(experiment_folder),
328+
"scaffolds_fasta": str(scaffolds_fasta_path),
329+
"consensus_dir": str(consensus_folder),
330+
}
331+
# Include consensus stats if they exist
332+
consensus_stats_path = consensus_folder / "consensus_stats.json"
333+
if consensus_stats_path.exists():
334+
import json
335+
336+
with open(consensus_stats_path) as f:
337+
stats = json.load(f)
338+
summary_data.update(stats)
339+
340+
summary_df = pd.DataFrame([summary_data])
341+
summary_df.to_csv(summary_path, sep="\t", index=False)
342+
logger.info(f"Summary written to: {summary_path}")
343+
except Exception as e:
344+
logger.warning(f"Failed to write summary: {e}")
345+
309346
logger.info("--- InstaNexus Pipeline finished successfully! ---")
310347
logger.info(f"Final results in: {experiment_folder}")
311348

src/instanexus/preprocessing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,10 +358,10 @@ def main(
358358
if metadata_json is not None and "experiment_name" in df.columns:
359359
df["protease"] = df["experiment_name"].apply(lambda name: extract_protease(name, proteases))
360360

361-
if "preds" in df.columns:
362-
df["cleaned_preds"] = df["preds"].apply(remove_modifications)
363-
elif "prediction_untokenised" in df.columns:
364-
df["cleaned_preds"] = df["prediction_untokenised"].apply(remove_modifications)
361+
seq_candidates = ["preds", "prediction_untokenised", "prediction", "Peptide", "sequence"]
362+
seq_col = next((c for c in seq_candidates if c in df.columns), None)
363+
if seq_col is not None:
364+
df["cleaned_preds"] = df[seq_col].apply(remove_modifications)
365365
else:
366366
raise ValueError("No suitable column found for peptide sequences.")
367367

uv.lock

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)