PRISM_python/Snakefile at main · inspirewind/PRISM_python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# ──────────────────────────────────────────────────────────────
# PRISM Snakemake Pipeline
# ──────────────────────────────────────────────────────────────
# Metagenomic pathogen detection pipeline.
# Restructured from PRISM.R into a modular Snakemake workflow.
#
# Usage:
#   snakemake --configfile config/config.yaml -j 16
#   snakemake -n --configfile config/config.yaml   # dry run
# ──────────────────────────────────────────────────────────────

import os
import glob as _glob

# ── Resolve paths relative to Snakefile location ─────────────
PRISM_DIR = os.path.dirname(workflow.snakefile)
SCRIPTS   = os.path.join(PRISM_DIR, "workflow", "scripts")

# ── Samples ──────────────────────────────────────────────────
# Two modes:
#   A) Auto-discovery: set data_dir + fq1_suffix/fq2_suffix in config
#   B) Manual: list each sample with fq1/fq2 paths under samples:
# Both can be combined; manual entries override auto-discovered ones.

def _discover_samples(cfg):
    """Build samples dict from config, with optional auto-discovery."""
    samples = {}

    # Auto-discover from data_dir
    data_dir = cfg.get("data_dir", "")
    if data_dir:
        fq1_sfx = cfg.get("fq1_suffix", "_R1.fq.gz")
        fq2_sfx = cfg.get("fq2_suffix", "_R2.fq.gz")
        pattern = os.path.join(data_dir, f"*{fq1_sfx}")
        for fq1_path in sorted(_glob.glob(pattern)):
            basename = os.path.basename(fq1_path)
            sample_name = basename[: -len(fq1_sfx)]
            fq2_path = os.path.join(data_dir, f"{sample_name}{fq2_sfx}")
            if not os.path.exists(fq2_path):
                raise ValueError(
                    f"Auto-discovery: found R1 '{fq1_path}' but missing R2 '{fq2_path}'"
                )
            samples[sample_name] = {"fq1": fq1_path, "fq2": fq2_path}

    # Manual entries override auto-discovered ones
    manual = cfg.get("samples", {})
    if manual:
        samples.update(manual)

    if not samples:
        raise ValueError(
            "No samples found. Set 'data_dir' for auto-discovery "
            "or list samples under 'samples:' in config.yaml"
        )
    return samples

SAMPLE_DICT = _discover_samples(config)
SAMPLES = list(SAMPLE_DICT.keys())

# ── Output root ──────────────────────────────────────────────
OUT_ROOT = config.get("output_dir", "results")

def out(sample, *parts):
    """Build output path:  {output_dir}/{sample}_prism/data/{parts}"""
    return os.path.join(OUT_ROOT, f"{sample}_prism", "data", *parts)

def out_final(sample, *parts):
    """Build final output path:  {output_dir}/{sample}_prism/{parts}"""
    return os.path.join(OUT_ROOT, f"{sample}_prism", *parts)

# ── Paired-end flag ──────────────────────────────────────────
PAIRED = config.get("paired", True)

# ── Include rule files ───────────────────────────────────────
include: "workflow/rules/step01_kraken.smk"
include: "workflow/rules/step02_minimap.smk"
include: "workflow/rules/step03_star.smk"
include: "workflow/rules/step04_subsample.smk"
include: "workflow/rules/step04a_customdb.smk"
include: "workflow/rules/step05_blast_sub.smk"
include: "workflow/rules/step06_multimapping.smk"
include: "workflow/rules/step07_blast_full.smk"
include: "workflow/rules/step08_filter.smk"
include: "workflow/rules/step09_genbank.smk"
include: "workflow/rules/step10_profile.smk"
include: "workflow/rules/step11_output.smk"

# ── Target rule ──────────────────────────────────────────────
rule all:
    input:
        expand(out_final("{sample}", "{sample}-results.csv"), sample=SAMPLES),
        expand(out_final("{sample}", "{sample}-counts.csv"),  sample=SAMPLES),
        expand(out_final("{sample}", "{sample}_1.fa"),        sample=SAMPLES),
        expand(out("{sample}", "customdb_cleaned.flag"),      sample=SAMPLES),