From a26dddab5bf2753d48016e1e155c6a4ad20b4f4f Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 14:10:18 -0400 Subject: [PATCH 01/11] Ribodetector and RustQC additions - Implemented ribodetector rules and associated conda environment - Implemented RustQC and added singularity requirement in job script to run it. - Updated multiqc config to support branding to CQB and new QC report - IMplemented basic QC report (what we previously had) and detailed QC report --- Snakefile | 356 +++++++++++++++++++++++------------ env_config/pcaplot.yaml | 3 +- env_config/ribodetector.yaml | 10 + job.script.sh | 27 +-- multiqc_config.yaml | 23 ++- scripts/ribo_stats.sh | 79 ++++++++ 6 files changed, 353 insertions(+), 145 deletions(-) create mode 100644 env_config/ribodetector.yaml create mode 100644 scripts/ribo_stats.sh diff --git a/Snakefile b/Snakefile index cce1870..8f7c9dc 100644 --- a/Snakefile +++ b/Snakefile @@ -4,14 +4,22 @@ #####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ import pandas as pd -# set config file +#----- set config file configfile: "config.yaml" +print(config) -# read in sample data +#----- read in sample data samples_df = pd.read_csv(config["sample_csv"]).set_index("sample_id", drop=False) sample_list = list(samples_df['sample_id']) -print(config) +#----- Set additional rules based on params +REMOVE_rRNA = config.get("remove_rRNA", False) +if REMOVE_rRNA: + R1_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.1.fq.gz" + R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else None +else: + R1_FASTQ_INPUT = "trimming/{sample}.R1.trim.fastq.gz" + R2_FASTQ_INPUT = "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else None #####~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # define rules @@ -22,10 +30,12 @@ rule all: expand("trimming/{sample}.R1.trim.fastq.gz", sample=sample_list), expand("trimming/{sample}.R2.trim.fastq.gz", sample=sample_list) if config["layout"] == "paired" else [], expand("trimming/{sample}.cutadapt.report", sample=sample_list), + expand("ribodetector/{sample}/{sample}.nonrrna.1.fq.gz", sample=sample_list) if REMOVE_rRNA else [], + expand("ribodetector/{sample}/{sample}.nonrrna.2.fq.gz", sample=sample_list) if REMOVE_rRNA and config["layout"] == "paired" else [], + "ribodetector/rrna_norrna_pct_mqc.tsv" if REMOVE_rRNA else [], expand("alignment/{sample}.srt.bam", sample=sample_list), expand("alignment/{sample}.srt.bam.bai", sample=sample_list), expand("alignment/{sample}.srt.bam", sample=sample_list), #commenting out until condition for STAR exists - expand("alignment/stats/{sample}.srt.bam.flagstat", sample=sample_list), expand("markdup/{sample}.mkdup.bam", sample=sample_list), expand("metrics/picard/{sample}.picard.rna.metrics.txt", sample=sample_list), expand("rsem/{sample}.genes.results", sample=sample_list) if config["run_rsem"] == "yes" else [], @@ -40,24 +50,33 @@ rule all: "featurecounts/featurecounts.readcounts_rpkm.ann.tsv", "featurecounts/featurecounts.readcounts_fpkm.tsv", "featurecounts/featurecounts.readcounts_fpkm.ann.tsv", + expand("qc/{sample}", sample=sample_list) conda: "env_config/multiqc.yaml", - resources: cpus="10", maxtime="2:00:00", mem_mb=60000, - + resources: cpus="10", maxtime="2:00:00", mem_mb="60gb", params: layout=config["layout"], multiqc=config["multiqc_path"], run_rsem=config["run_rsem"], aligner_name=config["aligner_name"], - output: - "multiqc_report.html" - + detailed_qc = "multiqc_report_detailed.html", + basic_qc = "multiqc_report_basic.html" shell: """ - #multiqc fastqc alignment markdup metrics featurecounts - {params.multiqc} -p alignment markdup metrics featurecounts + #-----multiqc fastqc alignment markdup metrics featurecounts + {params.multiqc} \ + -c multiqc_config.yaml \ + -p \ + qc alignment markdup metrics featurecounts ribodetector \ + -n {output.detailed_qc} + + {params.multiqc} \ + -c multiqc_config.yaml \ + -p \ + qc/*/samtools qc/*/preseq alignment markdup metrics featurecounts ribodetector \ + -n {output.basic_qc} - #remove dummy R2 file (created to meet input rule requirements for rule all:) + #-----remove dummy R2 file (created to meet input rule requirements for rule all:) # also remove dummy rpkm and fpkm files from featurecounts normalization if [ "{params.layout}" = "single" ] then @@ -69,8 +88,7 @@ rule all: rm -f featurecounts/featurecounts.readcounts_rpkm.ann.tsv fi - - #remove dummy alignment files (created to meet input rule requirements for rule all:) + #-----remove dummy alignment files (created to meet input rule requirements for rule all:) if [ "{params.aligner_name}" = "hisat" ] then rm -rf alignment/*.Aligned.toTranscriptome.out.bam @@ -78,12 +96,14 @@ rule all: """ - rule trimming: + """ + Read trimming + """ output: - "trimming/{sample}.R1.trim.fastq.gz", - "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else [], - "trimming/{sample}.cutadapt.report" + trim_R1 = "trimming/{sample}.R1.trim.fastq.gz", + trim_R2 = "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else [], + trim_log = "trimming/{sample}.cutadapt.report" params: sample = lambda wildcards: wildcards.sample, cutadapt = config["cutadapt_path"], @@ -93,41 +113,100 @@ rule trimming: nextseq_flag = config["cutadapt_nextseq_flag"] conda: "env_config/cutadapt.yaml", - resources: cpus="10", maxtime="2:00:00", mem_mb=60000, - + resources: cpus="10", maxtime="2:00:00", mem_mb="60gb", + message: "Trimming {wildcards.sample} reads with cutadapt." shell: """ if [ "{params.layout}" == "paired" ] then {params.cutadapt} \ - -o trimming/{params.sample}.R1.trim.fastq.gz \ - -p trimming/{params.sample}.R2.trim.fastq.gz \ + -o {output.trim_R1} \ + -p {output.trim_R2} \ {params.fastq_file_1} \ {params.fastq_file_2} \ -m 1 \ {params.nextseq_flag} \ -j {resources.cpus} \ --max-n 0.8 \ - --trim-n > trimming/{params.sample}.cutadapt.report + --trim-n > {output.trim_log} else {params.cutadapt} \ - -o trimming/{params.sample}.R1.trim.fastq.gz \ + -o {output.trim_R1} \ {params.fastq_file_1} \ -m 1 \ {params.nextseq_flag} \ -j {resources.cpus} \ --max-n 0.8 \ - --trim-n > trimming/{params.sample}.cutadapt.report + --trim-n > {output.trim_log} fi """ +rule ribodetector: + """ + Ribosomal RNA filtering + """ + input: + trimmed_read_1 = "trimming/{sample}.R1.trim.fastq.gz", + trimmed_read_2 = "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else [] + output: + filtered_read_1 = "ribodetector/{sample}/{sample}.nonrrna.1.fq.gz", + rrna_reads_1 = "ribodetector/{sample}/{sample}.rrna.1.fq.gz", + filtered_read_2 = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else [], + rrna_reads_2 = "ribodetector/{sample}/{sample}.rrna.2.fq.gz" if config["layout"]=="paired" else [] + resources: cpus="10", maxtime="3:00:00", mem_mb="300gb" + message: "Removing rRNA sequences for {wildcards.sample} reads with ribodetector." + conda: + "env_config/ribodetector.yaml" + params: + sample = lambda wildcards: wildcards.sample, + read_length = config["read_length"], + layout = config["layout"], + filtering_method = "norrna" if config["layout"]=="paired" else "none" + log: + stdout = "ribodetector/{sample}/stdout.log", + stderr = "ribodetector/{sample}/stderr.log" + shell: """ + if [ "{params.layout}" = "paired" ]; then + ribodetector_cpu -t {resources.cpus} \ + -l {params.read_length} \ + -i {input.trimmed_read_1} {input.trimmed_read_2} \ + -e norrna \ + -o {output.filtered_read_1} {output.filtered_read_2} \ + -r {output.rrna_reads_1} {output.rrna_reads_2} \ + > {log.stdout} 2> {log.stderr} + else + ribodetector_cpu -t {resources.cpus} \ + -l {params.read_length} \ + -i {input.trimmed_read_1} \ + -e norrna \ + -o {output.filtered_read_1} \ + -r {output.rrna_reads_1} \ + > {log.stdout} 2> {log.stderr} + fi + """ - - - - +rule ribodetector_mqc_summary: + """ + Generate ribodetector multiqc content + """ + input: + expand("ribodetector/{sample}/stderr.log", sample=sample_list) + output: + "ribodetector/rrna_norrna_pct_mqc.tsv" + params: + script = "scripts/ribodetector_mqc.sh", + ribo_dir = "ribodetector" + resources: cpus="1", maxtime="10:00", mem_mb="2gb" + message: "Generating ribodetector report." + shell: """ + bash scripts/ribo_stats.sh ribodetector + """ + if config["aligner_name"]=="star": rule pre_alignment: + """ + Generate aligner index and verify + """ output: "alignment/index_status.txt", params: layout = config["layout"], @@ -137,9 +216,8 @@ if config["aligner_name"]=="star": samtools = config["samtools_path"], conda: "env_config/alignment.yaml", - - resources: cpus="10", maxtime="8:00:00", mem_mb=120000, - + resources: cpus="10", maxtime="8:00:00", mem_mb="120gb", + message: "Checking STAR alignment index" shell: """ align_folder="sample_ref/STAR_index" if [ ! -d "{params.aligner_index}" ] @@ -161,14 +239,15 @@ if config["aligner_name"]=="star": """ rule alignment: - input: "trimming/{sample}.R1.trim.fastq.gz", - "trimming/{sample}.R2.trim.fastq.gz" if config["layout"] == "paired" else [], + """ + STAR alignment + """ + input: R1_FASTQ_INPUT, + R2_FASTQ_INPUT if config["layout"] == "paired" else [], "alignment/index_status.txt", - output: "alignment/{sample}.srt.bam", "alignment/{sample}.srt.bam.bai", "alignment/{sample}.Aligned.toTranscriptome.out.bam", - params: layout = config["layout"], sample = lambda wildcards: wildcards.sample, @@ -176,12 +255,11 @@ if config["aligner_name"]=="star": aligner = config["aligner_path"], aligner_index = config["aligner_index"], samtools = config["samtools_path"], - readFilesIn = "trimming/{sample}.R1.trim.fastq.gz" + " trimming/{sample}.R2.trim.fastq.gz" if config["layout"] == "paired" else 'trimming/{sample}.R1.trim.fastq.gz' + readFilesIn = R1_FASTQ_INPUT + (f" {R2_FASTQ_INPUT}" if config["layout"] == "paired" else '') conda: "env_config/alignment.yaml", - - resources: cpus="5", maxtime="8:00:00", mem_mb=100000, - + resources: cpus="5", maxtime="8:00:00", mem_mb="100gb", + message: "aligning {wildcards.sample} reads with STAR." shell: """ align_folder=`cat alignment/index_status.txt` {params.aligner} \ @@ -205,23 +283,22 @@ if config["aligner_name"]=="star": --readFilesCommand zcat \ --outFileNamePrefix alignment/{params.sample}. -# rename output BAM + #----- rename output BAM mv alignment/{params.sample}.Aligned.sortedByCoord.out.bam alignment/{params.sample}.srt.bam - # index BAM + #----- index BAM {params.samtools} index -@ 4 alignment/{params.sample}.srt.bam """ - - if config["aligner_name"]=="hisat": rule alignment: + """ + Hisat2 alignment + """ input: "trimming/{sample}.R1.trim.fastq.gz", "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else [], - output: "alignment/{sample}.srt.bam", "alignment/{sample}.srt.bam.bai", - params: layout = config["layout"], sample = lambda wildcards: wildcards.sample, @@ -230,71 +307,80 @@ if config["aligner_name"]=="hisat": aligner_index = config["aligner_index"], samtools = config["samtools_path"], fastq_1_flag = '-1' if config['layout']=='paired' else '-U', - fastq_2 = '-2 trimming/{sample}.R2.trim.fastq.gz' if config['layout']=='paired' else '', - + fastq_2 = '-2 trimming/{sample}.R2.trim.fastq.gz' if config['layout']=='paired' else '', conda: "env_config/alignment.yaml", - - resources: cpus="4", maxtime="8:00:00", mem_mb=40000, - + resources: cpus="4", maxtime="8:00:00", mem_mb="40gb", + message: "aligning {wildcards.sample} reads with hisat2." shell: """ {params.hisat2} \ -x {params.aligner_index} \ --rg ID:{params.sample} \ --rg SM:{params.sample} \ --rg LB:{params.sample} \ - {params.fastq_1_flag} trimming/{params.sample}.R1.trim.fastq.gz \ + {params.fastq_1_flag} \ + trimming/{params.sample}.R1.trim.fastq.gz \ {params.fastq_2} \ -p {resources.cpus} \ --summary-file alignment/{params.sample}.hisat.summary.txt | \ {params.samtools} view -@ {resources.cpus} -b | \ {params.samtools} sort -T /scratch/samtools_{params.sample} -@ {resources.cpus} -m 128M - 1> alignment/{params.sample}.srt.bam - # generate BAM index + #----- generate BAM index {params.samtools} index -@ {resources.cpus} alignment/{params.sample}.srt.bam """ - - -rule alignment_metrics: - input: "alignment/{sample}.srt.bam", - - output: "alignment/stats/{sample}.srt.bam.flagstat", - "alignment/stats/{sample}.srt.bam.idxstats", - +rule rustqc: + """ + RNA-seq QC with RustQC (containerized) + """ + input: + bam = "markdup/{sample}.mkdup.bam" + output: + qc_dir = directory("qc/{sample}") + container: "docker://ghcr.io/seqeralabs/rustqc:latest" params: - samtools = config["samtools_path"], - sample = lambda wildcards: wildcards.sample, - conda: - "env_config/samtools.yaml", - - resources: cpus="2", maxtime="8:00:00", mem_mb=20000, - + gtf = config["annotation_gtf"], + paired_flag = '-p' if config['layout']=='paired' else '' + threads: 4 + resources: cpus="4", maxtime="8:00:00", mem_mb="40gb", + message: "Running comprehensive QC for {wildcards.sample} with RustQC." + log: "qc/{sample}/{sample}.log" shell: """ - {params.samtools} flagstat alignment/{params.sample}.srt.bam > alignment/stats/{params.sample}.srt.bam.flagstat - {params.samtools} idxstats alignment/{params.sample}.srt.bam > alignment/stats/{params.sample}.srt.bam.idxstats - """ - -rule picard_markdup: - input: "alignment/{sample}.srt.bam", + + rustqc rna \ + {input.bam} \ + --gtf {params.gtf} \ + {params.paired_flag} \ + -o {output.qc_dir} 2> {log} && + + rm -r {output.qc_dir}/featurecounts - output: "markdup/{sample}.mkdup.bam", + """ +rule picard_markdup: + """ + Deduplication + """ + input: + sorted_bam = "alignment/{sample}.srt.bam", + output: + mkdups = "markdup/{sample}.mkdup.bam", + picard_log = "markdup/{sample}.mkdup.log.txt" params: sample = lambda wildcards: wildcards.sample, picard = config['picard_path'], conda: "env_config/picard.yaml", - - resources: cpus="2", maxtime="30:00", mem_mb=20000, - + resources: cpus="2", maxtime="30:00", mem_mb="20gb", + message: "Deduplicating reads for {wildcards.sample} reads with Picard." shell: """ {params.picard} -Xmx2G -Xms2G \ MarkDuplicates \ - I=alignment/{params.sample}.srt.bam \ - O=markdup/{params.sample}.mkdup.bam \ - M=markdup/{params.sample}.mkdup.log.txt \ + I={input.sorted_bam} \ + O={output.mkdups} \ + M={output.picard_log} \ OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 \ CREATE_INDEX=false \ MAX_RECORDS_IN_RAM=4000000 \ @@ -303,10 +389,13 @@ rule picard_markdup: """ rule picard_collectmetrics: - input: "markdup/{sample}.mkdup.bam", - - output: "metrics/picard/{sample}.picard.rna.metrics.txt", - + """ + Picard metrics + """ + input: + mkdup_bam = "markdup/{sample}.mkdup.bam", + output: + picard_metrics = "metrics/picard/{sample}.picard.rna.metrics.txt", params: sample = lambda wildcards: wildcards.sample, picard = config['picard_path'], @@ -315,24 +404,28 @@ rule picard_collectmetrics: strand = config['picard_strand'], conda: "env_config/picard.yaml", - - resources: cpus="2", maxtime="8:00:00", mem_mb=20000, - + resources: cpus="2", maxtime="8:00:00", mem_mb="20gb", + message: "Collecting RNASeq metrics for {wildcards.sample} reads with Picard." shell: """ {params.picard} -Xmx2G -Xms2G \ - CollectRnaSeqMetrics \ - I=markdup/{params.sample}.mkdup.bam \ - O=metrics/picard/{params.sample}.picard.rna.metrics.txt \ - REF_FLAT={params.flatref} STRAND={params.strand} \ + CollectRnaSeqMetrics \ + I={input.mkdup_bam} \ + O={output.picard_metrics} \ + REF_FLAT={params.flatref} \ + STRAND={params.strand} \ RIBOSOMAL_INTERVALS={params.rrna_list} \ MAX_RECORDS_IN_RAM=1000000 """ rule rsem: - input: "alignment/{sample}.srt.bam", - - output: "rsem/{sample}.genes.results", - "rsem/{sample}.isoforms.results" + """ + Isoform counting + """ + input: + "alignment/{sample}.srt.bam", + output: + "rsem/{sample}.genes.results", + "rsem/{sample}.isoforms.results" params: sample = lambda wildcards: wildcards.sample, rsem_calc_exp_path = config['rsem_calc_exp_path'], @@ -341,8 +434,8 @@ rule rsem: rsem_paired_flag = '--paired-end' if config["layout"]=='paired' else '', conda: "env_config/rsem.yaml", - resources: cpus="10", maxtime="8:00:00", mem_mb=60000, - + resources: cpus="10", maxtime="8:00:00", mem_mb="60gb", + message: "Counting transcript isoforms for {wildcards.sample} reads with RSEM." shell: """ {params.rsem_calc_exp_path} \ {params.rsem_paired_flag} \ @@ -356,16 +449,20 @@ rule rsem: """ rule featurecounts: - input: expand("alignment/{sample}.srt.bam", sample=sample_list), - - output: "featurecounts/featurecounts.readcounts.tsv", - "featurecounts/featurecounts.readcounts.ann.tsv", - "featurecounts/featurecounts.readcounts_tpm.tsv", - "featurecounts/featurecounts.readcounts_tpm.ann.tsv", - "featurecounts/featurecounts.readcounts_rpkm.tsv", - "featurecounts/featurecounts.readcounts_rpkm.ann.tsv", - "featurecounts/featurecounts.readcounts_fpkm.tsv", - "featurecounts/featurecounts.readcounts_fpkm.ann.tsv", + """ + Count reads + """ + input: + expand("alignment/{sample}.srt.bam", sample=sample_list), + output: + "featurecounts/featurecounts.readcounts.tsv", + "featurecounts/featurecounts.readcounts.ann.tsv", + "featurecounts/featurecounts.readcounts_tpm.tsv", + "featurecounts/featurecounts.readcounts_tpm.ann.tsv", + "featurecounts/featurecounts.readcounts_rpkm.tsv", + "featurecounts/featurecounts.readcounts_rpkm.ann.tsv", + "featurecounts/featurecounts.readcounts_fpkm.tsv", + "featurecounts/featurecounts.readcounts_fpkm.ann.tsv", params: featurecounts = config['featurecounts_path'], layout = config["layout"], @@ -376,12 +473,21 @@ rule featurecounts: fc_ann_script = config['featurecounts_annscript'], conda: "env_config/featurecounts.yaml", - - resources: cpus="10", maxtime="8:00:00", mem_mb=100000, - + resources: cpus="10", maxtime="8:00:00", mem_mb="100gb", + message: "Counting reads with FeatureCounts." shell: """ - {params.featurecounts} -T 32 {params.pair_flag} -s {params.strand} -a {params.gtf} -o featurecounts/featurecounts.readcounts.raw.tsv {input} + {params.featurecounts} \ + -T 32 \ + {params.pair_flag} \ + -s {params.strand} \ + -a {params.gtf} \ + -o featurecounts/featurecounts.readcounts.raw.tsv \ + {input} + + #----- Clean sed s/"alignment\/"//g featurecounts/featurecounts.readcounts.raw.tsv| sed s/".srt.bam"//g| tail -n +2 > featurecounts/featurecounts.readcounts.tsv + + #----- Annotate python {params.fc_tpm_script} featurecounts/featurecounts.readcounts.tsv {params.layout} python {params.fc_ann_script} {params.gtf} featurecounts/featurecounts.readcounts.tsv > featurecounts/featurecounts.readcounts.ann.tsv python {params.fc_ann_script} {params.gtf} featurecounts/featurecounts.readcounts_tpm.tsv > featurecounts/featurecounts.readcounts_tpm.ann.tsv @@ -398,8 +504,11 @@ rule featurecounts: """ rule pca_plots: - input: "featurecounts/featurecounts.readcounts.tsv", - + """ + PCA + """ + input: + "featurecounts/featurecounts.readcounts.tsv", output: "plots/PCA_top_PC1_vs_PC2.png", "plots/PCA_top_PCA_variance_bar.png", @@ -408,20 +517,22 @@ rule pca_plots: conda: "env_config/pcaplot.yaml", resources: cpus="1", maxtime="1:00:00", mem_mb=2000, + message: "Running PCA" shell: """ python {params.pca_plot_script} \ featurecounts/featurecounts.readcounts.tsv \ plots """ - - #### # Reference path checking # This rule is not run by the default Snakemake target. # To run these checks, run snakemake -s Snakefile check_refs #### rule check_refs: + """ + Verify ref paths + """ params: ref_gtf = config["annotation_gtf"], aligner_index = config["aligner_index"], @@ -430,7 +541,8 @@ rule check_refs: picard_rrna_list = config["picard_rrna_list"], run_rsem = config["run_rsem"], rsem_ref = config["rsem_ref_path"], - resources: cpus="1", maxtime="1:00:00", mem_mb=2000, + resources: cpus="1", maxtime="1:00:00", mem_mb="2gb", + message: "Checking reference paths..." shell: """ echo "\nChecking for reference annotation GTF file..." @@ -498,15 +610,15 @@ rule check_refs: """ - - - #### # Automatic Reference building # This rule is not run by the default Snakemake target. # To run these build commands, run snakemake -s Snakefile build_refs #### rule build_refs: + """ + Building refs + """ params: ref_fa = config["reference_fa"], ref_gtf = config["annotation_gtf"], @@ -517,7 +629,8 @@ rule build_refs: rsem_prepare_path = config["rsem_prep_ref_path"], conda: "env_config/build_refs.yaml", - resources: cpus="12", maxtime="8:00:00", mem_mb=48000, + resources: cpus="12", maxtime="8:00:00", mem_mb="48gb", + message: "Building references." shell: """ REF_NAME=`basename {params.ref_fa} .fa` mkdir -p ref/pipeline_refs @@ -576,3 +689,4 @@ fi """ + diff --git a/env_config/pcaplot.yaml b/env_config/pcaplot.yaml index 64264e1..b375718 100644 --- a/env_config/pcaplot.yaml +++ b/env_config/pcaplot.yaml @@ -9,5 +9,4 @@ dependencies: - seaborn - plotnine - pandas - - fastcluster - + - fastcluster \ No newline at end of file diff --git a/env_config/ribodetector.yaml b/env_config/ribodetector.yaml new file mode 100644 index 0000000..c0cdbe9 --- /dev/null +++ b/env_config/ribodetector.yaml @@ -0,0 +1,10 @@ +name: ribodetector +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.8 + - pip + - pip: + - ribodetector==0.2.7 diff --git a/job.script.sh b/job.script.sh index 5e91b3a..acbac46 100644 --- a/job.script.sh +++ b/job.script.sh @@ -1,25 +1,13 @@ #!/bin/bash -# Name of the job -#SBATCH --job-name=RNAseq.preprocess - -# Number of compute nodes +#SBATCH --job-name=Hixon_RNA #SBATCH --nodes=1 - -# specify que to submit to (preempt1 only contains q10 DAC node) -#SBATCH --partition=standard - -# specify account you are submitting from -#SBATCH --account=nccc - -# Walltime (job duration) +#SBATCH --partition=preempt1 +#SBATCH --account=dac #SBATCH --time=60:00:00 - -# Email address -#SBATCH --mail-user=XXXXXXXXX@dartmouth.edu - -# Email notifications (comma-separated options: BEGIN,END,FAIL) +#SBATCH --mail-user=f007qps@dartmouth.edu #SBATCH --mail-type=FAIL +#SBATCH --output=_Hixon_RNA_%j.log #----- Source conda environment source /optnfs/common/miniconda3/etc/profile.d/conda.sh @@ -29,9 +17,10 @@ conda activate /dartfs/rc/nosnapshots/G/GMBSR_refs/envs/snakemake mkdir -p slurm_logs #----- Call Snakemake -snakemake -s Snakefile \ - --conda-frontend conda \ +snakemake -s Snakefile \ --use-conda \ + --use-singularity \ + --singularity-args "--bind /dartfs,/optnfs" \ --conda-prefix /dartfs/rc/nosnapshots/G/GMBSR_refs/envs/DAC-RNAseq-pipeline \ --profile cluster_profile \ --rerun-incomplete \ diff --git a/multiqc_config.yaml b/multiqc_config.yaml index d09094d..97fcb72 100644 --- a/multiqc_config.yaml +++ b/multiqc_config.yaml @@ -1,6 +1,23 @@ -report_header_info: - - Contact E-mail: DataAnalyticsCore@groups.dartmouth.edu" - - Application Type: "X RNA-seq" +# ============================================================================ +# CUSTOM BRANDING - Report Title, Subtitle, and Contact +# ============================================================================ + +# Main report title (appears at top of report) +title: "GDSC-RNASeq Pipeline v1.4" + +# Subtitle text (appears below title) +subtitle: "RNA-Seq Preprocessing" + +custom_logo: "img/cqb_logo.jpg" +custom_logo_url: "https://sites.dartmouth.edu/cqb/files/2019/10/Logo-and-bringing-smaller-300.png" + +# Introductory text (appears at top of report, supports HTML/Markdown) +intro_text: | +

+ Authors: Dartmouth Genomic Data Science Core
+ For correspondence contact: gdsc@groups.dartmouth.edu +

+ extra_fn_clean_exts: - ".srt" diff --git a/scripts/ribo_stats.sh b/scripts/ribo_stats.sh new file mode 100644 index 0000000..fa80011 --- /dev/null +++ b/scripts/ribo_stats.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -euo pipefail +shopt -s nullglob + +if [[ $# -ne 1 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +RESULTS_DIR="$1" + +if [[ ! -d "$RESULTS_DIR" ]]; then + echo "Error: '$RESULTS_DIR' is not a directory" >&2 + exit 1 +fi + +out="ribodetector/rrna_norrna_pct_mqc.tsv" +mkdir -p "$(dirname "$out")" + +# write MultiQC custom-content header +cat > "$out" <<'EOF' +# id: 'ribodetector' +# section_name: 'Ribodetector rRNA Detection' +# description: 'Percentage of reads classified as rRNA by ribodetector. High values indicate poor rRNA depletion during library prep.' +# format: 'tsv' +# plot_type: 'table' +# pconfig: +# id: 'ribodetector_table' +# title: 'Ribodetector: rRNA contamination' +# headers: +# total_reads: +# title: 'Total reads' +# description: 'Total input reads processed by ribodetector' +# format: '{:,.0f}' +# scale: 'Blues' +# rrna_reads: +# title: 'rRNA reads' +# description: 'Reads classified as rRNA (filtered out)' +# format: '{:,.0f}' +# scale: 'Reds' +# pct_rrna: +# title: '% rRNA' +# description: 'Percentage of input reads classified as rRNA' +# format: '{:,.2f}' +# suffix: '%' +# min: 0 +# max: 100 +# scale: 'RdYlGn-rev' +EOF + +# append column header and data +printf "sample\ttotal_reads\trrna_reads\tpct_rrna\n" >> "$out" + +for d in "$RESULTS_DIR"/*/; do + sample=$(basename "$d") + log="$d/stderr.log" + + [[ -f "$log" ]] || { echo "skip: no stderr.log in $d" >&2; continue; } + + total=$(awk ' + { gsub(/\x1B\[[0-9;]*[A-Za-z]/,"") } + match($0,/([0-9]+)[[:space:]]+sequences loaded/,a) { print a[1]; exit } + ' "$log") + + rrna=$(awk ' + { gsub(/\x1B\[[0-9;]*[A-Za-z]/,"") } + match($0,/Detected[[:space:]]+([0-9]+)[[:space:]]+rRNA sequences/,a) { print a[1]; exit } + ' "$log") + + [[ -n "${total:-}" && -n "${rrna:-}" ]] || { echo "skip: missing counts in $log" >&2; continue; } + [[ "$total" =~ ^[0-9]+$ && "$total" -gt 0 ]] || { echo "skip: bad total in $log" >&2; continue; } + [[ "$rrna" =~ ^[0-9]+$ ]] || { echo "skip: bad rrna in $log" >&2; continue; } + + pct=$(awk -v r="$rrna" -v t="$total" 'BEGIN{printf "%.4f", 100*r/t}') + + printf "%s\t%s\t%s\t%s\n" "$sample" "$total" "$rrna" "$pct" >> "$out" +done + +echo "Saved: $out" \ No newline at end of file From 695b049985a386a468d55cf9cac74b5c341fe562 Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 14:14:53 -0400 Subject: [PATCH 02/11] Ribodetector params in prebuilt configs --- prebuilt_configs/human_config_paired_hisat.yaml | 3 +++ prebuilt_configs/human_config_paired_star.yaml | 3 +++ prebuilt_configs/human_config_paired_star_rsem.yaml | 3 +++ prebuilt_configs/human_config_single_hisat.yaml | 3 +++ prebuilt_configs/human_config_single_star.yaml | 3 +++ prebuilt_configs/mouse_config_paired_hisat.yaml | 2 ++ prebuilt_configs/mouse_config_paired_star.yaml | 2 ++ prebuilt_configs/mouse_config_paired_star_rsem.yaml | 2 ++ prebuilt_configs/mouse_config_single_hisat.yaml | 2 ++ prebuilt_configs/mouse_config_single_star.yaml | 2 ++ 10 files changed, 25 insertions(+) diff --git a/prebuilt_configs/human_config_paired_hisat.yaml b/prebuilt_configs/human_config_paired_hisat.yaml index ae659bf..d55dd0f 100644 --- a/prebuilt_configs/human_config_paired_hisat.yaml +++ b/prebuilt_configs/human_config_paired_hisat.yaml @@ -8,6 +8,9 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 + ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/human_config_paired_star.yaml b/prebuilt_configs/human_config_paired_star.yaml index 1ac92af..20908b1 100644 --- a/prebuilt_configs/human_config_paired_star.yaml +++ b/prebuilt_configs/human_config_paired_star.yaml @@ -8,6 +8,9 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 + ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/human_config_paired_star_rsem.yaml b/prebuilt_configs/human_config_paired_star_rsem.yaml index 8739c6c..d070638 100644 --- a/prebuilt_configs/human_config_paired_star_rsem.yaml +++ b/prebuilt_configs/human_config_paired_star_rsem.yaml @@ -8,6 +8,9 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 + ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/human_config_single_hisat.yaml b/prebuilt_configs/human_config_single_hisat.yaml index 7298aee..aa7c6b7 100644 --- a/prebuilt_configs/human_config_single_hisat.yaml +++ b/prebuilt_configs/human_config_single_hisat.yaml @@ -8,6 +8,9 @@ layout: "single" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 + ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/human_config_single_star.yaml b/prebuilt_configs/human_config_single_star.yaml index b169135..9f9f6c7 100644 --- a/prebuilt_configs/human_config_single_star.yaml +++ b/prebuilt_configs/human_config_single_star.yaml @@ -8,6 +8,9 @@ layout: "single" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 + ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/mouse_config_paired_hisat.yaml b/prebuilt_configs/mouse_config_paired_hisat.yaml index c60cd5d..ec3b9a6 100644 --- a/prebuilt_configs/mouse_config_paired_hisat.yaml +++ b/prebuilt_configs/mouse_config_paired_hisat.yaml @@ -8,6 +8,8 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/mouse_config_paired_star.yaml b/prebuilt_configs/mouse_config_paired_star.yaml index 9a2fe4b..772a54b 100644 --- a/prebuilt_configs/mouse_config_paired_star.yaml +++ b/prebuilt_configs/mouse_config_paired_star.yaml @@ -8,6 +8,8 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/mouse_config_paired_star_rsem.yaml b/prebuilt_configs/mouse_config_paired_star_rsem.yaml index ebd4202..6214610 100644 --- a/prebuilt_configs/mouse_config_paired_star_rsem.yaml +++ b/prebuilt_configs/mouse_config_paired_star_rsem.yaml @@ -8,6 +8,8 @@ layout: "paired" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/mouse_config_single_hisat.yaml b/prebuilt_configs/mouse_config_single_hisat.yaml index 4b74b7b..870bf3c 100644 --- a/prebuilt_configs/mouse_config_single_hisat.yaml +++ b/prebuilt_configs/mouse_config_single_hisat.yaml @@ -8,6 +8,8 @@ layout: "single" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 ############################################# # Software Paths for use with "--use-conda" # diff --git a/prebuilt_configs/mouse_config_single_star.yaml b/prebuilt_configs/mouse_config_single_star.yaml index 017b430..0c62dac 100644 --- a/prebuilt_configs/mouse_config_single_star.yaml +++ b/prebuilt_configs/mouse_config_single_star.yaml @@ -8,6 +8,8 @@ layout: "single" #cutadapt_nextseq_flag: "--nextseq-trim=20" cutadapt_nextseq_flag: "" +remove_rRNA: true +read_length: 51 ############################################# # Software Paths for use with "--use-conda" # From f44fddb95cbb1b7f95dd13f5b541f75b287b4af1 Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 18:13:49 -0400 Subject: [PATCH 03/11] update tests --- .github/workflows/test.yml | 60 ++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f6eeff9..ca994c3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,9 +6,9 @@ name: Snakemake_tests on: # Triggers the workflow on push or pull request events but only for the branches listed push: - branches: [ master, dev-tim, dev-andrew-github-actions ] + branches: [ master, dev-tim, dev-andrew-github-actions, dev-ribodetector ] pull_request: - branches: [ master, dev-tim, dev-andrew-github-actions ] + branches: [ master, dev-tim, dev-andrew-github-actions, dev-ribodetector ] # Including this block the allow the workflow to be run manually from the GitHub Actions tab workflow_dispatch: @@ -29,12 +29,16 @@ jobs: # Steps to be run steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: Create_conda_env uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --conda-create-envs-only' + args: '--cores 4 --use-conda --use-singularity --conda-create-envs-only' - name: tar conda envs run: | tar -cf ./conda_envs_archive.tar ./.snakemake @@ -88,12 +92,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: Hisat_single_config_build uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: 'build_refs --cores 4 --use-conda --configfile tests/test_config_single_hisat.yaml' + args: 'build_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_single_hisat.yaml' - name: Hisat_single_config_configure run: cat ref/pipeline_refs/hg38_chr567_100k.entries.yaml >> tests/test_config_single_hisat.yaml - name: Hisat_single_config_check @@ -101,13 +109,13 @@ jobs: with: directory: '.' snakefile: 'Snakefile' - args: 'check_refs --cores 4 --use-conda --configfile tests/test_config_single_hisat.yaml' + args: 'check_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_single_hisat.yaml' - name: Hisat_single_config_run uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --configfile tests/test_config_single_hisat.yaml' + args: '--cores 4 --use-conda --use-singularity --configfile tests/test_config_single_hisat.yaml' # paired_hisat # @@ -118,12 +126,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: Hisat_paired_config_build uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: 'build_refs --cores 4 --use-conda --configfile tests/test_config_paired_hisat.yaml' + args: 'build_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_hisat.yaml' - name: Hisat_paired_config_configure run: cat ref/pipeline_refs/hg38_chr567_100k.entries.yaml >> tests/test_config_paired_hisat.yaml - name: Hisat_paired_config_check @@ -131,13 +143,13 @@ jobs: with: directory: '.' snakefile: 'Snakefile' - args: 'check_refs --cores 4 --use-conda --configfile tests/test_config_paired_hisat.yaml' + args: 'check_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_hisat.yaml' - name: Hisat_paired_config_run uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --configfile tests/test_config_paired_hisat.yaml' + args: '--cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_hisat.yaml' # single_star # @@ -147,12 +159,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: STAR_single_config_build uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: 'build_refs --cores 4 --use-conda --configfile tests/test_config_single_star.yaml' + args: 'build_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_single_star.yaml' - name: STAR_single_config_configure run: cat ref/pipeline_refs/hg38_chr567_100k.entries.yaml >> tests/test_config_single_star.yaml - name: STAR_single_config_check @@ -160,13 +176,13 @@ jobs: with: directory: '.' snakefile: 'Snakefile' - args: 'check_refs --cores 4 --use-conda --configfile tests/test_config_single_star.yaml' + args: 'check_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_single_star.yaml' - name: STAR_single_config_run uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --configfile tests/test_config_single_star.yaml' + args: '--cores 4 --use-conda --use-singularity --configfile tests/test_config_single_star.yaml' # paired_star # @@ -177,12 +193,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: STAR_paired_config_build uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: 'build_refs --cores 4 --use-conda --configfile tests/test_config_paired_star.yaml' + args: 'build_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star.yaml' - name: STAR_paired_config_configure run: cat ref/pipeline_refs/hg38_chr567_100k.entries.yaml >> tests/test_config_paired_star.yaml - name: STAR_paired_config_check @@ -190,13 +210,13 @@ jobs: with: directory: '.' snakefile: 'Snakefile' - args: 'check_refs --cores 4 --use-conda --configfile tests/test_config_paired_star.yaml' + args: 'check_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star.yaml' - name: STAR_paired_config_run uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --configfile tests/test_config_paired_star.yaml' + args: '--cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star.yaml' # paired_star with rsem # @@ -207,12 +227,16 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Apptainer + uses: eWaterCycle/setup-apptainer@v2 + with: + apptainer-version: 1.3.6 - name: STAR_RSEM_paired_config_build uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: 'build_refs --cores 4 --use-conda --configfile tests/test_config_paired_star_rsem.yaml' + args: 'build_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star_rsem.yaml' - name: STAR_RSEM_paired_config_configure run: cat ref/pipeline_refs/hg38_chr567_100k.entries.yaml >> tests/test_config_paired_star_rsem.yaml - name: STAR_RSEM_paired_config_check @@ -220,13 +244,13 @@ jobs: with: directory: '.' snakefile: 'Snakefile' - args: 'check_refs --cores 4 --use-conda --configfile tests/test_config_paired_star_rsem.yaml' + args: 'check_refs --cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star_rsem.yaml' - name: STAR_RSEM_paired_config_run uses: snakemake/snakemake-github-action@v1 with: directory: '.' snakefile: 'Snakefile' - args: '--cores 4 --use-conda --configfile tests/test_config_paired_star_rsem.yaml' + args: '--cores 4 --use-conda --use-singularity --configfile tests/test_config_paired_star_rsem.yaml' #debug_environment_info # From 8e8889fefe6539555fbf36512ab4ed5cbf5cbdab Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 18:25:15 -0400 Subject: [PATCH 04/11] convert all mem_mb to integers --- Snakefile | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/Snakefile b/Snakefile index 8f7c9dc..9abd951 100644 --- a/Snakefile +++ b/Snakefile @@ -6,7 +6,6 @@ import pandas as pd #----- set config file configfile: "config.yaml" -print(config) #----- read in sample data samples_df = pd.read_csv(config["sample_csv"]).set_index("sample_id", drop=False) @@ -53,7 +52,7 @@ rule all: expand("qc/{sample}", sample=sample_list) conda: "env_config/multiqc.yaml", - resources: cpus="10", maxtime="2:00:00", mem_mb="60gb", + resources: cpus="10", maxtime="2:00:00", mem_mb=60000, params: layout=config["layout"], multiqc=config["multiqc_path"], @@ -113,7 +112,7 @@ rule trimming: nextseq_flag = config["cutadapt_nextseq_flag"] conda: "env_config/cutadapt.yaml", - resources: cpus="10", maxtime="2:00:00", mem_mb="60gb", + resources: cpus="10", maxtime="2:00:00", mem_mb=60000, message: "Trimming {wildcards.sample} reads with cutadapt." shell: """ if [ "{params.layout}" == "paired" ] @@ -153,7 +152,7 @@ rule ribodetector: rrna_reads_1 = "ribodetector/{sample}/{sample}.rrna.1.fq.gz", filtered_read_2 = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else [], rrna_reads_2 = "ribodetector/{sample}/{sample}.rrna.2.fq.gz" if config["layout"]=="paired" else [] - resources: cpus="10", maxtime="3:00:00", mem_mb="300gb" + resources: cpus="10", maxtime="3:00:00", mem_mb=300000 message: "Removing rRNA sequences for {wildcards.sample} reads with ribodetector." conda: "env_config/ribodetector.yaml" @@ -196,7 +195,7 @@ rule ribodetector_mqc_summary: params: script = "scripts/ribodetector_mqc.sh", ribo_dir = "ribodetector" - resources: cpus="1", maxtime="10:00", mem_mb="2gb" + resources: cpus="1", maxtime="10:00", mem_mb=2000 message: "Generating ribodetector report." shell: """ bash scripts/ribo_stats.sh ribodetector @@ -216,7 +215,7 @@ if config["aligner_name"]=="star": samtools = config["samtools_path"], conda: "env_config/alignment.yaml", - resources: cpus="10", maxtime="8:00:00", mem_mb="120gb", + resources: cpus="10", maxtime="8:00:00", mem_mb=120000, message: "Checking STAR alignment index" shell: """ align_folder="sample_ref/STAR_index" @@ -258,7 +257,7 @@ if config["aligner_name"]=="star": readFilesIn = R1_FASTQ_INPUT + (f" {R2_FASTQ_INPUT}" if config["layout"] == "paired" else '') conda: "env_config/alignment.yaml", - resources: cpus="5", maxtime="8:00:00", mem_mb="100gb", + resources: cpus="5", maxtime="8:00:00", mem_mb=100000, message: "aligning {wildcards.sample} reads with STAR." shell: """ align_folder=`cat alignment/index_status.txt` @@ -310,7 +309,7 @@ if config["aligner_name"]=="hisat": fastq_2 = '-2 trimming/{sample}.R2.trim.fastq.gz' if config['layout']=='paired' else '', conda: "env_config/alignment.yaml", - resources: cpus="4", maxtime="8:00:00", mem_mb="40gb", + resources: cpus="4", maxtime="8:00:00", mem_mb=40000, message: "aligning {wildcards.sample} reads with hisat2." shell: """ {params.hisat2} \ @@ -344,7 +343,7 @@ rule rustqc: gtf = config["annotation_gtf"], paired_flag = '-p' if config['layout']=='paired' else '' threads: 4 - resources: cpus="4", maxtime="8:00:00", mem_mb="40gb", + resources: cpus="4", maxtime="8:00:00", mem_mb=40000, message: "Running comprehensive QC for {wildcards.sample} with RustQC." log: "qc/{sample}/{sample}.log" shell: """ @@ -373,7 +372,7 @@ rule picard_markdup: picard = config['picard_path'], conda: "env_config/picard.yaml", - resources: cpus="2", maxtime="30:00", mem_mb="20gb", + resources: cpus="2", maxtime="30:00", mem_mb=20000, message: "Deduplicating reads for {wildcards.sample} reads with Picard." shell: """ {params.picard} -Xmx2G -Xms2G \ @@ -404,7 +403,7 @@ rule picard_collectmetrics: strand = config['picard_strand'], conda: "env_config/picard.yaml", - resources: cpus="2", maxtime="8:00:00", mem_mb="20gb", + resources: cpus="2", maxtime="8:00:00", mem_mb=20000, message: "Collecting RNASeq metrics for {wildcards.sample} reads with Picard." shell: """ {params.picard} -Xmx2G -Xms2G \ @@ -434,7 +433,7 @@ rule rsem: rsem_paired_flag = '--paired-end' if config["layout"]=='paired' else '', conda: "env_config/rsem.yaml", - resources: cpus="10", maxtime="8:00:00", mem_mb="60gb", + resources: cpus="10", maxtime="8:00:00", mem_mb=60000, message: "Counting transcript isoforms for {wildcards.sample} reads with RSEM." shell: """ {params.rsem_calc_exp_path} \ @@ -473,7 +472,7 @@ rule featurecounts: fc_ann_script = config['featurecounts_annscript'], conda: "env_config/featurecounts.yaml", - resources: cpus="10", maxtime="8:00:00", mem_mb="100gb", + resources: cpus="10", maxtime="8:00:00", mem_mb=100000, message: "Counting reads with FeatureCounts." shell: """ {params.featurecounts} \ @@ -485,7 +484,7 @@ rule featurecounts: {input} #----- Clean - sed s/"alignment\/"//g featurecounts/featurecounts.readcounts.raw.tsv| sed s/".srt.bam"//g| tail -n +2 > featurecounts/featurecounts.readcounts.tsv + sed 's|alignment/||g' featurecounts/featurecounts.readcounts.raw.tsv| sed 's|.srt.bam||g'| tail -n +2 > featurecounts/featurecounts.readcounts.tsv #----- Annotate python {params.fc_tpm_script} featurecounts/featurecounts.readcounts.tsv {params.layout} @@ -541,7 +540,7 @@ rule check_refs: picard_rrna_list = config["picard_rrna_list"], run_rsem = config["run_rsem"], rsem_ref = config["rsem_ref_path"], - resources: cpus="1", maxtime="1:00:00", mem_mb="2gb", + resources: cpus="1", maxtime="1:00:00", mem_mb=2000, message: "Checking reference paths..." shell: """ @@ -629,7 +628,7 @@ rule build_refs: rsem_prepare_path = config["rsem_prep_ref_path"], conda: "env_config/build_refs.yaml", - resources: cpus="12", maxtime="8:00:00", mem_mb="48gb", + resources: cpus="12", maxtime="8:00:00", mem_mb=48000, message: "Building references." shell: """ REF_NAME=`basename {params.ref_fa} .fa` From 3ae090dd906447acb19a512c7164d4a4a132cb0e Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 18:33:04 -0400 Subject: [PATCH 05/11] Fix read_length inclusion in main config --- config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/config.yaml b/config.yaml index 847842f..f5b1790 100644 --- a/config.yaml +++ b/config.yaml @@ -16,6 +16,10 @@ aligner_name: "hisat" #'hisat' or 'star' # rsem choice run_rsem: "no" +# Run rrna filtering? +remove_rRNA: true +read_length: 51 + ############################################# #Strandedness settings - these should match # ############################################# From 36d47e770626817d8283643b467ca5cf58213e86 Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Fri, 17 Apr 2026 18:46:56 -0400 Subject: [PATCH 06/11] fix gawk error in ribo_stats to work with github action --- scripts/ribo_stats.sh | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/ribo_stats.sh b/scripts/ribo_stats.sh index fa80011..eb84240 100644 --- a/scripts/ribo_stats.sh +++ b/scripts/ribo_stats.sh @@ -57,15 +57,11 @@ for d in "$RESULTS_DIR"/*/; do [[ -f "$log" ]] || { echo "skip: no stderr.log in $d" >&2; continue; } - total=$(awk ' - { gsub(/\x1B\[[0-9;]*[A-Za-z]/,"") } - match($0,/([0-9]+)[[:space:]]+sequences loaded/,a) { print a[1]; exit } - ' "$log") + total=$(sed 's/\x1B\[[0-9;]*[A-Za-z]//g' "$log" \ + | grep -oP '[0-9]+(?=\s+sequences loaded)' | head -1) - rrna=$(awk ' - { gsub(/\x1B\[[0-9;]*[A-Za-z]/,"") } - match($0,/Detected[[:space:]]+([0-9]+)[[:space:]]+rRNA sequences/,a) { print a[1]; exit } - ' "$log") + rrna=$(sed 's/\x1B\[[0-9;]*[A-Za-z]//g' "$log" \ + | grep -oP '(?<=Detected\s)[0-9]+(?=\s+rRNA sequences)' | head -1) [[ -n "${total:-}" && -n "${rrna:-}" ]] || { echo "skip: missing counts in $log" >&2; continue; } [[ "$total" =~ ^[0-9]+$ && "$total" -gt 0 ]] || { echo "skip: bad total in $log" >&2; continue; } From 391c7d83253484a77d837360f7a0bee6f626a286 Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Tue, 21 Apr 2026 11:39:25 -0400 Subject: [PATCH 07/11] add gitignore --- .gitignore | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e69de29 From 73344fbbb4c898ec5a16a085a99cf810d0395745 Mon Sep 17 00:00:00 2001 From: mikemartinez99 Date: Tue, 21 Apr 2026 11:43:43 -0400 Subject: [PATCH 08/11] Make RustQC optional --- .gitignore | 19 ++++ Snakefile | 99 +++++++++++-------- config.yaml | 24 +++-- job.script.sh | 6 +- .../human_config_paired_hisat.yaml | 42 +++++--- .../human_config_paired_star.yaml | 46 ++++++--- .../human_config_paired_star_rsem.yaml | 46 ++++++--- .../human_config_single_hisat.yaml | 46 ++++++--- .../human_config_single_star.yaml | 46 ++++++--- .../mouse_config_paired_hisat.yaml | 47 ++++++--- .../mouse_config_paired_star.yaml | 47 ++++++--- .../mouse_config_paired_star_rsem.yaml | 47 ++++++--- .../mouse_config_single_hisat.yaml | 47 ++++++--- .../mouse_config_single_star.yaml | 47 ++++++--- tests/test_config_paired_hisat.yaml | 1 + tests/test_config_paired_star.yaml | 1 + tests/test_config_paired_star_rsem.yaml | 1 + tests/test_config_single_hisat.yaml | 1 + tests/test_config_single_star.yaml | 1 + 19 files changed, 444 insertions(+), 170 deletions(-) diff --git a/.gitignore b/.gitignore index e69de29..4232ef6 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,19 @@ +.cache/* +.claude/* +.fontconfig/* +.snakemake/* +.Rapp.history +alignment/* +featurecounts/* +logs/* +markdup/* +metrics/* +multiqc_report_data/* +multiqc_report_plots/* +plots/* +qc/* +ribodetector/* +trimming/* +slurm_logs/* +*.log +multiqc_report.html diff --git a/Snakefile b/Snakefile index 9abd951..aab731e 100644 --- a/Snakefile +++ b/Snakefile @@ -13,6 +13,8 @@ sample_list = list(samples_df['sample_id']) #----- Set additional rules based on params REMOVE_rRNA = config.get("remove_rRNA", False) +RUN_RUSTQC = config.get("run_rustqc", False) + if REMOVE_rRNA: R1_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.1.fq.gz" R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else None @@ -49,7 +51,7 @@ rule all: "featurecounts/featurecounts.readcounts_rpkm.ann.tsv", "featurecounts/featurecounts.readcounts_fpkm.tsv", "featurecounts/featurecounts.readcounts_fpkm.ann.tsv", - expand("qc/{sample}", sample=sample_list) + expand("qc/{sample}", sample=sample_list) if RUN_RUSTQC else [expand("alignment/stats/{sample}.srt.bam.flagstat", sample=sample_list)] conda: "env_config/multiqc.yaml", resources: cpus="10", maxtime="2:00:00", mem_mb=60000, @@ -58,23 +60,21 @@ rule all: multiqc=config["multiqc_path"], run_rsem=config["run_rsem"], aligner_name=config["aligner_name"], + multiqc_dirs=( + ("qc " if RUN_RUSTQC else "") + + "alignment markdup metrics featurecounts" + + (" ribodetector" if REMOVE_rRNA else "") + ) output: - detailed_qc = "multiqc_report_detailed.html", - basic_qc = "multiqc_report_basic.html" + report = "multiqc_report.html", shell: """ #-----multiqc fastqc alignment markdup metrics featurecounts {params.multiqc} \ -c multiqc_config.yaml \ -p \ - qc alignment markdup metrics featurecounts ribodetector \ - -n {output.detailed_qc} + {params.multiqc_dirs} \ + -n {output.report} - {params.multiqc} \ - -c multiqc_config.yaml \ - -p \ - qc/*/samtools qc/*/preseq alignment markdup metrics featurecounts ribodetector \ - -n {output.basic_qc} - #-----remove dummy R2 file (created to meet input rule requirements for rule all:) # also remove dummy rpkm and fpkm files from featurecounts normalization if [ "{params.layout}" = "single" ] @@ -292,7 +292,7 @@ if config["aligner_name"]=="star": if config["aligner_name"]=="hisat": rule alignment: """ - Hisat2 alignment + HISAT2 Alignment. """ input: "trimming/{sample}.R1.trim.fastq.gz", "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else [], @@ -306,56 +306,47 @@ if config["aligner_name"]=="hisat": aligner_index = config["aligner_index"], samtools = config["samtools_path"], fastq_1_flag = '-1' if config['layout']=='paired' else '-U', - fastq_2 = '-2 trimming/{sample}.R2.trim.fastq.gz' if config['layout']=='paired' else '', + fastq_2 = '-2 trimming/{sample}.R2.trim.fastq.gz' if config['layout']=='paired' else '', conda: "env_config/alignment.yaml", resources: cpus="4", maxtime="8:00:00", mem_mb=40000, - message: "aligning {wildcards.sample} reads with hisat2." + message: "aligning {wildcards.sample} reads with Hisat2." shell: """ {params.hisat2} \ -x {params.aligner_index} \ --rg ID:{params.sample} \ --rg SM:{params.sample} \ - --rg LB:{params.sample} \ - {params.fastq_1_flag} \ - trimming/{params.sample}.R1.trim.fastq.gz \ + --rg LB:{params.sample} \ + {params.fastq_1_flag} trimming/{params.sample}.R1.trim.fastq.gz \ {params.fastq_2} \ -p {resources.cpus} \ --summary-file alignment/{params.sample}.hisat.summary.txt | \ {params.samtools} view -@ {resources.cpus} -b | \ {params.samtools} sort -T /scratch/samtools_{params.sample} -@ {resources.cpus} -m 128M - 1> alignment/{params.sample}.srt.bam - #----- generate BAM index + # generate BAM index {params.samtools} index -@ {resources.cpus} alignment/{params.sample}.srt.bam """ -rule rustqc: + +rule alignment_metrics: """ - RNA-seq QC with RustQC (containerized) + Samtools metrics (when rustQC is false) """ - input: - bam = "markdup/{sample}.mkdup.bam" - output: - qc_dir = directory("qc/{sample}") - container: "docker://ghcr.io/seqeralabs/rustqc:latest" + input: "alignment/{sample}.srt.bam", + output: "alignment/stats/{sample}.srt.bam.flagstat", + "alignment/stats/{sample}.srt.bam.idxstats", params: - gtf = config["annotation_gtf"], - paired_flag = '-p' if config['layout']=='paired' else '' - threads: 4 - resources: cpus="4", maxtime="8:00:00", mem_mb=40000, - message: "Running comprehensive QC for {wildcards.sample} with RustQC." - log: "qc/{sample}/{sample}.log" + samtools = config["samtools_path"], + sample = lambda wildcards: wildcards.sample, + conda: + "env_config/samtools.yaml", + resources: cpus="2", maxtime="8:00:00", mem_mb=20000, + message: "Running flagstats and idxstats QC for {wildcards.sample} with Samtools." shell: """ - - rustqc rna \ - {input.bam} \ - --gtf {params.gtf} \ - {params.paired_flag} \ - -o {output.qc_dir} 2> {log} && - - rm -r {output.qc_dir}/featurecounts - + {params.samtools} flagstat alignment/{params.sample}.srt.bam > alignment/stats/{params.sample}.srt.bam.flagstat + {params.samtools} idxstats alignment/{params.sample}.srt.bam > alignment/stats/{params.sample}.srt.bam.idxstats """ rule picard_markdup: @@ -416,6 +407,34 @@ rule picard_collectmetrics: MAX_RECORDS_IN_RAM=1000000 """ +rule rustqc: + """ + RNA-seq QC with RustQC (containerized) + """ + input: + bam = "markdup/{sample}.mkdup.bam" + output: + qc_dir = directory("qc/{sample}") + container: "docker://ghcr.io/seqeralabs/rustqc:v0.2.1" + params: + gtf = config["annotation_gtf"], + paired_flag = '-p' if config['layout']=='paired' else '' + threads: 4 + resources: cpus="4", maxtime="8:00:00", mem_mb=40000 + message: "Running comprehensive QC for {wildcards.sample} with RustQC." + log: "logs/rustqc/{sample}.log" + shell: """ + + rustqc rna \ + {input.bam} \ + --gtf {params.gtf} \ + {params.paired_flag} \ + --skip-dup-check \ + -o {output.qc_dir} 2> {log} + + + """ + rule rsem: """ Isoform counting diff --git a/config.yaml b/config.yaml index f5b1790..7ae4e62 100644 --- a/config.yaml +++ b/config.yaml @@ -7,8 +7,12 @@ # this file must have the header sample_idfastq_1fastq_2. (The second tab and fastq_2 are optional.) sample_csv: "sample_fastq_list_paired.csv" +############################################# +# Toggleable parameters # +############################################# + # library layout -layout: "paired" +layout: "paired" # 'single or paired' # aligner choice aligner_name: "hisat" #'hisat' or 'star' @@ -16,10 +20,13 @@ aligner_name: "hisat" #'hisat' or 'star' # rsem choice run_rsem: "no" -# Run rrna filtering? +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + ############################################# #Strandedness settings - these should match # ############################################# @@ -27,15 +34,14 @@ featurecounts_strand: "2" # '1', '0' rsem_strandedness: "reverse" # 'foward', 'none' picard_strand: "SECOND_READ_TRANSCRIPTION_STRAND" # 'FIRST_READ_TRANSCRIPTION', 'NONE' - ############################################# -# miscellaneous settings #################### +# miscellaneous settings # ############################################# cutadapt_nextseq_flag: "" # '--nextseq-trim-20' ############################################# -# Input reference fasta and GTF annotation ## +# Input reference fasta and GTF annotation # ############################################# ###### 'sample/ref' contains a subset of hg38 that should be used only for pipeline testing. reference_fa: "sample_ref/hg38_chr567_100k.fa" @@ -43,9 +49,9 @@ annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" ############################################# -# Aligner index and picard references ####### -# create and specify these or ############### -# run snakemake build_ref rule to create #### +# Aligner index and picard references # +# create and specify these or # +# run snakemake build_ref rule to create # ############################################# aligner_index: '' picard_refflat: '' @@ -53,7 +59,7 @@ picard_rrna_list: '' rsem_ref_path: '' ############################################# -# Pipeline scripts ########################## +# Pipeline scripts # ############################################# picard_build_script: "scripts/picard_ref_builder.sh" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" diff --git a/job.script.sh b/job.script.sh index acbac46..75149b9 100644 --- a/job.script.sh +++ b/job.script.sh @@ -16,11 +16,13 @@ conda activate /dartfs/rc/nosnapshots/G/GMBSR_refs/envs/snakemake #----- Make slurm logs folder mkdir -p slurm_logs +#! If using 'RUN_RUSTQC: True', include the following line in the snakemake call +# --use-singularity \ +# --singularity-args "--bind /dartfs,/optnfs" \ + #----- Call Snakemake snakemake -s Snakefile \ --use-conda \ - --use-singularity \ - --singularity-args "--bind /dartfs,/optnfs" \ --conda-prefix /dartfs/rc/nosnapshots/G/GMBSR_refs/envs/DAC-RNAseq-pipeline \ --profile cluster_profile \ --rerun-incomplete \ diff --git a/prebuilt_configs/human_config_paired_hisat.yaml b/prebuilt_configs/human_config_paired_hisat.yaml index d55dd0f..c82de0c 100644 --- a/prebuilt_configs/human_config_paired_hisat.yaml +++ b/prebuilt_configs/human_config_paired_hisat.yaml @@ -1,16 +1,34 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "hisat" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + ############################################# # Software Paths for use with "--use-conda" # @@ -24,9 +42,9 @@ rsem_calc_exp_path: "rsem-calculate-expression" rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -38,20 +56,22 @@ rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" pca_plot_script: "scripts/pca_plotting.py" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "hisat" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/HISAT_index/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" diff --git a/prebuilt_configs/human_config_paired_star.yaml b/prebuilt_configs/human_config_paired_star.yaml index 20908b1..5ea95c2 100644 --- a/prebuilt_configs/human_config_paired_star.yaml +++ b/prebuilt_configs/human_config_paired_star.yaml @@ -1,16 +1,34 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + ############################################# # Software Paths for use with "--use-conda" # @@ -26,9 +44,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -39,25 +57,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/human_config_paired_star_rsem.yaml b/prebuilt_configs/human_config_paired_star_rsem.yaml index d070638..fc7f673 100644 --- a/prebuilt_configs/human_config_paired_star_rsem.yaml +++ b/prebuilt_configs/human_config_paired_star_rsem.yaml @@ -1,16 +1,34 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "yes" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + ############################################# # Software Paths for use with "--use-conda" # @@ -26,9 +44,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -39,26 +57,30 @@ java_path: "/usr/bin/java" #rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "yes" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_ref_path: "sample_ref/RSEM_ref/ref" rsem_strandedness: "reverse" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/human_config_single_hisat.yaml b/prebuilt_configs/human_config_single_hisat.yaml index aa7c6b7..0ca8bc2 100644 --- a/prebuilt_configs/human_config_single_hisat.yaml +++ b/prebuilt_configs/human_config_single_hisat.yaml @@ -1,16 +1,34 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_single.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "single" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "hisat" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + ############################################# # Software Paths for use with "--use-conda" # @@ -25,9 +43,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -38,25 +56,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "hisat" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/HISAT_index/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "1" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/human_config_single_star.yaml b/prebuilt_configs/human_config_single_star.yaml index 9f9f6c7..0ab8676 100644 --- a/prebuilt_configs/human_config_single_star.yaml +++ b/prebuilt_configs/human_config_single_star.yaml @@ -1,16 +1,34 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_single.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "single" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + ############################################# # Software Paths for use with "--use-conda" # @@ -25,9 +43,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -37,25 +55,29 @@ java_path: "/usr/bin/java" #featurecounts_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/featurecounts/subread-2.0.1-Linux-x86_64/bin/featureCounts" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "1" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/mouse_config_paired_hisat.yaml b/prebuilt_configs/mouse_config_paired_hisat.yaml index ec3b9a6..c460856 100644 --- a/prebuilt_configs/mouse_config_paired_hisat.yaml +++ b/prebuilt_configs/mouse_config_paired_hisat.yaml @@ -1,16 +1,35 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "hisat" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + + ############################################# # Software Paths for use with "--use-conda" # ############################################# @@ -24,9 +43,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -37,25 +56,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "hisat" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/HISAT_index/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/mouse_config_paired_star.yaml b/prebuilt_configs/mouse_config_paired_star.yaml index 772a54b..650dfdd 100644 --- a/prebuilt_configs/mouse_config_paired_star.yaml +++ b/prebuilt_configs/mouse_config_paired_star.yaml @@ -1,16 +1,35 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + + ############################################# # Software Paths for use with "--use-conda" # ############################################# @@ -25,9 +44,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -38,25 +57,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/mouse_config_paired_star_rsem.yaml b/prebuilt_configs/mouse_config_paired_star_rsem.yaml index 6214610..5bf0269 100644 --- a/prebuilt_configs/mouse_config_paired_star_rsem.yaml +++ b/prebuilt_configs/mouse_config_paired_star_rsem.yaml @@ -1,16 +1,35 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_paired.cdv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "paired" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "yes" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + + ############################################# # Software Paths for use with "--use-conda" # ############################################# @@ -25,32 +44,36 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" java_path: "/usr/bin/java" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "yes" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_ref_path: "sample_ref/RSEM_ref/ref" rsem_strandedness: "reverse" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/mouse_config_single_hisat.yaml b/prebuilt_configs/mouse_config_single_hisat.yaml index 870bf3c..5c06adb 100644 --- a/prebuilt_configs/mouse_config_single_hisat.yaml +++ b/prebuilt_configs/mouse_config_single_hisat.yaml @@ -1,16 +1,35 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_single.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "single" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "hisat" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + + ############################################# # Software Paths for use with "--use-conda" # ############################################# @@ -24,9 +43,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -37,25 +56,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "hisat" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/HISAT_index/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "1" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/prebuilt_configs/mouse_config_single_star.yaml b/prebuilt_configs/mouse_config_single_star.yaml index 0c62dac..4170745 100644 --- a/prebuilt_configs/mouse_config_single_star.yaml +++ b/prebuilt_configs/mouse_config_single_star.yaml @@ -1,16 +1,35 @@ -###~~~~~~ general settings ~~~~~~### +############################################# +# General settings # +############################################# # sample meta data file sample_csv: "sample_fastq_list_single.csv" -# sequencing config +############################################# +# Toggleable parameters # +############################################# + +# library layout layout: "single" -#cutadapt_nextseq_flag: "--nextseq-trim=20" -cutadapt_nextseq_flag: "" +# aligner choice +aligner_name: "star" #'hisat' or 'star' + +# rsem choice +run_rsem: "no" + +# Run Ribodetector for rRNA filtering remove_rRNA: true read_length: 51 +# Run RustQC (14 tool comprehensive QC in a single pass) +run_rustqc: false + +# sequencing config +#cutadapt_nextseq_flag: "--nextseq-trim=20" +cutadapt_nextseq_flag: "" + + ############################################# # Software Paths for use with "--use-conda" # ############################################# @@ -25,9 +44,9 @@ rsem_prep_ref_path: "rsem-prepare-reference" multiqc_path: "multiqc" pca_plot_script: "scripts/pca_plotting.py" -############################################# -# Custom Software Paths ##################### -############################################# +############################################# +# Custom Software Paths # +############################################# #cutadapt_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/cutadapt" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" fastqc_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/fastqc/FastQC/fastqc" @@ -38,25 +57,29 @@ java_path: "/usr/bin/java" rsem_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/owen/software/bin" #multiqc_path: "/dartfs-hpc/rc/home/d/d41294d/.conda/envs/rnaseq1/bin/multiqc" -###~~~~~~ alignment settings ~~~~~~### -aligner_name: "star" #'hisat' or 'star' +############################################# +# Alignment settings # +############################################# #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/STAR/STAR-2.7.1a/bin/Linux_x86_64/STAR" #aligner_index: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/STAR/hg38_index" #aligner_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/misc/sullivan/tools/hisat2/hisat2-2.1.0/hisat2" aligner_index: "sample_ref/hg38_chr567_100k" -###~~~~~~ quant settings ~~~~~~### +############################################# +# Quantification settings # +############################################# #annotation_gtf: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/ensembl-annotations/Homo_sapiens.GRCh38.97.gtf" annotation_gtf: "sample_ref/hg38_chr567_100k.chr.gtf" featurecounts_strand: "1" #1 for first read transcription strand, 2 for second. # featurecounts_rscript: "scripts/fc_to_rpkmtpm.R" featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" -run_rsem: "no" #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" -###~~~~~~ other ~~~~~~### +############################################# +# Other settings # +############################################# picard_rrna_list: "sample_ref/hg38_chr567_100k_rRNA.interval.list" #picard_rrna_list: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.rRNA.interval_list" #picard_refflat: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/CollectRnaSeqMetrics/Homo_sapiens.GRCh38.97.refFlat.txt" diff --git a/tests/test_config_paired_hisat.yaml b/tests/test_config_paired_hisat.yaml index 918c93b..a801ca0 100644 --- a/tests/test_config_paired_hisat.yaml +++ b/tests/test_config_paired_hisat.yaml @@ -49,6 +49,7 @@ featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" run_rsem: "no" +run_rustqc: false rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" diff --git a/tests/test_config_paired_star.yaml b/tests/test_config_paired_star.yaml index ecbd4d9..3322012 100644 --- a/tests/test_config_paired_star.yaml +++ b/tests/test_config_paired_star.yaml @@ -50,6 +50,7 @@ featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" run_rsem: "no" +run_rustqc: false #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" diff --git a/tests/test_config_paired_star_rsem.yaml b/tests/test_config_paired_star_rsem.yaml index ad2dc94..af8e4a1 100644 --- a/tests/test_config_paired_star_rsem.yaml +++ b/tests/test_config_paired_star_rsem.yaml @@ -50,6 +50,7 @@ featurecounts_strand: "2" #1 for first read transcription strand, 2 for second. featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" run_rsem: "yes" +run_rustqc: false #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_ref_path: "sample_ref/RSEM_ref/ref" rsem_strandedness: "unstranded" diff --git a/tests/test_config_single_hisat.yaml b/tests/test_config_single_hisat.yaml index 7f966a0..00a833b 100644 --- a/tests/test_config_single_hisat.yaml +++ b/tests/test_config_single_hisat.yaml @@ -61,3 +61,4 @@ picard_refflat: "sample_ref/hg38_chr567_refFlat_1M.txt" picard_strand: "FIRST_READ_TRANSCRIPTION_STRAND" reference_fa: "sample_ref/hg38_chr567_100k.fa" picard_build_script: "scripts/picard_ref_builder.sh" +run_rustqc: false diff --git a/tests/test_config_single_star.yaml b/tests/test_config_single_star.yaml index 48e15d3..c614272 100644 --- a/tests/test_config_single_star.yaml +++ b/tests/test_config_single_star.yaml @@ -50,6 +50,7 @@ featurecounts_strand: "1" #1 for first read transcription strand, 2 for second. featurecounts_rscript: "scripts/readcnt_to_rpkmtpm.py" featurecounts_annscript: "scripts/add_gene_to_ensg.py" run_rsem: "no" +run_rustqc: false #rsem_ref_path: "/dartfs-hpc/rc/lab/G/GMBSR_bioinfo/genomic_references/human/RSEM/txdir/RSEMref" rsem_strandedness: "unstranded" From a31ff489a0c851e0852ea00d7aa3861f8a3aef5e Mon Sep 17 00:00:00 2001 From: Beatriz Bergamo Date: Tue, 21 Apr 2026 16:20:42 -0400 Subject: [PATCH 09/11] Apply RiboDetector only for paired-end samples --- Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index aab731e..b6deb23 100644 --- a/Snakefile +++ b/Snakefile @@ -15,9 +15,9 @@ sample_list = list(samples_df['sample_id']) REMOVE_rRNA = config.get("remove_rRNA", False) RUN_RUSTQC = config.get("run_rustqc", False) -if REMOVE_rRNA: +if REMOVE_rRNA and config["layout"]=="paired": R1_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.1.fq.gz" - R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else None + R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" else: R1_FASTQ_INPUT = "trimming/{sample}.R1.trim.fastq.gz" R2_FASTQ_INPUT = "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else None From df89afd31c518dff810ac5d4f4dcbb0d95593e5c Mon Sep 17 00:00:00 2001 From: Beatriz Bergamo Date: Tue, 21 Apr 2026 16:27:43 -0400 Subject: [PATCH 10/11] Removing -e parameter from else ribodetector (for single end) --- Snakefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index b6deb23..19b0d61 100644 --- a/Snakefile +++ b/Snakefile @@ -15,9 +15,9 @@ sample_list = list(samples_df['sample_id']) REMOVE_rRNA = config.get("remove_rRNA", False) RUN_RUSTQC = config.get("run_rustqc", False) -if REMOVE_rRNA and config["layout"]=="paired": +if REMOVE_rRNA: R1_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.1.fq.gz" - R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" + R2_FASTQ_INPUT = "ribodetector/{sample}/{sample}.nonrrna.2.fq.gz" if config["layout"]=="paired" else None else: R1_FASTQ_INPUT = "trimming/{sample}.R1.trim.fastq.gz" R2_FASTQ_INPUT = "trimming/{sample}.R2.trim.fastq.gz" if config["layout"]=="paired" else None @@ -177,7 +177,6 @@ rule ribodetector: ribodetector_cpu -t {resources.cpus} \ -l {params.read_length} \ -i {input.trimmed_read_1} \ - -e norrna \ -o {output.filtered_read_1} \ -r {output.rrna_reads_1} \ > {log.stdout} 2> {log.stderr} From 7af85df98953eef5a74885d07c1f88ab6ed56edf Mon Sep 17 00:00:00 2001 From: Mike Martinez Date: Tue, 21 Apr 2026 16:34:21 -0400 Subject: [PATCH 11/11] Remove script parameter from ribodetector rule Removed script parameter from ribodetector rule. --- Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Snakefile b/Snakefile index 19b0d61..f074109 100644 --- a/Snakefile +++ b/Snakefile @@ -192,7 +192,6 @@ rule ribodetector_mqc_summary: output: "ribodetector/rrna_norrna_pct_mqc.tsv" params: - script = "scripts/ribodetector_mqc.sh", ribo_dir = "ribodetector" resources: cpus="1", maxtime="10:00", mem_mb=2000 message: "Generating ribodetector report."