From 8a34b39407e7be7bd8362facd92a5c3629aff64c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 21 May 2026 23:41:10 +0100 Subject: [PATCH 1/9] Add dotseq/dotseq module DOTSeq is a Bioconductor package for detecting differential ORF usage (DOU) and ORF-level differential translation efficiency (DTE) from Ribo-seq with matched RNA-seq. Module wraps DOTSeqDataSetsFromFeatureCounts + DOTSeq() + getContrasts() and emits per-ORF TSVs for the DOU and DTE interaction contrasts plus the serialised DOTSeqDataSets object. Pre-requisites (in flight): - Bioconda recipe: https://github.com/bioconda/bioconda-recipes/pull/65677 - Test data: https://github.com/nf-core/test-datasets/pull/2072 --- modules/nf-core/dotseq/dotseq/environment.yml | 7 + modules/nf-core/dotseq/dotseq/main.nf | 28 ++ modules/nf-core/dotseq/dotseq/meta.yml | 159 +++++++++++ .../nf-core/dotseq/dotseq/templates/dotseq.R | 255 ++++++++++++++++++ .../nf-core/dotseq/dotseq/tests/main.nf.test | 51 ++++ .../dotseq/dotseq/tests/main.nf.test.snap | 22 ++ .../dotseq/dotseq/tests/nextflow.config | 5 + 7 files changed, 527 insertions(+) create mode 100644 modules/nf-core/dotseq/dotseq/environment.yml create mode 100644 modules/nf-core/dotseq/dotseq/main.nf create mode 100644 modules/nf-core/dotseq/dotseq/meta.yml create mode 100644 modules/nf-core/dotseq/dotseq/templates/dotseq.R create mode 100644 modules/nf-core/dotseq/dotseq/tests/main.nf.test create mode 100644 modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap create mode 100644 modules/nf-core/dotseq/dotseq/tests/nextflow.config diff --git a/modules/nf-core/dotseq/dotseq/environment.yml b/modules/nf-core/dotseq/dotseq/environment.yml new file mode 100644 index 000000000000..8585dd9e0a5c --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
bioconda::bioconductor-dotseq=1.0.0 diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf new file mode 100644 index 000000000000..c4910aa1c135 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -0,0 +1,28 @@ +process DOTSEQ_DOTSEQ { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-dotseq:1.0.0--r45hdfd78af_0' : + 'quay.io/biocontainers/bioconductor-dotseq:1.0.0--r45hdfd78af_0' }" + + input: + tuple val(meta), val(contrast_variable), val(reference), val(target) + tuple val(meta2), path(samplesheet), path(counts), path(flattened_gtf), path(flattened_bed) + + output: + tuple val(meta), path("*.dou.interaction.dotseq.results.tsv"), emit: dou_interaction + tuple val(meta), path("*.dte.interaction.dotseq.results.tsv"), emit: dte_interaction + tuple val(meta), path("*.dou.strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true + tuple val(meta), path("*.dte.strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true + tuple val(meta), path("*.DOTSeqDataSets.rds") , emit: rdata + tuple val(meta), path("*.R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'dotseq.R' +} diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml new file mode 100644 index 000000000000..c72a4c8f837c --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -0,0 +1,159 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "dotseq_dotseq" +description: | + Genome-wide detection of differential ORF usage (DOU) and ORF-level + differential translation efficiency (DTE) from Ribo-seq with matched + RNA-seq 
using DOTSeq. +keywords: + - riboseq + - rnaseq + - translation + - differential + - orf +tools: + - "dotseq": + description: "DOTSeq: Differential ORF Translation analysis for Ribo-seq with matched RNA-seq" + homepage: "https://bioconductor.org/packages/release/bioc/html/DOTSeq.html" + documentation: "https://bioconductor.org/packages/release/bioc/vignettes/DOTSeq/inst/doc/DOTSeq.html" + tool_dev_url: "https://github.com/compgenom/DOTSeq" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - contrast_variable: + type: string + description: | + The column in the sample sheet that holds the experimental condition + (will be mapped to the DOTSeq 'condition' column internally). + - reference: + type: string + description: | + The value within the contrast_variable column to use as the reference (baseline) condition. + - target: + type: string + description: | + The value within the contrast_variable column to use as the target (non-reference) condition. + - - meta2: + type: map + description: | + Groovy map containing study-wide metadata related to the sample sheet and count matrix + - samplesheet: + type: file + description: | + CSV or TSV sample sheet. Must contain columns mapping to DOTSeq's + required `run`, `strategy`, `replicate`, and `condition` (defaults + match those names; can be overridden via task.ext.args). + ontologies: + - edam: "http://edamontology.org/format_3752" # CSV + - edam: "http://edamontology.org/format_3475" # TSV + - counts: + type: file + description: | + ORF-level featureCounts output. Header must contain the columns + `Geneid`, `Chr`, `Start`, `End`, `Strand`, `Length` followed by + per-sample count columns matching the `run` values in the sample sheet. 
+ ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + - flattened_gtf: + type: file + description: | + Flattened ORF annotation in GTF format, with `gene_id` and + `exon_number` attributes and `type == "exon"` features. + ontologies: + - edam: "http://edamontology.org/format_2306" # GTF + - flattened_bed: + type: file + description: | + Flattened ORF annotation in BED format matching the GTF. + ontologies: + - edam: "http://edamontology.org/format_3003" # BED + +output: + dou_interaction: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou.interaction.dotseq.results.tsv": + type: file + description: | + DOU interaction contrasts: per-ORF beta-binomial GLM results + (effect size, lfsr, padj, contrast). + pattern: ".dou.interaction.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + dte_interaction: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dte.interaction.dotseq.results.tsv": + type: file + description: | + DTE interaction contrasts: per-ORF DESeq2 + ashr shrinkage results + for differential translation efficiency. + pattern: ".dte.interaction.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + dou_strategy: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou.strategy.dotseq.results.tsv": + type: file + description: | + DOU strategy contrasts (Ribo vs RNA effect within each condition), + written when available. + pattern: ".dou.strategy.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + dte_strategy: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. 
[ id:'treatment_vs_control' ] + - "*.dte.strategy.dotseq.results.tsv": + type: file + description: | + DTE strategy contrasts (Ribo vs RNA effect within each condition), + written when available. + pattern: ".dte.strategy.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + rdata: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.DOTSeqDataSets.rds": + type: file + description: Serialised DOTSeqDataSets object containing DOU and DTE results + pattern: ".DOTSeqDataSets.rds" + ontologies: [] + session_info: + - - meta: + type: map + description: | + Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.R_sessionInfo.log": + type: file + description: dump of R sessionInfo() + pattern: "*.log" + ontologies: + - edam: "http://edamontology.org/data_1678" # Log + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R new file mode 100644 index 000000000000..e757637299a3 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -0,0 +1,255 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Check for Non-Empty, Non-Whitespace String +is_valid_string <- function(input) { + !is.null(input) && nzchar(trimws(input)) +} + +#' Parse long-form options like --opt1 val1 --opt2 val2 +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, 
what='character', quiet = TRUE)) + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV / TSV / featureCounts-style tables +read_delim_flexible <- function(file, header = TRUE, sep = NULL, comment.char = "", check.names = TRUE){ + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + if (ext == "gz") { + # peek at the inner extension + inner <- tolower(tail(strsplit(sub("\\\\.gz\$", "", basename(file)), split = "\\\\.")[[1]], 1)) + } else { + inner <- ext + } + if (is.null(sep)) { + sep <- if (inner == "csv") "," else "\t" + } + read.table( + file, + sep = sep, + header = header, + comment.char = comment.char, + stringsAsFactors = FALSE, + check.names = check.names + ) +} + +################################################ +################################################ +## Parse parameters from Nextflow ## +################################################ +################################################ + +opt <- list( + output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + count_file = '$counts', + sample_file = '$samplesheet', + flattened_gtf = '$flattened_gtf', + flattened_bed = '$flattened_bed', + contrast_variable = '$contrast_variable', + reference_level = '$reference', + target_level = '$target', + sample_id_col = "run", + strategy_col = "strategy", + replicate_col = "replicate", + sample_name_regex = NULL, + modules = "DOU,DTE", + min_count = as.integer(1), + stringent = "TRUE", + dispersion_modeling = "auto", + nullweight = as.numeric(500), + contrasts_method = "revpairwise", + cores = as.integer('$task.cpus') +) +opt_types <- lapply(opt, class) + +args_opt <- parse_args('$task.ext.args') +for (ao in names(args_opt)) { + if (!ao %in% names(opt)) stop(paste("Invalid option:", ao)) + if (!is.null(opt[[ao]])) { + 
args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + opt[[ao]] <- args_opt[[ao]] +} + +required_opts <- c("contrast_variable", "reference_level", "target_level", "output_prefix") +missing <- required_opts[!unlist(lapply(opt[required_opts], is_valid_string)) | !required_opts %in% names(opt)] +if (length(missing) > 0) { + stop(paste("Missing required options:", paste(missing, collapse = ", "))) +} + +for (file_input in c("count_file", "sample_file", "flattened_gtf", "flattened_bed")){ + if (!is_valid_string(opt[[file_input]])) stop(paste("Please provide", file_input)) + if (!file.exists(opt[[file_input]])) stop(paste0("Value of ", file_input, ": ", opt[[file_input]], " is not a valid file")) +} + +modules <- trimws(strsplit(opt\$modules, ",")[[1]]) +stringent_val <- switch(toupper(opt\$stringent), + "TRUE" = TRUE, + "FALSE" = FALSE, + "NULL" = NULL, + stop("`stringent` must be one of TRUE, FALSE, NULL") +) + +################################################ +################################################ +## Load libraries ## +################################################ +################################################ + +suppressPackageStartupMessages({ + library(DOTSeq) + library(SummarizedExperiment) +}) + +################################################ +################################################ +## Read inputs ## +################################################ +################################################ + +cnt <- read_delim_flexible(opt\$count_file, header = TRUE, comment.char = "#") + +# Allow optional renaming of long count-table column names via regex +if (!is.null(opt\$sample_name_regex) && nzchar(opt\$sample_name_regex)) { + names(cnt) <- gsub(opt\$sample_name_regex, "\\\\1", names(cnt)) +} + +cond <- read_delim_flexible(opt\$sample_file, header = TRUE) + +# Normalise condition column names and rename user-chosen columns into the +# names that DOTSeq expects ("run", "strategy", "replicate", "condition") +names(cond) <- 
tolower(trimws(names(cond))) +user_to_required <- list( + run = tolower(opt\$sample_id_col), + strategy = tolower(opt\$strategy_col), + replicate = tolower(opt\$replicate_col), + condition = tolower(opt\$contrast_variable) +) + +for (req in names(user_to_required)) { + src <- user_to_required[[req]] + if (!src %in% names(cond)) { + stop(paste0("Sample sheet column '", src, "' (mapped to '", req, "') not found. Have: ", + paste(names(cond), collapse = ", "))) + } + if (src != req) { + # Drop any pre-existing column with the required name to avoid collision + cond[[req]] <- cond[[src]] + if (src %in% names(cond) && src != req) cond[[src]] <- NULL + } +} + +# Filter samplesheet to only the levels involved in the contrast +cond <- cond[cond\$condition %in% c(opt\$reference_level, opt\$target_level), , drop = FALSE] +if (nrow(cond) == 0) { + stop("No samples remain after filtering condition column to reference/target levels.") +} + +# Set baseline as the reference level so DTE coefficient is target_vs_reference +cond\$condition <- factor(cond\$condition, levels = c(opt\$reference_level, opt\$target_level)) + +################################################ +################################################ +## Build DOTSeqDataSets ## +################################################ +################################################ + +d <- DOTSeqDataSetsFromFeatureCounts( + count_table = cnt, + condition_table = cond, + flattened_gtf = opt\$flattened_gtf, + flattened_bed = opt\$flattened_bed, + min_count = opt\$min_count, + stringent = stringent_val, + baseline = opt\$reference_level, + verbose = FALSE +) + +################################################ +################################################ +## Run DOTSeq ## +################################################ +################################################ + +d <- DOTSeq( + datasets = d, + modules = modules, + target = opt\$target_level, + baseline = opt\$reference_level, + min_count = opt\$min_count, + 
stringent = stringent_val, + dispersion_modeling = opt\$dispersion_modeling, + nullweight = opt\$nullweight, + contrasts_method = opt\$contrasts_method, + parallel = list(n = opt\$cores, autopar = TRUE), + verbose = FALSE +) + +################################################ +################################################ +## Extract results ## +################################################ +################################################ + +write_results <- function(df, suffix) { + if (is.null(df) || (is.data.frame(df) && nrow(df) == 0)) return(invisible(NULL)) + out_df <- as.data.frame(df) + write.table( + out_df, + file = paste(opt\$output_prefix, suffix, sep = "."), + col.names = TRUE, + row.names = FALSE, + sep = "\t", + quote = FALSE + ) +} + +interaction_results <- tryCatch(getContrasts(d, type = "interaction"), error = function(e) NULL) +if (!is.null(interaction_results)) { + write_results(interaction_results\$DOU, "dou.interaction.dotseq.results.tsv") + write_results(interaction_results\$DTE, "dte.interaction.dotseq.results.tsv") +} + +strategy_results <- tryCatch(getContrasts(d, type = "strategy"), error = function(e) NULL) +if (!is.null(strategy_results)) { + write_results(strategy_results\$DOU, "dou.strategy.dotseq.results.tsv") + write_results(strategy_results\$DTE, "dte.strategy.dotseq.results.tsv") +} + +# Serialise the full DOTSeqDataSets object for downstream use +saveRDS(d, file = paste(opt\$output_prefix, "DOTSeqDataSets.rds", sep = ".")) + +################################################ +################################################ +## R session info ## +################################################ +################################################ + +sink(paste(opt\$output_prefix, "R_sessionInfo.log", sep = ".")) +print(sessionInfo()) +sink() + +################################################ +################################################ +## Versions ## +################################################ 
+################################################ + +dotseq.version <- as.character(packageVersion("DOTSeq")) + +writeLines( + c( + '"${task.process}":', + paste(" bioconductor-dotseq:", dotseq.version) + ), + "versions.yml" +) diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test new file mode 100644 index 000000000000..779faeb149b9 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -0,0 +1,51 @@ +nextflow_process { + + name "Test Process DOTSEQ_DOTSEQ" + script "../main.nf" + process "DOTSEQ_DOTSEQ" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "dotseq" + tag "dotseq/dotseq" + + test("human - featurecounts") { + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/featureCounts.cell_cycle_subset.txt.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.gtf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.bed.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.session_info[0][1]).getText().contains('DOTSeq') }, + { assert path(process.out.dou_interaction[0][1]).exists() }, + { assert path(process.out.dte_interaction[0][1]).exists() }, + { assert snapshot( + file(process.out.dou_interaction[0][1]).name, + file(process.out.dte_interaction[0][1]).name, + file(process.out.rdata[0][1]).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() 
} + ) + } + } +} diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap new file mode 100644 index 000000000000..e2f63e690070 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "human - featurecounts": { + "content": [ + "cycling_vs_interphase.dou.interaction.dotseq.results.tsv", + "cycling_vs_interphase.dte.interaction.dotseq.results.tsv", + "cycling_vs_interphase.DOTSeqDataSets.rds", + [ + "versions.yml:md5,8d12f412f693c1ad9f62e6c2a625ed4f" + ], + { + "DOTSEQ_DOTSEQ": { + "bioconductor-dotseq": "1.0.0" + } + } + ], + "timestamp": "2026-05-21T22:32:27.567791436", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/dotseq/dotseq/tests/nextflow.config b/modules/nf-core/dotseq/dotseq/tests/nextflow.config new file mode 100644 index 000000000000..868dd7f0789d --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'DOTSEQ_DOTSEQ' { + ext.args = '--sample_name_regex .*(SRR[0-9]+).*' + } +} From 852b56545de6b8eb0bc834a050037e04f30d2b64 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 08:36:18 +0100 Subject: [PATCH 2/9] Use Wave community container for bioconductor-dotseq 1.0.0 Bioconda recipe (bioconda/bioconda-recipes#65677) merged; biocontainer image is not yet built so swap the placeholder quay.io/depot URLs for a Wave community container built from the now-merged bioconda package. Also widen the singularity guard to include 'apptainer' and add the versions topic block in meta.yml (via nf-core modules lint --fix). 
--- modules/nf-core/dotseq/dotseq/main.nf | 6 ++-- modules/nf-core/dotseq/dotseq/meta.yml | 39 +++++++++++++++----------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf index c4910aa1c135..afc214b18d9e 100644 --- a/modules/nf-core/dotseq/dotseq/main.nf +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -3,9 +3,9 @@ process DOTSEQ_DOTSEQ { label 'process_medium' conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-dotseq:1.0.0--r45hdfd78af_0' : - 'quay.io/biocontainers/bioconductor-dotseq:1.0.0--r45hdfd78af_0' }" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/9c/9ca895c805758ea7068b919008213c64337748731035237d63b3e1139fae8cfc/data' : + 'community.wave.seqera.io/library/bioconductor-dotseq:1.0.0--4ebea46321bb93bb' }" input: tuple val(meta), val(contrast_variable), val(reference), val(target) diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml index c72a4c8f837c..d30a4e958280 100644 --- a/modules/nf-core/dotseq/dotseq/meta.yml +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "dotseq_dotseq" description: | Genome-wide detection of differential ORF usage (DOU) and ORF-level @@ -12,13 +11,14 @@ keywords: - orf tools: - "dotseq": - description: "DOTSeq: Differential ORF Translation analysis for Ribo-seq with matched RNA-seq" + description: "DOTSeq: Differential ORF Translation analysis for Ribo-seq with + matched RNA-seq" homepage: "https://bioconductor.org/packages/release/bioc/html/DOTSeq.html" documentation: 
"https://bioconductor.org/packages/release/bioc/vignettes/DOTSeq/inst/doc/DOTSeq.html" tool_dev_url: "https://github.com/compgenom/DOTSeq" - licence: ["MIT"] + licence: + - "MIT" identifier: "" - input: - - meta: type: map @@ -48,8 +48,8 @@ input: required `run`, `strategy`, `replicate`, and `condition` (defaults match those names; can be overridden via task.ext.args). ontologies: - - edam: "http://edamontology.org/format_3752" # CSV - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3752" + - edam: "http://edamontology.org/format_3475" - counts: type: file description: | @@ -57,21 +57,20 @@ input: `Geneid`, `Chr`, `Start`, `End`, `Strand`, `Length` followed by per-sample count columns matching the `run` values in the sample sheet. ontologies: - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3475" - flattened_gtf: type: file description: | Flattened ORF annotation in GTF format, with `gene_id` and `exon_number` attributes and `type == "exon"` features. ontologies: - - edam: "http://edamontology.org/format_2306" # GTF + - edam: "http://edamontology.org/format_2306" - flattened_bed: type: file description: | Flattened ORF annotation in BED format matching the GTF. ontologies: - - edam: "http://edamontology.org/format_3003" # BED - + - edam: "http://edamontology.org/format_3003" output: dou_interaction: - - meta: @@ -85,7 +84,7 @@ output: (effect size, lfsr, padj, contrast). pattern: ".dou.interaction.dotseq.results.tsv" ontologies: - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3475" dte_interaction: - - meta: type: map @@ -98,7 +97,7 @@ output: for differential translation efficiency. pattern: ".dte.interaction.dotseq.results.tsv" ontologies: - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3475" dou_strategy: - - meta: type: map @@ -111,7 +110,7 @@ output: written when available. 
pattern: ".dou.strategy.dotseq.results.tsv" ontologies: - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3475" dte_strategy: - - meta: type: map @@ -124,7 +123,7 @@ output: written when available. pattern: ".dte.strategy.dotseq.results.tsv" ontologies: - - edam: "http://edamontology.org/format_3475" # TSV + - edam: "http://edamontology.org/format_3475" rdata: - - meta: type: map @@ -132,7 +131,8 @@ output: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - "*.DOTSeqDataSets.rds": type: file - description: Serialised DOTSeqDataSets object containing DOU and DTE results + description: Serialised DOTSeqDataSets object containing DOU and DTE + results pattern: ".DOTSeqDataSets.rds" ontologies: [] session_info: @@ -145,14 +145,19 @@ output: description: dump of R sessionInfo() pattern: "*.log" ontologies: - - edam: "http://edamontology.org/data_1678" # Log + - edam: "http://edamontology.org/data_1678" versions: - versions.yml: type: file description: File containing software versions pattern: "versions.yml" ontologies: - - edam: "http://edamontology.org/format_3750" # YAML + - edam: "http://edamontology.org/format_3750" +topics: + versions: + - versions.yml: + type: string + description: The name of the process authors: - "@pinin4fjords" maintainers: From e5aca7c69ea62f6f898fd0b554d6b6d87b91d580 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 09:55:43 +0100 Subject: [PATCH 3/9] Add native plotDOT outputs, simplify template, tidyverse syntax - Restructure the R template around optparse + readr + dplyr + purrr + ggplot2; drop the homemade parse_args / read_delim_flexible helpers in favour of the standard package idioms and native pipe. 
- Output set is now what DOTSeq itself emits natively: per-ORF DTE contrasts (translation.dotseq.results.tsv), DOU contrasts (dou.dotseq.results.tsv), optional dou_strategy / dte_strategy per-condition Ribo-vs-RNA contrasts, plus the four plotDOT() PNGs (volcano / composite / venn / heatmap) and a DTE p-value distribution histogram drawn directly from DOTSeq's padj column. - Container picks up r-eulerr + r-ggsignif (required for plotDOT venn) and explicit r-ggplot2 so the histogram has a stable ggplot version. - plotDOT() default of force_new_device=TRUE was killing our png() device on each call; pass FALSE so the PNGs land where Nextflow expects them. --- modules/nf-core/dotseq/dotseq/environment.yml | 10 + modules/nf-core/dotseq/dotseq/main.nf | 23 +- modules/nf-core/dotseq/dotseq/meta.yml | 159 ++++--- .../nf-core/dotseq/dotseq/templates/dotseq.R | 450 ++++++++++-------- .../nf-core/dotseq/dotseq/tests/main.nf.test | 10 +- .../dotseq/dotseq/tests/main.nf.test.snap | 14 +- 6 files changed, 389 insertions(+), 277 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/environment.yml b/modules/nf-core/dotseq/dotseq/environment.yml index 8585dd9e0a5c..620960ff719f 100644 --- a/modules/nf-core/dotseq/dotseq/environment.yml +++ b/modules/nf-core/dotseq/dotseq/environment.yml @@ -5,3 +5,13 @@ channels: - bioconda dependencies: - bioconda::bioconductor-dotseq=1.0.0 + - conda-forge::r-dplyr=1.2.1 + - conda-forge::r-eulerr=7.1.0 + - conda-forge::r-ggplot2=4.0.3 + - conda-forge::r-ggrepel=0.9.8 + - conda-forge::r-ggsignif=0.6.4 + - conda-forge::r-optparse=1.8.2 + - conda-forge::r-purrr=1.2.2 + - conda-forge::r-readr=2.2.0 + - conda-forge::r-tibble=3.3.1 + - conda-forge::r-tidyr=1.3.2 diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf index afc214b18d9e..1b9fee44f144 100644 --- a/modules/nf-core/dotseq/dotseq/main.nf +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -4,18 +4,29 @@ process DOTSEQ_DOTSEQ { conda 
"${moduleDir}/environment.yml" container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? - 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/9c/9ca895c805758ea7068b919008213c64337748731035237d63b3e1139fae8cfc/data' : - 'community.wave.seqera.io/library/bioconductor-dotseq:1.0.0--4ebea46321bb93bb' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/12/12667d472e9ae0f1602041dc018ba6bde294e6190e67999d71b65e7a2df7ea1f/data' : + 'community.wave.seqera.io/library/bioconductor-dotseq_r-dplyr_r-eulerr_r-ggplot2_pruned:6c8a9ebdec36c958' }" input: tuple val(meta), val(contrast_variable), val(reference), val(target) tuple val(meta2), path(samplesheet), path(counts), path(flattened_gtf), path(flattened_bed) output: - tuple val(meta), path("*.dou.interaction.dotseq.results.tsv"), emit: dou_interaction - tuple val(meta), path("*.dte.interaction.dotseq.results.tsv"), emit: dte_interaction - tuple val(meta), path("*.dou.strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true - tuple val(meta), path("*.dte.strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true + // Per-ORF differential translation efficiency (DTE interaction term) + tuple val(meta), path("*.translation.dotseq.results.tsv") , emit: translation + // Per-ORF differential ORF usage + tuple val(meta), path("*.dou.dotseq.results.tsv") , emit: dou + // Per-condition Ribo-vs-RNA strategy contrasts, when DOTSeq emits them + tuple val(meta), path("*.dou_strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true + tuple val(meta), path("*.dte_strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true + // plotDOT() outputs + tuple val(meta), path("*.volcano.png") , emit: volcano_plot , optional: true + tuple val(meta), path("*.composite.png") , emit: composite_plot, optional: true + tuple val(meta), path("*.venn.png") , emit: venn_plot , optional: true + tuple val(meta), 
path("*.heatmap.png") , emit: heatmap_plot , optional: true + // Histogram of DTE adjusted p-values + tuple val(meta), path("*.interaction_p_distribution.png") , emit: interaction_p_distribution_plot, optional: true + // Serialised dataset + session info + versions tuple val(meta), path("*.DOTSeqDataSets.rds") , emit: rdata tuple val(meta), path("*.R_sessionInfo.log") , emit: session_info path "versions.yml" , emit: versions, topic: versions diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml index d30a4e958280..e23d1bd3893c 100644 --- a/modules/nf-core/dotseq/dotseq/meta.yml +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -1,8 +1,11 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "dotseq_dotseq" description: | - Genome-wide detection of differential ORF usage (DOU) and ORF-level - differential translation efficiency (DTE) from Ribo-seq with matched - RNA-seq using DOTSeq. + Detect differential ORF usage (DOU) and ORF-level differential + translation efficiency (DTE) from Ribo-seq with matched RNA-seq using + DOTSeq. Wraps DOTSeqDataSetsFromFeatureCounts() + DOTSeq() + + getContrasts() and emits the package's native contrast tables plus + plotDOT() visualisations. keywords: - riboseq - rnaseq @@ -11,135 +14,171 @@ keywords: - orf tools: - "dotseq": - description: "DOTSeq: Differential ORF Translation analysis for Ribo-seq with - matched RNA-seq" + description: "Differential ORF Translation analysis for Ribo-seq with matched RNA-seq" homepage: "https://bioconductor.org/packages/release/bioc/html/DOTSeq.html" documentation: "https://bioconductor.org/packages/release/bioc/vignettes/DOTSeq/inst/doc/DOTSeq.html" tool_dev_url: "https://github.com/compgenom/DOTSeq" licence: - "MIT" identifier: "" + input: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. 
[ id:'treatment_vs_control' ] + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - contrast_variable: type: string - description: | - The column in the sample sheet that holds the experimental condition - (will be mapped to the DOTSeq 'condition' column internally). + description: Sample-sheet column that holds the experimental condition (mapped to DOTSeq's `condition` internally). - reference: type: string - description: | - The value within the contrast_variable column to use as the reference (baseline) condition. + description: Value of the contrast_variable to use as reference (baseline). - target: type: string - description: | - The value within the contrast_variable column to use as the target (non-reference) condition. + description: Value of the contrast_variable to use as target (non-reference). - - meta2: type: map - description: | - Groovy map containing study-wide metadata related to the sample sheet and count matrix + description: Groovy map containing study-wide metadata - samplesheet: type: file description: | - CSV or TSV sample sheet. Must contain columns mapping to DOTSeq's - required `run`, `strategy`, `replicate`, and `condition` (defaults - match those names; can be overridden via task.ext.args). + CSV or TSV sample sheet with `run`, `strategy`, `replicate`, and + `condition` columns (defaults; can be overridden via + task.ext.args). Both Ribo-seq and RNA-seq samples are required: + DOTSeq's design is `~ condition * strategy` and the interaction + term is unestimable without both strategies. ontologies: - edam: "http://edamontology.org/format_3752" - edam: "http://edamontology.org/format_3475" - counts: type: file description: | - ORF-level featureCounts output. Header must contain the columns - `Geneid`, `Chr`, `Start`, `End`, `Strand`, `Length` followed by - per-sample count columns matching the `run` values in the sample sheet. 
+ ORF-level featureCounts output - header must include + `Geneid, Chr, Start, End, Strand, Length` followed by per-sample + count columns matching the `run` values in the sample sheet. ontologies: - edam: "http://edamontology.org/format_3475" - flattened_gtf: type: file description: | - Flattened ORF annotation in GTF format, with `gene_id` and - `exon_number` attributes and `type == "exon"` features. + Flattened ORF annotation in GTF format (gene_id + exon_number, + type=="exon"). ontologies: - edam: "http://edamontology.org/format_2306" - flattened_bed: type: file - description: | - Flattened ORF annotation in BED format matching the GTF. + description: Flattened ORF annotation in BED format matching the GTF. ontologies: - edam: "http://edamontology.org/format_3003" + output: - dou_interaction: + translation: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - - "*.dou.interaction.dotseq.results.tsv": + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.translation.dotseq.results.tsv": type: file description: | - DOU interaction contrasts: per-ORF beta-binomial GLM results - (effect size, lfsr, padj, contrast). - pattern: ".dou.interaction.dotseq.results.tsv" + Per-ORF differential translation efficiency: DOTSeq's DTE + interaction-term results (DESeq2 + ashr shrinkage). + pattern: ".translation.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" - dte_interaction: + dou: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - - "*.dte.interaction.dotseq.results.tsv": + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou.dotseq.results.tsv": type: file description: | - DTE interaction contrasts: per-ORF DESeq2 + ashr shrinkage results - for differential translation efficiency. 
- pattern: ".dte.interaction.dotseq.results.tsv" + DOTSeq Differential ORF Usage results (beta-binomial GLM + modelling Ribo / RNA proportion changes within each gene, + shrunk with ashr). DOTSeq-unique. + pattern: ".dou.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" dou_strategy: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - - "*.dou.strategy.dotseq.results.tsv": + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou_strategy.dotseq.results.tsv": type: file - description: | - DOU strategy contrasts (Ribo vs RNA effect within each condition), - written when available. - pattern: ".dou.strategy.dotseq.results.tsv" + description: DOU strategy contrasts (Ribo vs RNA effect per condition), when present. + pattern: ".dou_strategy.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" dte_strategy: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - - "*.dte.strategy.dotseq.results.tsv": + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dte_strategy.dotseq.results.tsv": type: file - description: | - DTE strategy contrasts (Ribo vs RNA effect within each condition), - written when available. - pattern: ".dte.strategy.dotseq.results.tsv" + description: DTE strategy contrasts (Ribo vs RNA effect per condition), when present. + pattern: ".dte_strategy.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" + volcano_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.volcano.png": + type: file + description: DOTSeq plotDOT() volcano (DOU + DTE significance). 
+ pattern: ".volcano.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + composite_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.composite.png": + type: file + description: DOTSeq plotDOT() composite scatter (DOU vs DTE effect sizes). + pattern: ".composite.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + venn_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.venn.png": + type: file + description: DOTSeq plotDOT() Venn diagram of DOU vs DTE significant ORFs. + pattern: ".venn.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + heatmap_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.heatmap.png": + type: file + description: DOTSeq plotDOT() heatmap of DOU across top genes. + pattern: ".heatmap.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + interaction_p_distribution_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.interaction_p_distribution.png": + type: file + description: Histogram of DOTSeq's DTE adjusted p-values. + pattern: ".interaction_p_distribution.png" + ontologies: + - edam: "http://edamontology.org/format_3603" rdata: - - meta: type: map - description: | - Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - "*.DOTSeqDataSets.rds": type: file - description: Serialised DOTSeqDataSets object containing DOU and DTE - results + description: Serialised DOTSeqDataSets object containing DOU + DTE fits pattern: ".DOTSeqDataSets.rds" ontologies: [] session_info: - - meta: type: map - description: | - Groovy Map containing contrast information. 
e.g. [ id:'treatment_vs_control' ] + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] - "*.R_sessionInfo.log": type: file description: dump of R sessionInfo() diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index e757637299a3..ea2f5c6f0f17 100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -1,255 +1,301 @@ #!/usr/bin/env Rscript -################################################ -################################################ -## Functions ## -################################################ -################################################ - -#' Check for Non-Empty, Non-Whitespace String -is_valid_string <- function(input) { - !is.null(input) && nzchar(trimws(input)) -} - -#' Parse long-form options like --opt1 val1 --opt2 val2 -parse_args <- function(x){ - args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] - args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) - parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) - parsed_args[! 
is.na(parsed_args)] -} - -#' Flexibly read CSV / TSV / featureCounts-style tables -read_delim_flexible <- function(file, header = TRUE, sep = NULL, comment.char = "", check.names = TRUE){ - ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) - if (ext == "gz") { - # peek at the inner extension - inner <- tolower(tail(strsplit(sub("\\\\.gz\$", "", basename(file)), split = "\\\\.")[[1]], 1)) - } else { - inner <- ext - } - if (is.null(sep)) { - sep <- if (inner == "csv") "," else "\t" - } - read.table( - file, - sep = sep, - header = header, - comment.char = comment.char, - stringsAsFactors = FALSE, - check.names = check.names - ) -} +suppressPackageStartupMessages({ + library(optparse) + library(readr) + library(dplyr) + library(tidyr) + library(tibble) + library(purrr) + library(ggplot2) + library(DOTSeq) + library(SummarizedExperiment) +}) -################################################ -################################################ -## Parse parameters from Nextflow ## -################################################ -################################################ - -opt <- list( - output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), - count_file = '$counts', - sample_file = '$samplesheet', - flattened_gtf = '$flattened_gtf', - flattened_bed = '$flattened_bed', - contrast_variable = '$contrast_variable', - reference_level = '$reference', - target_level = '$target', - sample_id_col = "run", - strategy_col = "strategy", - replicate_col = "replicate", - sample_name_regex = NULL, - modules = "DOU,DTE", - min_count = as.integer(1), - stringent = "TRUE", - dispersion_modeling = "auto", - nullweight = as.numeric(500), - contrasts_method = "revpairwise", - cores = as.integer('$task.cpus') +################################################################################ +## Parse parameters ## +################################################################################ + +option_list <- list( + 
make_option("--output_prefix", type = "character", default = NULL), + make_option("--count_file", type = "character", default = NULL), + make_option("--sample_file", type = "character", default = NULL), + make_option("--flattened_gtf", type = "character", default = NULL), + make_option("--flattened_bed", type = "character", default = NULL), + make_option("--contrast_variable", type = "character", default = NULL), + make_option("--reference_level", type = "character", default = NULL), + make_option("--target_level", type = "character", default = NULL), + make_option("--sample_id_col", type = "character", default = "run"), + make_option("--strategy_col", type = "character", default = "strategy"), + make_option("--replicate_col", type = "character", default = "replicate"), + make_option("--sample_name_regex", type = "character", default = NULL, + help = "Regex applied to count-table column names; the first capture group is kept (matches DOTSeq's vignette pattern)."), + make_option("--modules", type = "character", default = "DOU,DTE", + help = "Which DOTSeq modules to run [default: %default]"), + make_option("--min_count", type = "integer", default = 1L), + make_option("--stringent", type = "character", default = "TRUE", + help = "TRUE / FALSE / NULL [default: %default]"), + make_option("--dispersion_modeling", type = "character", default = "auto"), + make_option("--nullweight", type = "double", default = 500), + make_option("--contrasts_method", type = "character", default = "revpairwise"), + make_option("--generate_plots", type = "logical", default = TRUE), + make_option("--alpha", type = "double", default = 0.05, + help = "Padj cut-off for the DTE p-value distribution plot"), + make_option("--top_hits", type = "integer", default = 25L), + make_option("--cores", type = "integer", default = 1L) ) -opt_types <- lapply(opt, class) -args_opt <- parse_args('$task.ext.args') -for (ao in names(args_opt)) { - if (!ao %in% names(opt)) stop(paste("Invalid option:", ao)) - if 
(!is.null(opt[[ao]])) { - args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) - } - opt[[ao]] <- args_opt[[ao]] -} +# Defaults wired in by the Nextflow template; task.ext.args (if any) layers +# on top so users can override anything via `--key value`. +nf_defaults <- c( + paste0("--output_prefix=", ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix')), + paste0("--count_file=", '$counts'), + paste0("--sample_file=", '$samplesheet'), + paste0("--flattened_gtf=", '$flattened_gtf'), + paste0("--flattened_bed=", '$flattened_bed'), + paste0("--contrast_variable=", '$contrast_variable'), + paste0("--reference_level=", '$reference'), + paste0("--target_level=", '$target'), + paste0("--cores=", '$task.cpus') +) -required_opts <- c("contrast_variable", "reference_level", "target_level", "output_prefix") -missing <- required_opts[!unlist(lapply(opt[required_opts], is_valid_string)) | !required_opts %in% names(opt)] -if (length(missing) > 0) { - stop(paste("Missing required options:", paste(missing, collapse = ", "))) +ext_args_raw <- '$task.ext.args' +ext_argv <- if (identical(ext_args_raw, "null") || !nzchar(trimws(ext_args_raw))) { + character(0) +} else { + strsplit(ext_args_raw, "\\\\s+", perl = TRUE)[[1]] |> (\\(x) x[nzchar(x)])() } -for (file_input in c("count_file", "sample_file", "flattened_gtf", "flattened_bed")){ - if (!is_valid_string(opt[[file_input]])) stop(paste("Please provide", file_input)) - if (!file.exists(opt[[file_input]])) stop(paste0("Value of ", file_input, ": ", opt[[file_input]], " is not a valid file")) -} +opt <- parse_args(OptionParser(option_list = option_list), args = c(nf_defaults, ext_argv)) -modules <- trimws(strsplit(opt\$modules, ",")[[1]]) +# DOTSeq accepts TRUE, FALSE, or NULL for `stringent` (three filter modes); +# optparse won't natively parse a tri-state into a logical, so we round-trip. 
stringent_val <- switch(toupper(opt\$stringent), - "TRUE" = TRUE, + "TRUE" = TRUE, "FALSE" = FALSE, - "NULL" = NULL, - stop("`stringent` must be one of TRUE, FALSE, NULL") + "NULL" = NULL, + stop("`--stringent` must be one of TRUE, FALSE, NULL") ) +modules <- trimws(strsplit(opt\$modules, ",")[[1]]) -################################################ -################################################ -## Load libraries ## -################################################ -################################################ - -suppressPackageStartupMessages({ - library(DOTSeq) - library(SummarizedExperiment) +walk(c("count_file", "sample_file", "flattened_gtf", "flattened_bed"), \\(x) { + if (!file.exists(opt[[x]])) stop("Missing input file: ", x, " = ", opt[[x]]) }) -################################################ -################################################ -## Read inputs ## -################################################ -################################################ +prefix <- opt\$output_prefix + +################################################################################ +## Helpers ## +################################################################################ + +# Pick TSV/CSV from file extension; the `comment` argument lets us drop +# featureCounts' first-line program-version comment. +read_delim_flexible <- function(file, comment = "") { + base <- sub("\\\\.gz\$", "", basename(file)) + ext <- tolower(tools::file_ext(base)) + delim <- if (ext == "csv") "," else "\t" + suppressWarnings( + read_delim(file, delim = delim, comment = comment, + show_col_types = FALSE, progress = FALSE) + ) +} + +# DOTSeq's getContrasts() returns objects with ORF IDs in rownames; lift +# them into an `orf_id` column so the TSV is self-describing. 
+to_orf_tibble <- function(x) { + if (is.null(x)) return(NULL) + df <- as.data.frame(x) + if (!"orf_id" %in% names(df)) df <- rownames_to_column(df, "orf_id") + as_tibble(df) +} -cnt <- read_delim_flexible(opt\$count_file, header = TRUE, comment.char = "#") +# Always emit the file (even when empty) so downstream Nextflow channels +# behave consistently across runs with different significance counts. +write_results_tsv <- function(df, suffix) { + out_path <- paste0(prefix, ".", suffix) + if (is.null(df) || nrow(df) == 0) { + write_tsv(tibble(), out_path) + } else { + write_tsv(df, out_path) + } +} -# Allow optional renaming of long count-table column names via regex +################################################################################ +## Read inputs and normalise the sample sheet ## +################################################################################ + +cnt <- as.data.frame(read_delim_flexible(opt\$count_file, comment = "#")) + +# featureCounts column names often carry the full BAM path; the vignette uses +# `gsub(".*(SRR[0-9]+).*", "\\1", names(cnt))` to keep just the run accession. +# Expose the same regex via the CLI so users can adapt to their own naming. if (!is.null(opt\$sample_name_regex) && nzchar(opt\$sample_name_regex)) { names(cnt) <- gsub(opt\$sample_name_regex, "\\\\1", names(cnt)) } -cond <- read_delim_flexible(opt\$sample_file, header = TRUE) +cond <- read_delim_flexible(opt\$sample_file) |> + as.data.frame() |> + rename_with(\\(nm) tolower(trimws(nm))) -# Normalise condition column names and rename user-chosen columns into the -# names that DOTSeq expects ("run", "strategy", "replicate", "condition") -names(cond) <- tolower(trimws(names(cond))) -user_to_required <- list( - run = tolower(opt\$sample_id_col), - strategy = tolower(opt\$strategy_col), +# DOTSeq's parse_condition_table() insists on columns named exactly +# `run, strategy, replicate, condition` (lower-case). 
Allow the user to point +# at differently-named columns via task.ext.args and rename in-place. +col_map <- c( + run = tolower(opt\$sample_id_col), + strategy = tolower(opt\$strategy_col), replicate = tolower(opt\$replicate_col), condition = tolower(opt\$contrast_variable) ) - -for (req in names(user_to_required)) { - src <- user_to_required[[req]] - if (!src %in% names(cond)) { - stop(paste0("Sample sheet column '", src, "' (mapped to '", req, "') not found. Have: ", - paste(names(cond), collapse = ", "))) - } - if (src != req) { - # Drop any pre-existing column with the required name to avoid collision - cond[[req]] <- cond[[src]] - if (src %in% names(cond) && src != req) cond[[src]] <- NULL - } +missing_cols <- col_map[!col_map %in% names(cond)] +if (length(missing_cols) > 0) { + stop(sprintf("Sample sheet missing column(s): %s. Have: %s", + paste(missing_cols, collapse = ", "), + paste(names(cond), collapse = ", "))) } - -# Filter samplesheet to only the levels involved in the contrast -cond <- cond[cond\$condition %in% c(opt\$reference_level, opt\$target_level), , drop = FALSE] -if (nrow(cond) == 0) { - stop("No samples remain after filtering condition column to reference/target levels.") +for (req in names(col_map)) { + src <- col_map[[req]] + if (src != req) names(cond)[names(cond) == src] <- req } -# Set baseline as the reference level so DTE coefficient is target_vs_reference -cond\$condition <- factor(cond\$condition, levels = c(opt\$reference_level, opt\$target_level)) +# Subset to the two contrast levels and put `reference` first so it becomes +# the implicit baseline in the DESeq2 / glmmTMB design. 
+cond <- cond |> + filter(.data\$condition %in% c(opt\$reference_level, opt\$target_level)) |> + mutate( + condition = factor(.data\$condition, levels = c(opt\$reference_level, opt\$target_level)), + strategy = factor(.data\$strategy) + ) + +if (nrow(cond) == 0) stop("No samples remain after filtering on the contrast levels.") -################################################ -################################################ -## Build DOTSeqDataSets ## -################################################ -################################################ +################################################################################ +## DOTSeq: DOU + DTE ## +################################################################################ d <- DOTSeqDataSetsFromFeatureCounts( - count_table = cnt, + count_table = cnt, condition_table = cond, - flattened_gtf = opt\$flattened_gtf, - flattened_bed = opt\$flattened_bed, - min_count = opt\$min_count, - stringent = stringent_val, - baseline = opt\$reference_level, - verbose = FALSE + flattened_gtf = opt\$flattened_gtf, + flattened_bed = opt\$flattened_bed, + min_count = opt\$min_count, + stringent = stringent_val, + baseline = opt\$reference_level, + verbose = FALSE ) -################################################ -################################################ -## Run DOTSeq ## -################################################ -################################################ - d <- DOTSeq( - datasets = d, - modules = modules, - target = opt\$target_level, - baseline = opt\$reference_level, - min_count = opt\$min_count, - stringent = stringent_val, + datasets = d, + modules = modules, + target = opt\$target_level, + baseline = opt\$reference_level, + min_count = opt\$min_count, + stringent = stringent_val, dispersion_modeling = opt\$dispersion_modeling, - nullweight = opt\$nullweight, - contrasts_method = opt\$contrasts_method, - parallel = list(n = opt\$cores, autopar = TRUE), - verbose = FALSE + nullweight = 
opt\$nullweight, + contrasts_method = opt\$contrasts_method, + parallel = list(n = opt\$cores, autopar = TRUE), + verbose = FALSE ) -################################################ -################################################ -## Extract results ## -################################################ -################################################ - -write_results <- function(df, suffix) { - if (is.null(df) || (is.data.frame(df) && nrow(df) == 0)) return(invisible(NULL)) - out_df <- as.data.frame(df) - write.table( - out_df, - file = paste(opt\$output_prefix, suffix, sep = "."), - col.names = TRUE, - row.names = FALSE, - sep = "\t", - quote = FALSE - ) -} +dou_interaction <- to_orf_tibble(tryCatch(getContrasts(getDOU(d), type = "interaction"), error = \\(e) NULL)) +dou_strategy <- to_orf_tibble(tryCatch(getContrasts(getDOU(d), type = "strategy"), error = \\(e) NULL)) +dte_interaction <- to_orf_tibble(tryCatch(getContrasts(getDTE(d), type = "interaction"), error = \\(e) NULL)) +dte_strategy <- to_orf_tibble(tryCatch(getContrasts(getDTE(d), type = "strategy"), error = \\(e) NULL)) -interaction_results <- tryCatch(getContrasts(d, type = "interaction"), error = function(e) NULL) -if (!is.null(interaction_results)) { - write_results(interaction_results\$DOU, "dou.interaction.dotseq.results.tsv") - write_results(interaction_results\$DTE, "dte.interaction.dotseq.results.tsv") -} +################################################################################ +## Write result tables ## +## ## +## DTE interaction is written as `translation.dotseq.results.tsv` because it ## +## is the per-ORF differential translation efficiency contrast. 
## +################################################################################ + +write_results_tsv(dte_interaction, "translation.dotseq.results.tsv") +write_results_tsv(dou_interaction, "dou.dotseq.results.tsv") -strategy_results <- tryCatch(getContrasts(d, type = "strategy"), error = function(e) NULL) -if (!is.null(strategy_results)) { - write_results(strategy_results\$DOU, "dou.strategy.dotseq.results.tsv") - write_results(strategy_results\$DTE, "dte.strategy.dotseq.results.tsv") +if (!is.null(dou_strategy) && nrow(dou_strategy) > 0) { + write_results_tsv(dou_strategy, "dou_strategy.dotseq.results.tsv") +} +if (!is.null(dte_strategy) && nrow(dte_strategy) > 0) { + write_results_tsv(dte_strategy, "dte_strategy.dotseq.results.tsv") } -# Serialise the full DOTSeqDataSets object for downstream use -saveRDS(d, file = paste(opt\$output_prefix, "DOTSeqDataSets.rds", sep = ".")) +saveRDS(d, file = paste0(prefix, ".DOTSeqDataSets.rds")) + +################################################################################ +## Plots ## +## ## +## Volcano / composite / venn / heatmap come from DOTSeq's native plotDOT(). ## +## The p-value distribution plot is a plain ggplot on top of the package's ## +## own DTE padj column. 
## +################################################################################ + +if (opt\$generate_plots) { + + if (!is.null(dte_interaction) && nrow(dte_interaction) > 0) { + pdist <- dte_interaction |> filter(!is.na(padj)) + if (nrow(pdist) > 0) { + pdist_plot <- ggplot(pdist, aes(x = padj)) + + geom_histogram(bins = 40, fill = "#3498db", colour = "white", alpha = 0.85) + + geom_vline(xintercept = opt\$alpha, linetype = "dashed", colour = "#e74c3c") + + labs( + x = "Adjusted p-value (DTE interaction)", + y = "Count", + title = sprintf("DTE p-value distribution (alpha = %s)", opt\$alpha) + ) + + theme_bw(base_size = 13) + ggsave(paste0(prefix, ".interaction_p_distribution.png"), + plot = pdist_plot, width = 8, height = 6, dpi = 100) + } + } -################################################ -################################################ -## R session info ## -################################################ -################################################ + # plotDOT defaults to `force_new_device = TRUE` which unconditionally + # resets the active graphics device, killing the png() we just opened. + # Disable that so the PNG actually captures the plot. + safe_plot_dot <- function(plot_type, fname, results_df = NULL, data = NULL) { + if (is.null(results_df) || nrow(results_df) == 0) return(invisible(NULL)) + tryCatch({ + png(fname, width = 900, height = 800, res = 110) + plotDOT( + plot_type = plot_type, + results = results_df, + data = data, + id_mapping = FALSE, + plot_params = list(top_hits = opt\$top_hits), + force_new_device = FALSE + ) + dev.off() + }, error = \\(e) { + while (length(dev.list()) > 0) dev.off() + message(sprintf("plotDOT(%s) failed: %s", plot_type, conditionMessage(e))) + }) + } -sink(paste(opt\$output_prefix, "R_sessionInfo.log", sep = ".")) -print(sessionInfo()) -sink() + # plotDOT wants both DOU and DTE columns on a single results frame keyed by orf_id. 
+ plotdot_df <- if (!is.null(dou_interaction) && !is.null(dte_interaction)) { + dou_interaction |> inner_join(dte_interaction, by = "orf_id", suffix = c("_dou", "_dte")) + } else NULL + + safe_plot_dot("volcano", paste0(prefix, ".volcano.png"), plotdot_df, getDOU(d)) + safe_plot_dot("composite", paste0(prefix, ".composite.png"), plotdot_df, getDOU(d)) + safe_plot_dot("venn", paste0(prefix, ".venn.png"), plotdot_df) + safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), plotdot_df, getDOU(d)) +} -################################################ -################################################ -## Versions ## -################################################ -################################################ +################################################################################ +## Session info + versions ## +################################################################################ -dotseq.version <- as.character(packageVersion("DOTSeq")) +sink(paste0(prefix, ".R_sessionInfo.log")) +print(sessionInfo()) +sink() writeLines( c( '"${task.process}":', - paste(" bioconductor-dotseq:", dotseq.version) + paste0(" bioconductor-dotseq: ", packageVersion("DOTSeq")), + paste0(" r-optparse: ", packageVersion("optparse")), + paste0(" r-readr: ", packageVersion("readr")), + paste0(" r-dplyr: ", packageVersion("dplyr")) ), "versions.yml" ) diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test index 779faeb149b9..94d46a0de1e5 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -36,12 +36,14 @@ nextflow_process { assertAll( { assert process.success }, { assert path(process.out.session_info[0][1]).getText().contains('DOTSeq') }, - { assert path(process.out.dou_interaction[0][1]).exists() }, - { assert path(process.out.dte_interaction[0][1]).exists() }, + { assert path(process.out.translation[0][1]).exists() }, + { assert 
path(process.out.dou[0][1]).exists() }, + { assert process.out.interaction_p_distribution_plot.size() > 0 }, { assert snapshot( - file(process.out.dou_interaction[0][1]).name, - file(process.out.dte_interaction[0][1]).name, + file(process.out.translation[0][1]).name, + file(process.out.dou[0][1]).name, file(process.out.rdata[0][1]).name, + file(process.out.interaction_p_distribution_plot[0][1]).name, process.out.versions, path(process.out.versions[0]).yaml ).match() } diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap index e2f63e690070..49ed37a1175e 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -1,19 +1,23 @@ { "human - featurecounts": { "content": [ - "cycling_vs_interphase.dou.interaction.dotseq.results.tsv", - "cycling_vs_interphase.dte.interaction.dotseq.results.tsv", + "cycling_vs_interphase.translation.dotseq.results.tsv", + "cycling_vs_interphase.dou.dotseq.results.tsv", "cycling_vs_interphase.DOTSeqDataSets.rds", + "cycling_vs_interphase.interaction_p_distribution.png", [ - "versions.yml:md5,8d12f412f693c1ad9f62e6c2a625ed4f" + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" ], { "DOTSEQ_DOTSEQ": { - "bioconductor-dotseq": "1.0.0" + "bioconductor-dotseq": "1.0.0", + "r-optparse": "1.8.2", + "r-readr": "2.2.0", + "r-dplyr": "1.2.1" } } ], - "timestamp": "2026-05-21T22:32:27.567791436", + "timestamp": "2026-05-22T08:55:01.866775853", "meta": { "nf-test": "0.9.5", "nextflow": "26.04.1" From 4bf79c3e1af99832a50c2c643ea7ebb872483422 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 10:16:37 +0100 Subject: [PATCH 4/9] Simplify R template helpers, add heatmap sorf_type fallback - Drop the homemade read_delim_flexible() and write_results_tsv() wrappers in favour of read_tsv() / read_csv() / write_tsv() directly. 
The earlier to_orf_tibble() conditional is also gone now that we know getContrasts() always returns a frame with orf_id as a column (per the DOTSeq source in posthoc.R + main.R). - plotDOT(heatmap) requires gene-paired mORF + sorf entries; try uORF first (the package default) and fall back to dORF when no significant gene has both. tryCatch in safe_plot_dot still makes either a no-op when neither succeeds. --- .../nf-core/dotseq/dotseq/templates/dotseq.R | 99 +++++++++---------- .../dotseq/dotseq/tests/main.nf.test.snap | 2 +- 2 files changed, 46 insertions(+), 55 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index ea2f5c6f0f17..c013a3fee679 100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -84,47 +84,15 @@ walk(c("count_file", "sample_file", "flattened_gtf", "flattened_bed"), \\(x) { prefix <- opt\$output_prefix -################################################################################ -## Helpers ## -################################################################################ - -# Pick TSV/CSV from file extension; the `comment` argument lets us drop -# featureCounts' first-line program-version comment. -read_delim_flexible <- function(file, comment = "") { - base <- sub("\\\\.gz\$", "", basename(file)) - ext <- tolower(tools::file_ext(base)) - delim <- if (ext == "csv") "," else "\t" - suppressWarnings( - read_delim(file, delim = delim, comment = comment, - show_col_types = FALSE, progress = FALSE) - ) -} - -# DOTSeq's getContrasts() returns objects with ORF IDs in rownames; lift -# them into an `orf_id` column so the TSV is self-describing. 
-to_orf_tibble <- function(x) { - if (is.null(x)) return(NULL) - df <- as.data.frame(x) - if (!"orf_id" %in% names(df)) df <- rownames_to_column(df, "orf_id") - as_tibble(df) -} - -# Always emit the file (even when empty) so downstream Nextflow channels -# behave consistently across runs with different significance counts. -write_results_tsv <- function(df, suffix) { - out_path <- paste0(prefix, ".", suffix) - if (is.null(df) || nrow(df) == 0) { - write_tsv(tibble(), out_path) - } else { - write_tsv(df, out_path) - } -} - ################################################################################ ## Read inputs and normalise the sample sheet ## ################################################################################ -cnt <- as.data.frame(read_delim_flexible(opt\$count_file, comment = "#")) +# featureCounts emits a `# Program:...` banner as the first line; `comment` +# strips it. Returns a tibble; DOTSeqDataSetsFromFeatureCounts() wants a +# vanilla data.frame so coerce. +cnt <- read_tsv(opt\$count_file, comment = "#", show_col_types = FALSE, + progress = FALSE) |> as.data.frame() # featureCounts column names often carry the full BAM path; the vignette uses # `gsub(".*(SRR[0-9]+).*", "\\1", names(cnt))` to keep just the run accession. 
@@ -133,7 +101,7 @@ if (!is.null(opt\$sample_name_regex) && nzchar(opt\$sample_name_regex)) { names(cnt) <- gsub(opt\$sample_name_regex, "\\\\1", names(cnt)) } -cond <- read_delim_flexible(opt\$sample_file) |> +cond <- read_csv(opt\$sample_file, show_col_types = FALSE, progress = FALSE) |> as.data.frame() |> rename_with(\\(nm) tolower(trimws(nm))) @@ -197,10 +165,17 @@ d <- DOTSeq( verbose = FALSE ) -dou_interaction <- to_orf_tibble(tryCatch(getContrasts(getDOU(d), type = "interaction"), error = \\(e) NULL)) -dou_strategy <- to_orf_tibble(tryCatch(getContrasts(getDOU(d), type = "strategy"), error = \\(e) NULL)) -dte_interaction <- to_orf_tibble(tryCatch(getContrasts(getDTE(d), type = "interaction"), error = \\(e) NULL)) -dte_strategy <- to_orf_tibble(tryCatch(getContrasts(getDTE(d), type = "strategy"), error = \\(e) NULL)) +# testDOU() and the DTE wrap-up in DOTSeq both lift rownames into an `orf_id` +# column and clear the rownames before returning, so we just coerce to tibble. +get_contrasts_df <- function(x, type) { + res <- tryCatch(getContrasts(x, type = type), error = \\(e) NULL) + if (is.null(res)) NULL else as_tibble(as.data.frame(res)) +} + +dou_interaction <- get_contrasts_df(getDOU(d), "interaction") +dou_strategy <- get_contrasts_df(getDOU(d), "strategy") +dte_interaction <- get_contrasts_df(getDTE(d), "interaction") +dte_strategy <- get_contrasts_df(getDTE(d), "strategy") ################################################################################ ## Write result tables ## @@ -209,14 +184,18 @@ dte_strategy <- to_orf_tibble(tryCatch(getContrasts(getDTE(d), type = "strate ## is the per-ORF differential translation efficiency contrast. 
## ################################################################################ -write_results_tsv(dte_interaction, "translation.dotseq.results.tsv") -write_results_tsv(dou_interaction, "dou.dotseq.results.tsv") +# Always emit the file (even empty) so downstream Nextflow channels stay +# consistent across runs with different significance counts. +empty_safe <- function(df) if (is.null(df)) tibble() else df + +write_tsv(empty_safe(dte_interaction), paste0(prefix, ".translation.dotseq.results.tsv")) +write_tsv(empty_safe(dou_interaction), paste0(prefix, ".dou.dotseq.results.tsv")) if (!is.null(dou_strategy) && nrow(dou_strategy) > 0) { - write_results_tsv(dou_strategy, "dou_strategy.dotseq.results.tsv") + write_tsv(dou_strategy, paste0(prefix, ".dou_strategy.dotseq.results.tsv")) } if (!is.null(dte_strategy) && nrow(dte_strategy) > 0) { - write_results_tsv(dte_strategy, "dte_strategy.dotseq.results.tsv") + write_tsv(dte_strategy, paste0(prefix, ".dte_strategy.dotseq.results.tsv")) } saveRDS(d, file = paste0(prefix, ".DOTSeqDataSets.rds")) @@ -251,17 +230,19 @@ if (opt\$generate_plots) { # plotDOT defaults to `force_new_device = TRUE` which unconditionally # resets the active graphics device, killing the png() we just opened. # Disable that so the PNG actually captures the plot. 
- safe_plot_dot <- function(plot_type, fname, results_df = NULL, data = NULL) { + safe_plot_dot <- function(plot_type, fname, results_df = NULL, data = NULL, + annotation_params = list()) { if (is.null(results_df) || nrow(results_df) == 0) return(invisible(NULL)) tryCatch({ png(fname, width = 900, height = 800, res = 110) plotDOT( - plot_type = plot_type, - results = results_df, - data = data, - id_mapping = FALSE, - plot_params = list(top_hits = opt\$top_hits), - force_new_device = FALSE + plot_type = plot_type, + results = results_df, + data = data, + id_mapping = FALSE, + plot_params = list(top_hits = opt\$top_hits), + annotation_params = annotation_params, + force_new_device = FALSE ) dev.off() }, error = \\(e) { @@ -278,7 +259,17 @@ if (opt\$generate_plots) { safe_plot_dot("volcano", paste0(prefix, ".volcano.png"), plotdot_df, getDOU(d)) safe_plot_dot("composite", paste0(prefix, ".composite.png"), plotdot_df, getDOU(d)) safe_plot_dot("venn", paste0(prefix, ".venn.png"), plotdot_df) - safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), plotdot_df, getDOU(d)) + + # The heatmap pairs mORFs with a chosen short-ORF class within each gene; + # try uORF first (the package default) and fall back to dORF if no + # significant gene has both. tryCatch in safe_plot_dot makes either a + # no-op if the data don't support it. 
+ safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), + plotdot_df, getDOU(d), list(sorf_type = "uORF")) + if (!file.exists(paste0(prefix, ".heatmap.png"))) { + safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), + plotdot_df, getDOU(d), list(sorf_type = "dORF")) + } } ################################################################################ diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap index 49ed37a1175e..bbb4d516c899 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -17,7 +17,7 @@ } } ], - "timestamp": "2026-05-22T08:55:01.866775853", + "timestamp": "2026-05-22T09:14:22.528383259", "meta": { "nf-test": "0.9.5", "nextflow": "26.04.1" From 8d1affd5ba43f3c08e1bda9c43f7d53ccc8b991c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 11:25:50 +0100 Subject: [PATCH 5/9] Address code-review feedback: stub block, validation hardening, plot fallback robustness - Add stub: block to main.nf matching the proteus/readproteingroups precedent. - Read sample sheet with read_delim() picking comma/tab from the file extension so the meta.yml-advertised TSV variant actually works. - Refuse to clobber an existing canonical column (e.g. an existing 'condition' column when --contrast_variable=treatment is supplied). - Dedupe multi-lane sample sheets and validate that both Ribo and RNA strategies are present (DOTSeq's interaction design is unestimable otherwise). - Add an is_set() predicate that catches NULL / empty stringent + required options before the tri-state switch silently returns NULL. - safe_plot_dot now unlinks the partially-written PNG on plotDOT error and returns success so the heatmap fallback (uORF then dORF) keys off whether the first call actually drew, not file.exists() of a stale handle. 
- getContrasts(type='interaction') errors propagate (headline outputs); type='strategy' stays tryCatch'd because absence is legitimate. - Cache getDOU(d) / getDTE(d) once and share across contrasts + plotDOT. - Drop redundant file.exists() walk - Nextflow's path staging already guarantees the inputs exist. - Expand the test to assert volcano / composite / venn plot emission and add a -stub test. --- modules/nf-core/dotseq/dotseq/main.nf | 24 +++ .../nf-core/dotseq/dotseq/templates/dotseq.R | 137 ++++++++++-------- .../nf-core/dotseq/dotseq/tests/main.nf.test | 38 +++++ .../dotseq/dotseq/tests/main.nf.test.snap | 17 ++- 4 files changed, 158 insertions(+), 58 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf index 1b9fee44f144..33c83bb7f100 100644 --- a/modules/nf-core/dotseq/dotseq/main.nf +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -36,4 +36,28 @@ process DOTSEQ_DOTSEQ { script: template 'dotseq.R' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.translation.dotseq.results.tsv + touch ${prefix}.dou.dotseq.results.tsv + touch ${prefix}.dou_strategy.dotseq.results.tsv + touch ${prefix}.dte_strategy.dotseq.results.tsv + touch ${prefix}.volcano.png + touch ${prefix}.composite.png + touch ${prefix}.venn.png + touch ${prefix}.heatmap.png + touch ${prefix}.interaction_p_distribution.png + touch ${prefix}.DOTSeqDataSets.rds + touch ${prefix}.R_sessionInfo.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-dotseq: \$(Rscript -e "cat(as.character(packageVersion('DOTSeq')))") + r-optparse: \$(Rscript -e "cat(as.character(packageVersion('optparse')))") + r-readr: \$(Rscript -e "cat(as.character(packageVersion('readr')))") + r-dplyr: \$(Rscript -e "cat(as.character(packageVersion('dplyr')))") + END_VERSIONS + """ } diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index c013a3fee679..fc5ecd2f6537 
100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -68,19 +68,23 @@ ext_argv <- if (identical(ext_args_raw, "null") || !nzchar(trimws(ext_args_raw)) opt <- parse_args(OptionParser(option_list = option_list), args = c(nf_defaults, ext_argv)) -# DOTSeq accepts TRUE, FALSE, or NULL for `stringent` (three filter modes); -# optparse won't natively parse a tri-state into a logical, so we round-trip. +is_set <- function(x) !is.null(x) && nzchar(trimws(x)) + +# DOTSeq's `stringent` is tri-state TRUE / FALSE / NULL; normalise via switch. +# Without the upfront is_set() guard, switch(toupper(NULL), ...) silently +# returns NULL and bypasses the `stop()` fallback. +if (!is_set(opt\$stringent)) stop("--stringent must be TRUE / FALSE / NULL.") stringent_val <- switch(toupper(opt\$stringent), "TRUE" = TRUE, "FALSE" = FALSE, "NULL" = NULL, - stop("`--stringent` must be one of TRUE, FALSE, NULL") + stop("--stringent must be TRUE / FALSE / NULL (got: ", opt\$stringent, ")") ) modules <- trimws(strsplit(opt\$modules, ",")[[1]]) -walk(c("count_file", "sample_file", "flattened_gtf", "flattened_bed"), \\(x) { - if (!file.exists(opt[[x]])) stop("Missing input file: ", x, " = ", opt[[x]]) -}) +# An empty output_prefix would produce dotfiles that the emit globs miss. +walk(c("contrast_variable", "reference_level", "target_level", "output_prefix"), + \\(x) if (!is_set(opt[[x]])) stop("Missing required option: --", x)) prefix <- opt\$output_prefix @@ -88,41 +92,55 @@ prefix <- opt\$output_prefix ## Read inputs and normalise the sample sheet ## ################################################################################ -# featureCounts emits a `# Program:...` banner as the first line; `comment` -# strips it. Returns a tibble; DOTSeqDataSetsFromFeatureCounts() wants a -# vanilla data.frame so coerce. +# `comment = "#"` strips featureCounts' `# Program:...` banner. 
cnt <- read_tsv(opt\$count_file, comment = "#", show_col_types = FALSE, progress = FALSE) |> as.data.frame() -# featureCounts column names often carry the full BAM path; the vignette uses -# `gsub(".*(SRR[0-9]+).*", "\\1", names(cnt))` to keep just the run accession. -# Expose the same regex via the CLI so users can adapt to their own naming. -if (!is.null(opt\$sample_name_regex) && nzchar(opt\$sample_name_regex)) { +# featureCounts column names often carry the full BAM path; the vignette +# strips them to the run accession via `gsub(".*(SRR[0-9]+).*", "\\1", ...)`. +if (is_set(opt\$sample_name_regex)) { names(cnt) <- gsub(opt\$sample_name_regex, "\\\\1", names(cnt)) } -cond <- read_csv(opt\$sample_file, show_col_types = FALSE, progress = FALSE) |> +# meta.yml documents CSV or TSV; pick the delim by file extension. +sample_ext <- tolower(tools::file_ext(sub("\\\\.gz\$", "", basename(opt\$sample_file)))) +cond <- read_delim(opt\$sample_file, delim = if (sample_ext == "csv") "," else "\t", + show_col_types = FALSE, progress = FALSE) |> as.data.frame() |> rename_with(\\(nm) tolower(trimws(nm))) -# DOTSeq's parse_condition_table() insists on columns named exactly -# `run, strategy, replicate, condition` (lower-case). Allow the user to point -# at differently-named columns via task.ext.args and rename in-place. +# DOTSeq insists on lower-case `run, strategy, replicate, condition` columns; +# rename from user-specified column names if necessary, refusing collisions. col_map <- c( run = tolower(opt\$sample_id_col), strategy = tolower(opt\$strategy_col), replicate = tolower(opt\$replicate_col), condition = tolower(opt\$contrast_variable) ) -missing_cols <- col_map[!col_map %in% names(cond)] +missing_cols <- setdiff(col_map, names(cond)) if (length(missing_cols) > 0) { - stop(sprintf("Sample sheet missing column(s): %s. 
Have: %s", - paste(missing_cols, collapse = ", "), - paste(names(cond), collapse = ", "))) + stop("Sample sheet missing column(s): ", paste(missing_cols, collapse = ", ")) } +nm <- names(cond) for (req in names(col_map)) { src <- col_map[[req]] - if (src != req) names(cond)[names(cond) == src] <- req + if (src == req) next + if (req %in% nm) stop("Cannot rename '", src, "' to '", req, "': '", req, "' column already present.") + nm[nm == src] <- req +} +names(cond) <- nm + +if (anyDuplicated(cond\$run)) { + cond <- distinct(cond, run, .keep_all = TRUE) +} + +if (!opt\$reference_level %in% cond\$condition) { + stop("--reference_level '", opt\$reference_level, "' not in `condition`. Have: ", + paste(unique(cond\$condition), collapse = ", ")) +} +if (!opt\$target_level %in% cond\$condition) { + stop("--target_level '", opt\$target_level, "' not in `condition`. Have: ", + paste(unique(cond\$condition), collapse = ", ")) } # Subset to the two contrast levels and put `reference` first so it becomes @@ -136,6 +154,12 @@ cond <- cond |> if (nrow(cond) == 0) stop("No samples remain after filtering on the contrast levels.") +# DOTSeq's design `~ condition * strategy` requires both Ribo and RNA samples. +if (nlevels(droplevels(cond\$strategy)) < 2) { + stop(sprintf("Strategy column must contain both Ribo and RNA after filtering; got: %s", + paste(unique(as.character(cond\$strategy)), collapse = ", "))) +} + ################################################################################ ## DOTSeq: DOU + DTE ## ################################################################################ @@ -166,16 +190,20 @@ d <- DOTSeq( ) # testDOU() and the DTE wrap-up in DOTSeq both lift rownames into an `orf_id` -# column and clear the rownames before returning, so we just coerce to tibble. 
-get_contrasts_df <- function(x, type) { - res <- tryCatch(getContrasts(x, type = type), error = \\(e) NULL) - if (is.null(res)) NULL else as_tibble(as.data.frame(res)) -} - -dou_interaction <- get_contrasts_df(getDOU(d), "interaction") -dou_strategy <- get_contrasts_df(getDOU(d), "strategy") -dte_interaction <- get_contrasts_df(getDTE(d), "interaction") -dte_strategy <- get_contrasts_df(getDTE(d), "strategy") +# column and clear the rownames, so we just coerce to tibble. Interaction +# contrasts are the module's headline output: let real errors propagate +# rather than catching them and writing an empty TSV that looks like a +# successful "no significant ORFs". Strategy contrasts can legitimately be +# absent, so tryCatch is fine there. +dou_d <- getDOU(d) +dte_d <- getDTE(d) +contrasts_tibble <- function(res) if (is.null(res)) NULL else as_tibble(as.data.frame(res)) +try_contrasts <- function(x, type) tryCatch(contrasts_tibble(getContrasts(x, type = type)), error = \\(e) NULL) + +dou_interaction <- contrasts_tibble(getContrasts(dou_d, "interaction")) +dte_interaction <- contrasts_tibble(getContrasts(dte_d, "interaction")) +dou_strategy <- try_contrasts(dou_d, "strategy") +dte_strategy <- try_contrasts(dte_d, "strategy") ################################################################################ ## Write result tables ## @@ -184,19 +212,16 @@ dte_strategy <- get_contrasts_df(getDTE(d), "strategy") ## is the per-ORF differential translation efficiency contrast. ## ################################################################################ -# Always emit the file (even empty) so downstream Nextflow channels stay -# consistent across runs with different significance counts. +# Always emit the mandatory tables (even empty) so Nextflow channels are +# consistent; strategy tables are optional and only written when populated. 
empty_safe <- function(df) if (is.null(df)) tibble() else df - write_tsv(empty_safe(dte_interaction), paste0(prefix, ".translation.dotseq.results.tsv")) write_tsv(empty_safe(dou_interaction), paste0(prefix, ".dou.dotseq.results.tsv")) - -if (!is.null(dou_strategy) && nrow(dou_strategy) > 0) { - write_tsv(dou_strategy, paste0(prefix, ".dou_strategy.dotseq.results.tsv")) -} -if (!is.null(dte_strategy) && nrow(dte_strategy) > 0) { - write_tsv(dte_strategy, paste0(prefix, ".dte_strategy.dotseq.results.tsv")) +write_optional <- function(df, suffix) { + if (!is.null(df) && nrow(df) > 0) write_tsv(df, paste0(prefix, ".", suffix)) } +write_optional(dou_strategy, "dou_strategy.dotseq.results.tsv") +write_optional(dte_strategy, "dte_strategy.dotseq.results.tsv") saveRDS(d, file = paste0(prefix, ".DOTSeqDataSets.rds")) @@ -227,12 +252,12 @@ if (opt\$generate_plots) { } } - # plotDOT defaults to `force_new_device = TRUE` which unconditionally - # resets the active graphics device, killing the png() we just opened. - # Disable that so the PNG actually captures the plot. + # plotDOT's default `force_new_device = TRUE` would reset our png() device; + # disable it. Return success so the heatmap fallback can distinguish a real + # plot from one that left a 0-byte file behind. safe_plot_dot <- function(plot_type, fname, results_df = NULL, data = NULL, annotation_params = list()) { - if (is.null(results_df) || nrow(results_df) == 0) return(invisible(NULL)) + if (is.null(results_df) || nrow(results_df) == 0) return(invisible(FALSE)) tryCatch({ png(fname, width = 900, height = 800, res = 110) plotDOT( @@ -245,30 +270,28 @@ if (opt\$generate_plots) { force_new_device = FALSE ) dev.off() + invisible(TRUE) }, error = \\(e) { while (length(dev.list()) > 0) dev.off() + if (file.exists(fname)) unlink(fname) message(sprintf("plotDOT(%s) failed: %s", plot_type, conditionMessage(e))) + invisible(FALSE) }) } - # plotDOT wants both DOU and DTE columns on a single results frame keyed by orf_id. 
plotdot_df <- if (!is.null(dou_interaction) && !is.null(dte_interaction)) { dou_interaction |> inner_join(dte_interaction, by = "orf_id", suffix = c("_dou", "_dte")) } else NULL - safe_plot_dot("volcano", paste0(prefix, ".volcano.png"), plotdot_df, getDOU(d)) - safe_plot_dot("composite", paste0(prefix, ".composite.png"), plotdot_df, getDOU(d)) + safe_plot_dot("volcano", paste0(prefix, ".volcano.png"), plotdot_df, dou_d) + safe_plot_dot("composite", paste0(prefix, ".composite.png"), plotdot_df, dou_d) safe_plot_dot("venn", paste0(prefix, ".venn.png"), plotdot_df) - # The heatmap pairs mORFs with a chosen short-ORF class within each gene; - # try uORF first (the package default) and fall back to dORF if no - # significant gene has both. tryCatch in safe_plot_dot makes either a - # no-op if the data don't support it. - safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), - plotdot_df, getDOU(d), list(sorf_type = "uORF")) - if (!file.exists(paste0(prefix, ".heatmap.png"))) { - safe_plot_dot("heatmap", paste0(prefix, ".heatmap.png"), - plotdot_df, getDOU(d), list(sorf_type = "dORF")) + # Heatmap needs mORF + sorf_type pairs per gene; try uORF (package default) + # then dORF. 
+ heatmap_path <- paste0(prefix, ".heatmap.png") + if (!safe_plot_dot("heatmap", heatmap_path, plotdot_df, dou_d, list(sorf_type = "uORF"))) { + safe_plot_dot("heatmap", heatmap_path, plotdot_df, dou_d, list(sorf_type = "dORF")) } } diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test index 94d46a0de1e5..b501199b1b9f 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -39,15 +39,53 @@ nextflow_process { { assert path(process.out.translation[0][1]).exists() }, { assert path(process.out.dou[0][1]).exists() }, { assert process.out.interaction_p_distribution_plot.size() > 0 }, + { assert process.out.volcano_plot.size() > 0 }, + { assert process.out.composite_plot.size() > 0 }, + { assert process.out.venn_plot.size() > 0 }, { assert snapshot( file(process.out.translation[0][1]).name, file(process.out.dou[0][1]).name, file(process.out.rdata[0][1]).name, file(process.out.interaction_p_distribution_plot[0][1]).name, + file(process.out.volcano_plot[0][1]).name, + file(process.out.composite_plot[0][1]).name, + file(process.out.venn_plot[0][1]).name, process.out.versions, path(process.out.versions[0]).yaml ).match() } ) } } + + test("human - featurecounts - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/featureCounts.cell_cycle_subset.txt.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.gtf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + 
"genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.bed.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + } } diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap index bbb4d516c899..c10a9f6fd88e 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -1,10 +1,25 @@ { + "human - featurecounts - stub": { + "content": [ + [ + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" + ] + ], + "timestamp": "2026-05-22T10:25:21.129013917", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, "human - featurecounts": { "content": [ "cycling_vs_interphase.translation.dotseq.results.tsv", "cycling_vs_interphase.dou.dotseq.results.tsv", "cycling_vs_interphase.DOTSeqDataSets.rds", "cycling_vs_interphase.interaction_p_distribution.png", + "cycling_vs_interphase.volcano.png", + "cycling_vs_interphase.composite.png", + "cycling_vs_interphase.venn.png", [ "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" ], @@ -17,7 +32,7 @@ } } ], - "timestamp": "2026-05-22T09:14:22.528383259", + "timestamp": "2026-05-22T10:25:15.491771515", "meta": { "nf-test": "0.9.5", "nextflow": "26.04.1" From 164ddbcd05089eb726fb7993b65f2f5ea3381902 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 11:38:21 +0100 Subject: [PATCH 6/9] TEMPORARY: point test at the pending test-datasets PR fork branch Lets CI verify the module is actually green; revert this commit once nf-core/test-datasets#2072 merges and the canonical modules-branch URL resolves. 
--- modules/nf-core/dotseq/dotseq/tests/nextflow.config | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/modules/nf-core/dotseq/dotseq/tests/nextflow.config b/modules/nf-core/dotseq/dotseq/tests/nextflow.config index 868dd7f0789d..4c2904c62f85 100644 --- a/modules/nf-core/dotseq/dotseq/tests/nextflow.config +++ b/modules/nf-core/dotseq/dotseq/tests/nextflow.config @@ -3,3 +3,10 @@ process { ext.args = '--sample_name_regex .*(SRR[0-9]+).*' } } + +// TEMPORARY: point at the test-datasets PR branch until +// https://github.com/nf-core/test-datasets/pull/2072 merges. +// Revert this block once the canonical `modules` branch carries the fixtures. +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-dotseq-testdata/data/' +} From f4ba3de5ada62e5bb8a7ef822f0fe5ce3cec74d4 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 12:27:53 +0100 Subject: [PATCH 7/9] refactor(dotseq/dotseq): take a count-matrix shape for consumer parity Aligns the module's input contract with deltate / anota2seq so that consumers can dispatch between the three ORF-DTE methods without maintaining a separate prep step for dotseq. The four featureCounts/GTF/BED inputs collapse to a per-ORF count matrix (orf_id + sample columns) plus a per-ORF annotation TSV (orf_id + gene_id + optional orf_type/coords). The R template now calls DOTSeqDataSetsFromSummarizeOverlaps() and builds the required GRanges in-process from the annotation TSV; the model fit, contrast tables, and plotDOT outputs are unchanged. Test fixtures updated alongside in nf-core/test-datasets#2072 (commit 8c9b27c). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- modules/nf-core/dotseq/dotseq/main.nf | 8 +- modules/nf-core/dotseq/dotseq/meta.yml | 30 ++-- .../nf-core/dotseq/dotseq/templates/dotseq.R | 140 ++++++++++++++---- .../nf-core/dotseq/dotseq/tests/main.nf.test | 14 +- .../dotseq/dotseq/tests/main.nf.test.snap | 10 +- .../dotseq/dotseq/tests/nextflow.config | 6 - 6 files changed, 143 insertions(+), 65 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf index 33c83bb7f100..732421c886ae 100644 --- a/modules/nf-core/dotseq/dotseq/main.nf +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -9,24 +9,18 @@ process DOTSEQ_DOTSEQ { input: tuple val(meta), val(contrast_variable), val(reference), val(target) - tuple val(meta2), path(samplesheet), path(counts), path(flattened_gtf), path(flattened_bed) + tuple val(meta2), path(samplesheet), path(counts), path(annotation) output: - // Per-ORF differential translation efficiency (DTE interaction term) tuple val(meta), path("*.translation.dotseq.results.tsv") , emit: translation - // Per-ORF differential ORF usage tuple val(meta), path("*.dou.dotseq.results.tsv") , emit: dou - // Per-condition Ribo-vs-RNA strategy contrasts, when DOTSeq emits them tuple val(meta), path("*.dou_strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true tuple val(meta), path("*.dte_strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true - // plotDOT() outputs tuple val(meta), path("*.volcano.png") , emit: volcano_plot , optional: true tuple val(meta), path("*.composite.png") , emit: composite_plot, optional: true tuple val(meta), path("*.venn.png") , emit: venn_plot , optional: true tuple val(meta), path("*.heatmap.png") , emit: heatmap_plot , optional: true - // Histogram of DTE adjusted p-values tuple val(meta), path("*.interaction_p_distribution.png") , emit: interaction_p_distribution_plot, optional: true - // Serialised dataset + session info + versions tuple val(meta), 
path("*.DOTSeqDataSets.rds") , emit: rdata tuple val(meta), path("*.R_sessionInfo.log") , emit: session_info path "versions.yml" , emit: versions, topic: versions diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml index e23d1bd3893c..37b7f2895cad 100644 --- a/modules/nf-core/dotseq/dotseq/meta.yml +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -3,7 +3,7 @@ name: "dotseq_dotseq" description: | Detect differential ORF usage (DOU) and ORF-level differential translation efficiency (DTE) from Ribo-seq with matched RNA-seq using - DOTSeq. Wraps DOTSeqDataSetsFromFeatureCounts() + DOTSeq() + + DOTSeq. Wraps DOTSeqDataSetsFromSummarizeOverlaps() + DOTSeq() + getContrasts() and emits the package's native contrast tables plus plotDOT() visualisations. keywords: @@ -52,23 +52,27 @@ input: - counts: type: file description: | - ORF-level featureCounts output - header must include - `Geneid, Chr, Start, End, Strand, Length` followed by per-sample - count columns matching the `run` values in the sample sheet. + Per-ORF count matrix. First column is the ORF identifier (default + `orf_id`, override via `--orf_id_col`); remaining columns are + sample IDs that must match the `run` values in the sample sheet. + Both Ribo-seq and RNA-seq sample columns belong in this single + matrix; the sample sheet's `strategy` column distinguishes them. ontologies: - edam: "http://edamontology.org/format_3475" - - flattened_gtf: + - annotation: type: file description: | - Flattened ORF annotation in GTF format (gene_id + exon_number, - type=="exon"). + Per-ORF annotation table (one row per ORF). Required columns: + `orf_id` (matches the count matrix) and `gene_id` (parent gene; + DOTSeq's DOU model groups child ORFs by gene). 
Optional columns: + `orf_type` (mORF / uORF / dORF; used by plotDOT()'s heatmap and + defaults to mORF when absent) and `chrom`, `start`, `end`, + `strand` (used only for downstream inspection - dummy ranges + are generated when absent because DOTSeq's fit does not depend + on genomic coordinates). Column names can be overridden via + task.ext.args (`--gene_id_col`, `--orf_type_col`, etc.). ontologies: - - edam: "http://edamontology.org/format_2306" - - flattened_bed: - type: file - description: Flattened ORF annotation in BED format matching the GTF. - ontologies: - - edam: "http://edamontology.org/format_3003" + - edam: "http://edamontology.org/format_3475" output: translation: diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index fc5ecd2f6537..451b42d9dfdc 100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -8,6 +8,8 @@ suppressPackageStartupMessages({ library(tibble) library(purrr) library(ggplot2) + library(GenomicRanges) + library(IRanges) library(DOTSeq) library(SummarizedExperiment) }) @@ -20,16 +22,27 @@ option_list <- list( make_option("--output_prefix", type = "character", default = NULL), make_option("--count_file", type = "character", default = NULL), make_option("--sample_file", type = "character", default = NULL), - make_option("--flattened_gtf", type = "character", default = NULL), - make_option("--flattened_bed", type = "character", default = NULL), + make_option("--annotation_file", type = "character", default = NULL), make_option("--contrast_variable", type = "character", default = NULL), make_option("--reference_level", type = "character", default = NULL), make_option("--target_level", type = "character", default = NULL), make_option("--sample_id_col", type = "character", default = "run"), make_option("--strategy_col", type = "character", default = "strategy"), make_option("--replicate_col", type = "character", 
default = "replicate"), - make_option("--sample_name_regex", type = "character", default = NULL, - help = "Regex applied to count-table column names; the first capture group is kept (matches DOTSeq's vignette pattern)."), + make_option("--orf_id_col", type = "character", default = "orf_id", + help = "Annotation column holding the ORF id (must match count_file's first column) [default: %default]"), + make_option("--gene_id_col", type = "character", default = "gene_id", + help = "Annotation column holding the parent gene id [default: %default]"), + make_option("--orf_type_col", type = "character", default = "orf_type", + help = "Annotation column holding the ORF biotype (mORF/uORF/dORF/etc.); used by plotDOT()'s heatmap [default: %default]"), + make_option("--chrom_col", type = "character", default = "chrom", + help = "Optional annotation column with the ORF chromosome; dummy ranges built if absent."), + make_option("--start_col", type = "character", default = "start", + help = "Optional annotation column with the ORF start (1-based)."), + make_option("--end_col", type = "character", default = "end", + help = "Optional annotation column with the ORF end."), + make_option("--strand_col", type = "character", default = "strand", + help = "Optional annotation column with the ORF strand."), make_option("--modules", type = "character", default = "DOU,DTE", help = "Which DOTSeq modules to run [default: %default]"), make_option("--min_count", type = "integer", default = 1L), @@ -51,8 +64,7 @@ nf_defaults <- c( paste0("--output_prefix=", ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix')), paste0("--count_file=", '$counts'), paste0("--sample_file=", '$samplesheet'), - paste0("--flattened_gtf=", '$flattened_gtf'), - paste0("--flattened_bed=", '$flattened_bed'), + paste0("--annotation_file=", '$annotation'), paste0("--contrast_variable=", '$contrast_variable'), paste0("--reference_level=", '$reference'), paste0("--target_level=", '$target'), @@ -71,8 +83,6 @@ 
opt <- parse_args(OptionParser(option_list = option_list), args = c(nf_defaults, is_set <- function(x) !is.null(x) && nzchar(trimws(x)) # DOTSeq's `stringent` is tri-state TRUE / FALSE / NULL; normalise via switch. -# Without the upfront is_set() guard, switch(toupper(NULL), ...) silently -# returns NULL and bypasses the `stop()` fallback. if (!is_set(opt\$stringent)) stop("--stringent must be TRUE / FALSE / NULL.") stringent_val <- switch(toupper(opt\$stringent), "TRUE" = TRUE, @@ -82,7 +92,6 @@ stringent_val <- switch(toupper(opt\$stringent), ) modules <- trimws(strsplit(opt\$modules, ",")[[1]]) -# An empty output_prefix would produce dotfiles that the emit globs miss. walk(c("contrast_variable", "reference_level", "target_level", "output_prefix"), \\(x) if (!is_set(opt[[x]])) stop("Missing required option: --", x)) @@ -92,21 +101,21 @@ prefix <- opt\$output_prefix ## Read inputs and normalise the sample sheet ## ################################################################################ -# `comment = "#"` strips featureCounts' `# Program:...` banner. -cnt <- read_tsv(opt\$count_file, comment = "#", show_col_types = FALSE, - progress = FALSE) |> as.data.frame() +read_delim_auto <- function(path) { + ext <- tolower(tools::file_ext(sub("\\\\.gz\$", "", basename(path)))) + delim <- if (ext == "csv") "," else "\t" + read_delim(path, delim = delim, show_col_types = FALSE, progress = FALSE) |> as.data.frame() +} -# featureCounts column names often carry the full BAM path; the vignette -# strips them to the run accession via `gsub(".*(SRR[0-9]+).*", "\\1", ...)`. -if (is_set(opt\$sample_name_regex)) { - names(cnt) <- gsub(opt\$sample_name_regex, "\\\\1", names(cnt)) +cnt <- read_delim_auto(opt\$count_file) +if (!opt\$orf_id_col %in% names(cnt)) { + stop("Count file missing ORF id column '", opt\$orf_id_col, "'. 
Have: ", + paste(names(cnt), collapse = ", ")) } +rownames(cnt) <- cnt[[opt\$orf_id_col]] +cnt[[opt\$orf_id_col]] <- NULL -# meta.yml documents CSV or TSV; pick the delim by file extension. -sample_ext <- tolower(tools::file_ext(sub("\\\\.gz\$", "", basename(opt\$sample_file)))) -cond <- read_delim(opt\$sample_file, delim = if (sample_ext == "csv") "," else "\t", - show_col_types = FALSE, progress = FALSE) |> - as.data.frame() |> +cond <- read_delim_auto(opt\$sample_file) |> rename_with(\\(nm) tolower(trimws(nm))) # DOTSeq insists on lower-case `run, strategy, replicate, condition` columns; @@ -154,21 +163,100 @@ cond <- cond |> if (nrow(cond) == 0) stop("No samples remain after filtering on the contrast levels.") -# DOTSeq's design `~ condition * strategy` requires both Ribo and RNA samples. if (nlevels(droplevels(cond\$strategy)) < 2) { stop(sprintf("Strategy column must contain both Ribo and RNA after filtering; got: %s", paste(unique(as.character(cond\$strategy)), collapse = ", "))) } +# Restrict counts to the samples retained in cond, in the same order. +missing_samples <- setdiff(cond\$run, colnames(cnt)) +if (length(missing_samples) > 0) { + stop("Count file missing column(s) for sample(s): ", paste(missing_samples, collapse = ", ")) +} +cnt <- cnt[, cond\$run, drop = FALSE] + +################################################################################ +## Build the per-ORF GRanges annotation ## +## ## +## DOTSeq's DOU module fits a beta-binomial GLM per parent gene, grouping ## +## the gene's child ORFs, so gene_id is load-bearing on the annotation. The ## +## genomic ranges themselves are stored for downstream inspection only - the ## +## model fit does not depend on them. plotDOT()'s heatmap uses orf_type to ## +## bucket uORF/dORF, so we honour it when present. 
## +################################################################################ + +ann <- read_delim_auto(opt\$annotation_file) +required_ann_cols <- c(opt\$orf_id_col, opt\$gene_id_col) +missing_ann_cols <- setdiff(required_ann_cols, names(ann)) +if (length(missing_ann_cols) > 0) { + stop("Annotation file missing required column(s): ", paste(missing_ann_cols, collapse = ", ")) +} + +ann <- ann |> + distinct(.data[[opt\$orf_id_col]], .keep_all = TRUE) +rownames(ann) <- ann[[opt\$orf_id_col]] + +orfs_with_counts <- intersect(rownames(cnt), rownames(ann)) +if (length(orfs_with_counts) == 0) { + stop("No ORF ids overlap between count file and annotation file.") +} +dropped_from_counts <- setdiff(rownames(cnt), orfs_with_counts) +if (length(dropped_from_counts) > 0) { + message(sprintf( + "Dropping %d ORF(s) from counts that have no annotation row (e.g. %s).", + length(dropped_from_counts), + paste(head(dropped_from_counts, 3), collapse = ", ") + )) +} +cnt <- cnt[orfs_with_counts, , drop = FALSE] +ann <- ann[orfs_with_counts, , drop = FALSE] + +# Build the GRanges. Coordinates default to a dummy range when absent; DOTSeq +# only consumes mcols (gene_id, orf_number, orf_type) for the fit + plotting. 
+has_coords <- all(c(opt\$chrom_col, opt\$start_col, opt\$end_col) %in% names(ann))  # real ranges need chrom + start + end together
+if (has_coords) {
+    chrom  <- as.character(ann[[opt\$chrom_col]])
+    start  <- as.integer(ann[[opt\$start_col]])
+    end    <- as.integer(ann[[opt\$end_col]])
+    strand <- if (opt\$strand_col %in% names(ann)) as.character(ann[[opt\$strand_col]]) else rep("*", nrow(ann))  # strand column is optional even when coordinates exist
+} else {
+    chrom  <- rep("chrUnknown", nrow(ann))  # placeholder seqname: no usable coordinate columns in the annotation
+    start  <- seq_len(nrow(ann))            # one distinct width-1 dummy range per ORF
+    end    <- start
+    strand <- rep("*", nrow(ann))
+}
+strand[!strand %in% c("+", "-", "*")] <- "*"  # values outside +/-/* (including NA) become unstranded
+
+gene_ids <- as.character(ann[[opt\$gene_id_col]])
+orf_number <- ave(seq_along(gene_ids), gene_ids, FUN = seq_along)  # integer 1..k per gene; ave() on the character ids itself returned "1", "2", ... NOTE(review): confirm DOTSeq accepts integer orf_number
+orf_type <- if (opt\$orf_type_col %in% names(ann)) as.character(ann[[opt\$orf_type_col]]) else rep("mORF", nrow(ann))
+orf_type[is.na(orf_type) | !nzchar(trimws(orf_type))] <- "mORF"  # NA / blank biotypes fall back to mORF
+
+annotation_gr <- GRanges(
+    seqnames = chrom,
+    ranges = IRanges(start = start, end = end),
+    strand = strand
+)
+names(annotation_gr) <- rownames(ann)  # ORF ids; ann was subset to the ORFs shared with the count table above
+mcols(annotation_gr)\$gene_id <- gene_ids
+mcols(annotation_gr)\$orf_number <- orf_number
+mcols(annotation_gr)\$orf_type <- orf_type
+
+# DOTSeqDataSetsFromSummarizeOverlaps wants a data.frame for the count table.
+# DESeqDataSetFromMatrix (called downstream) requires integer counts so coerce
+# any double columns here before they hit the constructor.
+cnt_df <- as.data.frame(lapply(cnt, function(col) as.integer(round(col))), check.names = FALSE) +rownames(cnt_df) <- rownames(cnt) +rownames(cond) <- cond\$run + ################################################################################ ## DOTSeq: DOU + DTE ## ################################################################################ -d <- DOTSeqDataSetsFromFeatureCounts( - count_table = cnt, +d <- DOTSeqDataSetsFromSummarizeOverlaps( + count_table = cnt_df, condition_table = cond, - flattened_gtf = opt\$flattened_gtf, - flattened_bed = opt\$flattened_bed, + annotation = annotation_gr, min_count = opt\$min_count, stringent = stringent_val, baseline = opt\$reference_level, diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test index b501199b1b9f..382b00856170 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -10,7 +10,7 @@ nextflow_process { tag "dotseq" tag "dotseq/dotseq" - test("human - featurecounts") { + test("human - count matrix + annotation") { when { process { @@ -24,9 +24,8 @@ nextflow_process { input[1] = [ [ id:'test' ], file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/featureCounts.cell_cycle_subset.txt.gz", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.gtf.gz", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.bed.gz", checkIfExists: true) + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + 
"genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) ] """ } @@ -57,7 +56,7 @@ nextflow_process { } } - test("human - featurecounts - stub") { + test("human - count matrix + annotation - stub") { options "-stub" @@ -73,9 +72,8 @@ nextflow_process { input[1] = [ [ id:'test' ], file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/featureCounts.cell_cycle_subset.txt.gz", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.gtf.gz", checkIfExists: true), - file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/gencode.v47.orf_flattened_subset.bed.gz", checkIfExists: true) + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) ] """ } diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap index c10a9f6fd88e..6475a467e5a6 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -1,17 +1,17 @@ { - "human - featurecounts - stub": { + "human - count matrix + annotation - stub": { "content": [ [ "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" ] ], - "timestamp": "2026-05-22T10:25:21.129013917", + "timestamp": "2026-05-22T12:30:00.000000000", "meta": { "nf-test": "0.9.5", "nextflow": "26.04.1" } }, - "human - featurecounts": { + "human - count matrix + annotation": { "content": [ "cycling_vs_interphase.translation.dotseq.results.tsv", "cycling_vs_interphase.dou.dotseq.results.tsv", @@ -32,10 +32,10 @@ 
} } ], - "timestamp": "2026-05-22T10:25:15.491771515", + "timestamp": "2026-05-22T12:30:00.000000000", "meta": { "nf-test": "0.9.5", "nextflow": "26.04.1" } } -} \ No newline at end of file +} diff --git a/modules/nf-core/dotseq/dotseq/tests/nextflow.config b/modules/nf-core/dotseq/dotseq/tests/nextflow.config index 4c2904c62f85..9534234e60a5 100644 --- a/modules/nf-core/dotseq/dotseq/tests/nextflow.config +++ b/modules/nf-core/dotseq/dotseq/tests/nextflow.config @@ -1,9 +1,3 @@ -process { - withName: 'DOTSEQ_DOTSEQ' { - ext.args = '--sample_name_regex .*(SRR[0-9]+).*' - } -} - // TEMPORARY: point at the test-datasets PR branch until // https://github.com/nf-core/test-datasets/pull/2072 merges. // Revert this block once the canonical `modules` branch carries the fixtures. From 679ba79abaf405ecbab492bf62ecf5d6ea8ac7d8 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 22 May 2026 12:32:29 +0100 Subject: [PATCH 8/9] fix(dotseq/dotseq): synthesize `replicate` column when absent DOTSeq's parse_condition_table() requires a `replicate` column for stable ordering of samples within strategy+condition. Pipeline samplesheets often have a `pair` column (or none at all), so the R template now treats the column as optional: when present it is renamed to `replicate` as before; when absent the template assigns a per-(strategy, condition) row counter so the model fit is unaffected. This matches how anota2seq/deltate consume the same samplesheets. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../nf-core/dotseq/dotseq/templates/dotseq.R | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index 451b42d9dfdc..fafca9223706 100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -120,25 +120,42 @@ cond <- read_delim_auto(opt\$sample_file) |> # DOTSeq insists on lower-case `run, strategy, replicate, condition` columns; # rename from user-specified column names if necessary, refusing collisions. -col_map <- c( +# `replicate` may be absent: it is used only for stable ordering, so synthesize +# a per-(strategy, contrast_variable) counter when the user has not supplied +# their own column. +col_map_required <- c( run = tolower(opt\$sample_id_col), strategy = tolower(opt\$strategy_col), - replicate = tolower(opt\$replicate_col), condition = tolower(opt\$contrast_variable) ) -missing_cols <- setdiff(col_map, names(cond)) -if (length(missing_cols) > 0) { - stop("Sample sheet missing column(s): ", paste(missing_cols, collapse = ", ")) +missing_required <- setdiff(col_map_required, names(cond)) +if (length(missing_required) > 0) { + stop("Sample sheet missing column(s): ", paste(missing_required, collapse = ", ")) } nm <- names(cond) -for (req in names(col_map)) { - src <- col_map[[req]] +for (req in names(col_map_required)) { + src <- col_map_required[[req]] if (src == req) next if (req %in% nm) stop("Cannot rename '", src, "' to '", req, "': '", req, "' column already present.") nm[nm == src] <- req } names(cond) <- nm +replicate_src <- tolower(opt\$replicate_col) +if (replicate_src %in% names(cond)) { + if (replicate_src != "replicate") { + if ("replicate" %in% names(cond)) { + stop("Cannot rename '", replicate_src, "' to 'replicate': 'replicate' column already present.") + } + names(cond)[names(cond) == replicate_src] 
<- "replicate" + } +} else { + cond <- cond |> + group_by(.data\$strategy, .data\$condition) |> + mutate(replicate = row_number()) |> + ungroup() +} + if (anyDuplicated(cond\$run)) { cond <- distinct(cond, run, .keep_all = TRUE) } From b65bc0646329b334f65a3e271997d8cc53017f8b Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 12 Jun 2026 15:49:08 +0100 Subject: [PATCH 9/9] fix(dotseq/dotseq): support running a single module Dropping a module from --modules left DOTSeq()'s skipped slot unfitted (a bare DESeqDataSet for DTE), and getContrasts() has no method for it, so a DOU-only run crashed when extracting the DTE interaction table. Gate interaction and strategy contrast extraction on the selected modules, and write each module's interaction table only when that module ran. Mark the translation and dou outputs optional to match, and add a DOU-only regression test. Co-Authored-By: Claude Opus 4.8 (1M context) --- modules/nf-core/dotseq/dotseq/main.nf | 4 +- modules/nf-core/dotseq/dotseq/meta.yml | 6 ++- .../nf-core/dotseq/dotseq/templates/dotseq.R | 29 ++++++++------ .../dotseq/dotseq/tests/dou_only.config | 7 ++++ .../nf-core/dotseq/dotseq/tests/main.nf.test | 38 +++++++++++++++++++ .../dotseq/dotseq/tests/main.nf.test.snap | 22 +++++++++++ 6 files changed, 91 insertions(+), 15 deletions(-) create mode 100644 modules/nf-core/dotseq/dotseq/tests/dou_only.config diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf index 732421c886ae..1560835b5210 100644 --- a/modules/nf-core/dotseq/dotseq/main.nf +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -12,8 +12,8 @@ process DOTSEQ_DOTSEQ { tuple val(meta2), path(samplesheet), path(counts), path(annotation) output: - tuple val(meta), path("*.translation.dotseq.results.tsv") , emit: translation - tuple val(meta), path("*.dou.dotseq.results.tsv") , emit: dou + tuple val(meta), path("*.translation.dotseq.results.tsv") , emit: translation , optional: true + tuple val(meta), 
path("*.dou.dotseq.results.tsv") , emit: dou , optional: true tuple val(meta), path("*.dou_strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true tuple val(meta), path("*.dte_strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true tuple val(meta), path("*.volcano.png") , emit: volcano_plot , optional: true diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml index 37b7f2895cad..d2a45778a038 100644 --- a/modules/nf-core/dotseq/dotseq/meta.yml +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -83,7 +83,8 @@ output: type: file description: | Per-ORF differential translation efficiency: DOTSeq's DTE - interaction-term results (DESeq2 + ashr shrinkage). + interaction-term results (DESeq2 + ashr shrinkage). Emitted only + when the DTE module is selected (the default). pattern: ".translation.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" @@ -96,7 +97,8 @@ output: description: | DOTSeq Differential ORF Usage results (beta-binomial GLM modelling Ribo / RNA proportion changes within each gene, - shrunk with ashr). DOTSeq-unique. + shrunk with ashr). DOTSeq-unique. Emitted only when the DOU + module is selected (the default). pattern: ".dou.dotseq.results.tsv" ontologies: - edam: "http://edamontology.org/format_3475" diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R index fafca9223706..260c305ff4b3 100644 --- a/modules/nf-core/dotseq/dotseq/templates/dotseq.R +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -294,21 +294,26 @@ d <- DOTSeq( verbose = FALSE ) -# testDOU() and the DTE wrap-up in DOTSeq both lift rownames into an `orf_id` -# column and clear the rownames, so we just coerce to tibble. 
Interaction -# contrasts are the module's headline output: let real errors propagate +# A module dropped from --modules leaves its slot unfitted: DOTSeq() returns a +# plain DESeqDataSet in the DTE slot and an unfitted DOUData in the DOU slot, +# and getContrasts() has no method for an unfitted DTE slot, so contrasts are +# only pulled for the modules that actually ran. For a module that ran, +# interaction contrasts are its headline output: let real errors propagate # rather than catching them and writing an empty TSV that looks like a # successful "no significant ORFs". Strategy contrasts can legitimately be # absent, so tryCatch is fine there. +run_dou <- "DOU" %in% modules +run_dte <- "DTE" %in% modules + dou_d <- getDOU(d) dte_d <- getDTE(d) contrasts_tibble <- function(res) if (is.null(res)) NULL else as_tibble(as.data.frame(res)) try_contrasts <- function(x, type) tryCatch(contrasts_tibble(getContrasts(x, type = type)), error = \\(e) NULL) -dou_interaction <- contrasts_tibble(getContrasts(dou_d, "interaction")) -dte_interaction <- contrasts_tibble(getContrasts(dte_d, "interaction")) -dou_strategy <- try_contrasts(dou_d, "strategy") -dte_strategy <- try_contrasts(dte_d, "strategy") +dou_interaction <- if (run_dou) contrasts_tibble(getContrasts(dou_d, "interaction")) else NULL +dte_interaction <- if (run_dte) contrasts_tibble(getContrasts(dte_d, "interaction")) else NULL +dou_strategy <- if (run_dou) try_contrasts(dou_d, "strategy") else NULL +dte_strategy <- if (run_dte) try_contrasts(dte_d, "strategy") else NULL ################################################################################ ## Write result tables ## @@ -317,11 +322,13 @@ dte_strategy <- try_contrasts(dte_d, "strategy") ## is the per-ORF differential translation efficiency contrast. 
## ################################################################################ -# Always emit the mandatory tables (even empty) so Nextflow channels are -# consistent; strategy tables are optional and only written when populated. +# Each module's interaction table is written only when that module ran, so a +# single-module run does not leave a misleading empty table for the module that +# was skipped; the matching Nextflow outputs are declared optional. Strategy +# tables are written only when populated. empty_safe <- function(df) if (is.null(df)) tibble() else df -write_tsv(empty_safe(dte_interaction), paste0(prefix, ".translation.dotseq.results.tsv")) -write_tsv(empty_safe(dou_interaction), paste0(prefix, ".dou.dotseq.results.tsv")) +if (run_dte) write_tsv(empty_safe(dte_interaction), paste0(prefix, ".translation.dotseq.results.tsv")) +if (run_dou) write_tsv(empty_safe(dou_interaction), paste0(prefix, ".dou.dotseq.results.tsv")) write_optional <- function(df, suffix) { if (!is.null(df) && nrow(df) > 0) write_tsv(df, paste0(prefix, ".", suffix)) } diff --git a/modules/nf-core/dotseq/dotseq/tests/dou_only.config b/modules/nf-core/dotseq/dotseq/tests/dou_only.config new file mode 100644 index 000000000000..7ce4f52339b2 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/dou_only.config @@ -0,0 +1,7 @@ +includeConfig './nextflow.config' + +process { + withName: 'DOTSEQ_DOTSEQ' { + ext.args = '--modules DOU' + } +} diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test index 382b00856170..4d3153a3e437 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -56,6 +56,44 @@ nextflow_process { } } + test("human - count matrix + annotation - DOU module only") { + + config "./dou_only.config" + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ 
id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.dou[0][1]).exists() }, + { assert process.out.translation.size() == 0 }, + { assert snapshot( + file(process.out.dou[0][1]).name, + file(process.out.rdata[0][1]).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + test("human - count matrix + annotation - stub") { options "-stub" diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap index 6475a467e5a6..b2441c4917a5 100644 --- a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -37,5 +37,27 @@ "nf-test": "0.9.5", "nextflow": "26.04.1" } + }, + "human - count matrix + annotation - DOU module only": { + "content": [ + "cycling_vs_interphase.dou.dotseq.results.tsv", + "cycling_vs_interphase.DOTSeqDataSets.rds", + [ + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" + ], + { + "DOTSEQ_DOTSEQ": { + "bioconductor-dotseq": "1.0.0", + "r-optparse": "1.8.2", + "r-readr": "2.2.0", + "r-dplyr": "1.2.1" + } + } + ], + "timestamp": "2026-05-22T12:30:00.000000000", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } } }