diff --git a/modules/nf-core/dotseq/dotseq/environment.yml b/modules/nf-core/dotseq/dotseq/environment.yml new file mode 100644 index 000000000000..620960ff719f --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/environment.yml @@ -0,0 +1,17 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bioconductor-dotseq=1.0.0 + - conda-forge::r-dplyr=1.2.1 + - conda-forge::r-eulerr=7.1.0 + - conda-forge::r-ggplot2=4.0.3 + - conda-forge::r-ggrepel=0.9.8 + - conda-forge::r-ggsignif=0.6.4 + - conda-forge::r-optparse=1.8.2 + - conda-forge::r-purrr=1.2.2 + - conda-forge::r-readr=2.2.0 + - conda-forge::r-tibble=3.3.1 + - conda-forge::r-tidyr=1.3.2 diff --git a/modules/nf-core/dotseq/dotseq/main.nf b/modules/nf-core/dotseq/dotseq/main.nf new file mode 100644 index 000000000000..1560835b5210 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/main.nf @@ -0,0 +1,57 @@ +process DOTSEQ_DOTSEQ { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/12/12667d472e9ae0f1602041dc018ba6bde294e6190e67999d71b65e7a2df7ea1f/data' : + 'community.wave.seqera.io/library/bioconductor-dotseq_r-dplyr_r-eulerr_r-ggplot2_pruned:6c8a9ebdec36c958' }" + + input: + tuple val(meta), val(contrast_variable), val(reference), val(target) + tuple val(meta2), path(samplesheet), path(counts), path(annotation) + + output: + tuple val(meta), path("*.translation.dotseq.results.tsv") , emit: translation , optional: true + tuple val(meta), path("*.dou.dotseq.results.tsv") , emit: dou , optional: true + tuple val(meta), path("*.dou_strategy.dotseq.results.tsv") , emit: dou_strategy , optional: true + tuple val(meta), path("*.dte_strategy.dotseq.results.tsv") , emit: dte_strategy , optional: true + tuple val(meta), path("*.volcano.png") , emit: volcano_plot , optional: true + tuple val(meta), path("*.composite.png") , emit: composite_plot, optional: true + tuple val(meta), path("*.venn.png") , emit: venn_plot , optional: true + tuple val(meta), path("*.heatmap.png") , emit: heatmap_plot , optional: true + tuple val(meta), path("*.interaction_p_distribution.png") , emit: interaction_p_distribution_plot, optional: true + tuple val(meta), path("*.DOTSeqDataSets.rds") , emit: rdata + tuple val(meta), path("*.R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'dotseq.R' + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.translation.dotseq.results.tsv + touch ${prefix}.dou.dotseq.results.tsv + touch ${prefix}.dou_strategy.dotseq.results.tsv + touch ${prefix}.dte_strategy.dotseq.results.tsv + touch ${prefix}.volcano.png + touch ${prefix}.composite.png + touch ${prefix}.venn.png + touch ${prefix}.heatmap.png + touch ${prefix}.interaction_p_distribution.png + touch ${prefix}.DOTSeqDataSets.rds + touch ${prefix}.R_sessionInfo.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioconductor-dotseq: \$(Rscript -e "cat(as.character(packageVersion('DOTSeq')))") + r-optparse: \$(Rscript -e "cat(as.character(packageVersion('optparse')))") + r-readr: \$(Rscript -e "cat(as.character(packageVersion('readr')))") + r-dplyr: \$(Rscript -e "cat(as.character(packageVersion('dplyr')))") + END_VERSIONS + """ +} diff --git a/modules/nf-core/dotseq/dotseq/meta.yml b/modules/nf-core/dotseq/dotseq/meta.yml new file mode 100644 index 000000000000..d2a45778a038 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/meta.yml @@ -0,0 +1,209 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "dotseq_dotseq" +description: | + Detect differential ORF usage (DOU) and ORF-level differential + translation efficiency (DTE) from Ribo-seq with matched RNA-seq using + DOTSeq. Wraps DOTSeqDataSetsFromSummarizeOverlaps() + DOTSeq() + + getContrasts() and emits the package's native contrast tables plus + plotDOT() visualisations. +keywords: + - riboseq + - rnaseq + - translation + - differential + - orf +tools: + - "dotseq": + description: "Differential ORF Translation analysis for Ribo-seq with matched RNA-seq" + homepage: "https://bioconductor.org/packages/release/bioc/html/DOTSeq.html" + documentation: "https://bioconductor.org/packages/release/bioc/vignettes/DOTSeq/inst/doc/DOTSeq.html" + tool_dev_url: "https://github.com/compgenom/DOTSeq" + licence: + - "MIT" + identifier: "" + +input: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - contrast_variable: + type: string + description: Sample-sheet column that holds the experimental condition (mapped to DOTSeq's `condition` internally). + - reference: + type: string + description: Value of the contrast_variable to use as reference (baseline). + - target: + type: string + description: Value of the contrast_variable to use as target (non-reference). + - - meta2: + type: map + description: Groovy map containing study-wide metadata + - samplesheet: + type: file + description: | + CSV or TSV sample sheet with `run`, `strategy`, `replicate`, and + `condition` columns (defaults; can be overridden via + task.ext.args). Both Ribo-seq and RNA-seq samples are required: + DOTSeq's design is `~ condition * strategy` and the interaction + term is unestimable without both strategies. + ontologies: + - edam: "http://edamontology.org/format_3752" + - edam: "http://edamontology.org/format_3475" + - counts: + type: file + description: | + Per-ORF count matrix. First column is the ORF identifier (default + `orf_id`, override via `--orf_id_col`); remaining columns are + sample IDs that must match the `run` values in the sample sheet. + Both Ribo-seq and RNA-seq sample columns belong in this single + matrix; the sample sheet's `strategy` column distinguishes them. + ontologies: + - edam: "http://edamontology.org/format_3475" + - annotation: + type: file + description: | + Per-ORF annotation table (one row per ORF). Required columns: + `orf_id` (matches the count matrix) and `gene_id` (parent gene; + DOTSeq's DOU model groups child ORFs by gene). Optional columns: + `orf_type` (mORF / uORF / dORF; used by plotDOT()'s heatmap and + defaults to mORF when absent) and `chrom`, `start`, `end`, + `strand` (used only for downstream inspection - dummy ranges + are generated when absent because DOTSeq's fit does not depend + on genomic coordinates). Column names can be overridden via + task.ext.args (`--gene_id_col`, `--orf_type_col`, etc.). + ontologies: + - edam: "http://edamontology.org/format_3475" + +output: + translation: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.translation.dotseq.results.tsv": + type: file + description: | + Per-ORF differential translation efficiency: DOTSeq's DTE + interaction-term results (DESeq2 + ashr shrinkage). Emitted only + when the DTE module is selected (the default). + pattern: ".translation.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" + dou: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou.dotseq.results.tsv": + type: file + description: | + DOTSeq Differential ORF Usage results (beta-binomial GLM + modelling Ribo / RNA proportion changes within each gene, + shrunk with ashr). DOTSeq-unique. Emitted only when the DOU + module is selected (the default). + pattern: ".dou.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" + dou_strategy: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dou_strategy.dotseq.results.tsv": + type: file + description: DOU strategy contrasts (Ribo vs RNA effect per condition), when present. + pattern: ".dou_strategy.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" + dte_strategy: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.dte_strategy.dotseq.results.tsv": + type: file + description: DTE strategy contrasts (Ribo vs RNA effect per condition), when present. + pattern: ".dte_strategy.dotseq.results.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" + volcano_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.volcano.png": + type: file + description: DOTSeq plotDOT() volcano (DOU + DTE significance). + pattern: ".volcano.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + composite_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.composite.png": + type: file + description: DOTSeq plotDOT() composite scatter (DOU vs DTE effect sizes). + pattern: ".composite.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + venn_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.venn.png": + type: file + description: DOTSeq plotDOT() Venn diagram of DOU vs DTE significant ORFs. + pattern: ".venn.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + heatmap_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.heatmap.png": + type: file + description: DOTSeq plotDOT() heatmap of DOU across top genes. + pattern: ".heatmap.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + interaction_p_distribution_plot: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.interaction_p_distribution.png": + type: file + description: Histogram of DOTSeq's DTE adjusted p-values. + pattern: ".interaction_p_distribution.png" + ontologies: + - edam: "http://edamontology.org/format_3603" + rdata: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.DOTSeqDataSets.rds": + type: file + description: Serialised DOTSeqDataSets object containing DOU + DTE fits + pattern: ".DOTSeqDataSets.rds" + ontologies: [] + session_info: + - - meta: + type: map + description: Groovy Map containing contrast information. e.g. [ id:'treatment_vs_control' ] + - "*.R_sessionInfo.log": + type: file + description: dump of R sessionInfo() + pattern: "*.log" + ontologies: + - edam: "http://edamontology.org/data_1678" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" +topics: + versions: + - versions.yml: + type: string + description: The name of the process +authors: + - "@pinin4fjords" +maintainers: + - "@pinin4fjords" diff --git a/modules/nf-core/dotseq/dotseq/templates/dotseq.R b/modules/nf-core/dotseq/dotseq/templates/dotseq.R new file mode 100644 index 000000000000..260c305ff4b3 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/templates/dotseq.R @@ -0,0 +1,427 @@ +#!/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(optparse) + library(readr) + library(dplyr) + library(tidyr) + library(tibble) + library(purrr) + library(ggplot2) + library(GenomicRanges) + library(IRanges) + library(DOTSeq) + library(SummarizedExperiment) +}) + +################################################################################ +## Parse parameters ## +################################################################################ + +option_list <- list( + make_option("--output_prefix", type = "character", default = NULL), + make_option("--count_file", type = "character", default = NULL), + make_option("--sample_file", type = "character", default = NULL), + make_option("--annotation_file", type = "character", default = NULL), + make_option("--contrast_variable", type = "character", default = NULL), + make_option("--reference_level", type = "character", default = NULL), + make_option("--target_level", type = "character", default = NULL), + make_option("--sample_id_col", type = "character", default = "run"), + make_option("--strategy_col", type = "character", default = "strategy"), + make_option("--replicate_col", type = "character", default = "replicate"), + make_option("--orf_id_col", type = "character", default = "orf_id", + help = "Annotation column holding the ORF id (must match count_file's first column) [default: %default]"), + make_option("--gene_id_col", type = "character", default = "gene_id", + help = "Annotation column holding the parent gene id [default: %default]"), + make_option("--orf_type_col", type = "character", default = "orf_type", + help = "Annotation column holding the ORF biotype (mORF/uORF/dORF/etc.); used by plotDOT()'s heatmap [default: %default]"), + make_option("--chrom_col", type = "character", default = "chrom", + help = "Optional annotation column with the ORF chromosome; dummy ranges built if absent."), + make_option("--start_col", type = "character", default = "start", + help = "Optional annotation column with the ORF start (1-based)."), + make_option("--end_col", type = "character", default = "end", + help = "Optional annotation column with the ORF end."), + make_option("--strand_col", type = "character", default = "strand", + help = "Optional annotation column with the ORF strand."), + make_option("--modules", type = "character", default = "DOU,DTE", + help = "Which DOTSeq modules to run [default: %default]"), + make_option("--min_count", type = "integer", default = 1L), + make_option("--stringent", type = "character", default = "TRUE", + help = "TRUE / FALSE / NULL [default: %default]"), + make_option("--dispersion_modeling", type = "character", default = "auto"), + make_option("--nullweight", type = "double", default = 500), + make_option("--contrasts_method", type = "character", default = "revpairwise"), + make_option("--generate_plots", type = "logical", default = TRUE), + make_option("--alpha", type = "double", default = 0.05, + help = "Padj cut-off for the DTE p-value distribution plot"), + make_option("--top_hits", type = "integer", default = 25L), + make_option("--cores", type = "integer", default = 1L) +) + +# Defaults wired in by the Nextflow template; task.ext.args (if any) layers +# on top so users can override anything via `--key value`. +nf_defaults <- c( + paste0("--output_prefix=", ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix')), + paste0("--count_file=", '$counts'), + paste0("--sample_file=", '$samplesheet'), + paste0("--annotation_file=", '$annotation'), + paste0("--contrast_variable=", '$contrast_variable'), + paste0("--reference_level=", '$reference'), + paste0("--target_level=", '$target'), + paste0("--cores=", '$task.cpus') +) + +ext_args_raw <- '$task.ext.args' +ext_argv <- if (identical(ext_args_raw, "null") || !nzchar(trimws(ext_args_raw))) { + character(0) +} else { + strsplit(ext_args_raw, "\\\\s+", perl = TRUE)[[1]] |> (\\(x) x[nzchar(x)])() +} + +opt <- parse_args(OptionParser(option_list = option_list), args = c(nf_defaults, ext_argv)) + +is_set <- function(x) !is.null(x) && nzchar(trimws(x)) + +# DOTSeq's `stringent` is tri-state TRUE / FALSE / NULL; normalise via switch. +if (!is_set(opt\$stringent)) stop("--stringent must be TRUE / FALSE / NULL.") +stringent_val <- switch(toupper(opt\$stringent), + "TRUE" = TRUE, + "FALSE" = FALSE, + "NULL" = NULL, + stop("--stringent must be TRUE / FALSE / NULL (got: ", opt\$stringent, ")") +) +modules <- trimws(strsplit(opt\$modules, ",")[[1]]) + +walk(c("contrast_variable", "reference_level", "target_level", "output_prefix"), + \\(x) if (!is_set(opt[[x]])) stop("Missing required option: --", x)) + +prefix <- opt\$output_prefix + +################################################################################ +## Read inputs and normalise the sample sheet ## +################################################################################ + +read_delim_auto <- function(path) { + ext <- tolower(tools::file_ext(sub("\\\\.gz\$", "", basename(path)))) + delim <- if (ext == "csv") "," else "\t" + read_delim(path, delim = delim, show_col_types = FALSE, progress = FALSE) |> as.data.frame() +} + +cnt <- read_delim_auto(opt\$count_file) +if (!opt\$orf_id_col %in% names(cnt)) { + stop("Count file missing ORF id column '", opt\$orf_id_col, "'. Have: ", + paste(names(cnt), collapse = ", ")) +} +rownames(cnt) <- cnt[[opt\$orf_id_col]] +cnt[[opt\$orf_id_col]] <- NULL + +cond <- read_delim_auto(opt\$sample_file) |> + rename_with(\\(nm) tolower(trimws(nm))) + +# DOTSeq insists on lower-case `run, strategy, replicate, condition` columns; +# rename from user-specified column names if necessary, refusing collisions. +# `replicate` may be absent: it is used only for stable ordering, so synthesize +# a per-(strategy, contrast_variable) counter when the user has not supplied +# their own column. +col_map_required <- c( + run = tolower(opt\$sample_id_col), + strategy = tolower(opt\$strategy_col), + condition = tolower(opt\$contrast_variable) +) +missing_required <- setdiff(col_map_required, names(cond)) +if (length(missing_required) > 0) { + stop("Sample sheet missing column(s): ", paste(missing_required, collapse = ", ")) +} +nm <- names(cond) +for (req in names(col_map_required)) { + src <- col_map_required[[req]] + if (src == req) next + if (req %in% nm) stop("Cannot rename '", src, "' to '", req, "': '", req, "' column already present.") + nm[nm == src] <- req +} +names(cond) <- nm + +replicate_src <- tolower(opt\$replicate_col) +if (replicate_src %in% names(cond)) { + if (replicate_src != "replicate") { + if ("replicate" %in% names(cond)) { + stop("Cannot rename '", replicate_src, "' to 'replicate': 'replicate' column already present.") + } + names(cond)[names(cond) == replicate_src] <- "replicate" + } +} else { + cond <- cond |> + group_by(.data\$strategy, .data\$condition) |> + mutate(replicate = row_number()) |> + ungroup() +} + +if (anyDuplicated(cond\$run)) { + cond <- distinct(cond, run, .keep_all = TRUE) +} + +if (!opt\$reference_level %in% cond\$condition) { + stop("--reference_level '", opt\$reference_level, "' not in `condition`. Have: ", + paste(unique(cond\$condition), collapse = ", ")) +} +if (!opt\$target_level %in% cond\$condition) { + stop("--target_level '", opt\$target_level, "' not in `condition`. Have: ", + paste(unique(cond\$condition), collapse = ", ")) +} + +# Subset to the two contrast levels and put `reference` first so it becomes +# the implicit baseline in the DESeq2 / glmmTMB design. +cond <- cond |> + filter(.data\$condition %in% c(opt\$reference_level, opt\$target_level)) |> + mutate( + condition = factor(.data\$condition, levels = c(opt\$reference_level, opt\$target_level)), + strategy = factor(.data\$strategy) + ) + +if (nrow(cond) == 0) stop("No samples remain after filtering on the contrast levels.") + +if (nlevels(droplevels(cond\$strategy)) < 2) { + stop(sprintf("Strategy column must contain both Ribo and RNA after filtering; got: %s", + paste(unique(as.character(cond\$strategy)), collapse = ", "))) +} + +# Restrict counts to the samples retained in cond, in the same order. +missing_samples <- setdiff(cond\$run, colnames(cnt)) +if (length(missing_samples) > 0) { + stop("Count file missing column(s) for sample(s): ", paste(missing_samples, collapse = ", ")) +} +cnt <- cnt[, cond\$run, drop = FALSE] + +################################################################################ +## Build the per-ORF GRanges annotation ## +## ## +## DOTSeq's DOU module fits a beta-binomial GLM per parent gene, grouping ## +## the gene's child ORFs, so gene_id is load-bearing on the annotation. The ## +## genomic ranges themselves are stored for downstream inspection only - the ## +## model fit does not depend on them. plotDOT()'s heatmap uses orf_type to ## +## bucket uORF/dORF, so we honour it when present. ## +################################################################################ + +ann <- read_delim_auto(opt\$annotation_file) +required_ann_cols <- c(opt\$orf_id_col, opt\$gene_id_col) +missing_ann_cols <- setdiff(required_ann_cols, names(ann)) +if (length(missing_ann_cols) > 0) { + stop("Annotation file missing required column(s): ", paste(missing_ann_cols, collapse = ", ")) +} + +ann <- ann |> + distinct(.data[[opt\$orf_id_col]], .keep_all = TRUE) +rownames(ann) <- ann[[opt\$orf_id_col]] + +orfs_with_counts <- intersect(rownames(cnt), rownames(ann)) +if (length(orfs_with_counts) == 0) { + stop("No ORF ids overlap between count file and annotation file.") +} +dropped_from_counts <- setdiff(rownames(cnt), orfs_with_counts) +if (length(dropped_from_counts) > 0) { + message(sprintf( + "Dropping %d ORF(s) from counts that have no annotation row (e.g. %s).", + length(dropped_from_counts), + paste(head(dropped_from_counts, 3), collapse = ", ") + )) +} +cnt <- cnt[orfs_with_counts, , drop = FALSE] +ann <- ann[orfs_with_counts, , drop = FALSE] + +# Build the GRanges. Coordinates default to a dummy range when absent; DOTSeq +# only consumes mcols (gene_id, orf_number, orf_type) for the fit + plotting. +has_coords <- all(c(opt\$chrom_col, opt\$start_col, opt\$end_col) %in% names(ann)) +if (has_coords) { + chrom <- as.character(ann[[opt\$chrom_col]]) + start <- as.integer(ann[[opt\$start_col]]) + end <- as.integer(ann[[opt\$end_col]]) + strand <- if (opt\$strand_col %in% names(ann)) as.character(ann[[opt\$strand_col]]) else "*" +} else { + chrom <- rep("chrUnknown", nrow(ann)) + start <- seq_len(nrow(ann)) + end <- start + strand <- rep("*", nrow(ann)) +} +strand[!strand %in% c("+", "-", "*")] <- "*" + +gene_ids <- as.character(ann[[opt\$gene_id_col]]) +orf_number <- ave(gene_ids, gene_ids, FUN = seq_along) +orf_type <- if (opt\$orf_type_col %in% names(ann)) as.character(ann[[opt\$orf_type_col]]) else rep("mORF", nrow(ann)) +orf_type[is.na(orf_type) | !nzchar(trimws(orf_type))] <- "mORF" + +annotation_gr <- GRanges( + seqnames = chrom, + ranges = IRanges(start = start, end = end), + strand = strand +) +names(annotation_gr) <- rownames(ann) +mcols(annotation_gr)\$gene_id <- gene_ids +mcols(annotation_gr)\$orf_number <- orf_number +mcols(annotation_gr)\$orf_type <- orf_type + +# DOTSeqDataSetsFromSummarizeOverlaps wants a data.frame for the count table. +# DESeqDataSetFromMatrix (called downstream) requires integer counts so coerce +# any double columns here before they hit the constructor. +cnt_df <- as.data.frame(lapply(cnt, function(col) as.integer(round(col))), check.names = FALSE) +rownames(cnt_df) <- rownames(cnt) +rownames(cond) <- cond\$run + +################################################################################ +## DOTSeq: DOU + DTE ## +################################################################################ + +d <- DOTSeqDataSetsFromSummarizeOverlaps( + count_table = cnt_df, + condition_table = cond, + annotation = annotation_gr, + min_count = opt\$min_count, + stringent = stringent_val, + baseline = opt\$reference_level, + verbose = FALSE +) + +d <- DOTSeq( + datasets = d, + modules = modules, + target = opt\$target_level, + baseline = opt\$reference_level, + min_count = opt\$min_count, + stringent = stringent_val, + dispersion_modeling = opt\$dispersion_modeling, + nullweight = opt\$nullweight, + contrasts_method = opt\$contrasts_method, + parallel = list(n = opt\$cores, autopar = TRUE), + verbose = FALSE +) + +# A module dropped from --modules leaves its slot unfitted: DOTSeq() returns a +# plain DESeqDataSet in the DTE slot and an unfitted DOUData in the DOU slot, +# and getContrasts() has no method for an unfitted DTE slot, so contrasts are +# only pulled for the modules that actually ran. For a module that ran, +# interaction contrasts are its headline output: let real errors propagate +# rather than catching them and writing an empty TSV that looks like a +# successful "no significant ORFs". Strategy contrasts can legitimately be +# absent, so tryCatch is fine there. +run_dou <- "DOU" %in% modules +run_dte <- "DTE" %in% modules + +dou_d <- getDOU(d) +dte_d <- getDTE(d) +contrasts_tibble <- function(res) if (is.null(res)) NULL else as_tibble(as.data.frame(res)) +try_contrasts <- function(x, type) tryCatch(contrasts_tibble(getContrasts(x, type = type)), error = \\(e) NULL) + +dou_interaction <- if (run_dou) contrasts_tibble(getContrasts(dou_d, "interaction")) else NULL +dte_interaction <- if (run_dte) contrasts_tibble(getContrasts(dte_d, "interaction")) else NULL +dou_strategy <- if (run_dou) try_contrasts(dou_d, "strategy") else NULL +dte_strategy <- if (run_dte) try_contrasts(dte_d, "strategy") else NULL + +################################################################################ +## Write result tables ## +## ## +## DTE interaction is written as `translation.dotseq.results.tsv` because it ## +## is the per-ORF differential translation efficiency contrast. ## +################################################################################ + +# Each module's interaction table is written only when that module ran, so a +# single-module run does not leave a misleading empty table for the module that +# was skipped; the matching Nextflow outputs are declared optional. Strategy +# tables are written only when populated. +empty_safe <- function(df) if (is.null(df)) tibble() else df +if (run_dte) write_tsv(empty_safe(dte_interaction), paste0(prefix, ".translation.dotseq.results.tsv")) +if (run_dou) write_tsv(empty_safe(dou_interaction), paste0(prefix, ".dou.dotseq.results.tsv")) +write_optional <- function(df, suffix) { + if (!is.null(df) && nrow(df) > 0) write_tsv(df, paste0(prefix, ".", suffix)) +} +write_optional(dou_strategy, "dou_strategy.dotseq.results.tsv") +write_optional(dte_strategy, "dte_strategy.dotseq.results.tsv") + +saveRDS(d, file = paste0(prefix, ".DOTSeqDataSets.rds")) + +################################################################################ +## Plots ## +## ## +## Volcano / composite / venn / heatmap come from DOTSeq's native plotDOT(). ## +## The p-value distribution plot is a plain ggplot on top of the package's ## +## own DTE padj column. ## +################################################################################ + +if (opt\$generate_plots) { + + if (!is.null(dte_interaction) && nrow(dte_interaction) > 0) { + pdist <- dte_interaction |> filter(!is.na(padj)) + if (nrow(pdist) > 0) { + pdist_plot <- ggplot(pdist, aes(x = padj)) + + geom_histogram(bins = 40, fill = "#3498db", colour = "white", alpha = 0.85) + + geom_vline(xintercept = opt\$alpha, linetype = "dashed", colour = "#e74c3c") + + labs( + x = "Adjusted p-value (DTE interaction)", + y = "Count", + title = sprintf("DTE p-value distribution (alpha = %s)", opt\$alpha) + ) + + theme_bw(base_size = 13) + ggsave(paste0(prefix, ".interaction_p_distribution.png"), + plot = pdist_plot, width = 8, height = 6, dpi = 100) + } + } + + # plotDOT's default `force_new_device = TRUE` would reset our png() device; + # disable it. Return success so the heatmap fallback can distinguish a real + # plot from one that left a 0-byte file behind. + safe_plot_dot <- function(plot_type, fname, results_df = NULL, data = NULL, + annotation_params = list()) { + if (is.null(results_df) || nrow(results_df) == 0) return(invisible(FALSE)) + tryCatch({ + png(fname, width = 900, height = 800, res = 110) + plotDOT( + plot_type = plot_type, + results = results_df, + data = data, + id_mapping = FALSE, + plot_params = list(top_hits = opt\$top_hits), + annotation_params = annotation_params, + force_new_device = FALSE + ) + dev.off() + invisible(TRUE) + }, error = \\(e) { + while (length(dev.list()) > 0) dev.off() + if (file.exists(fname)) unlink(fname) + message(sprintf("plotDOT(%s) failed: %s", plot_type, conditionMessage(e))) + invisible(FALSE) + }) + } + + plotdot_df <- if (!is.null(dou_interaction) && !is.null(dte_interaction)) { + dou_interaction |> inner_join(dte_interaction, by = "orf_id", suffix = c("_dou", "_dte")) + } else NULL + + safe_plot_dot("volcano", paste0(prefix, ".volcano.png"), plotdot_df, dou_d) + safe_plot_dot("composite", paste0(prefix, ".composite.png"), plotdot_df, dou_d) + safe_plot_dot("venn", paste0(prefix, ".venn.png"), plotdot_df) + + # Heatmap needs mORF + sorf_type pairs per gene; try uORF (package default) + # then dORF. + heatmap_path <- paste0(prefix, ".heatmap.png") + if (!safe_plot_dot("heatmap", heatmap_path, plotdot_df, dou_d, list(sorf_type = "uORF"))) { + safe_plot_dot("heatmap", heatmap_path, plotdot_df, dou_d, list(sorf_type = "dORF")) + } +} + +################################################################################ +## Session info + versions ## +################################################################################ + +sink(paste0(prefix, ".R_sessionInfo.log")) +print(sessionInfo()) +sink() + +writeLines( + c( + '"${task.process}":', + paste0(" bioconductor-dotseq: ", packageVersion("DOTSeq")), + paste0(" r-optparse: ", packageVersion("optparse")), + paste0(" r-readr: ", packageVersion("readr")), + paste0(" r-dplyr: ", packageVersion("dplyr")) + ), + "versions.yml" +) diff --git a/modules/nf-core/dotseq/dotseq/tests/dou_only.config b/modules/nf-core/dotseq/dotseq/tests/dou_only.config new file mode 100644 index 000000000000..7ce4f52339b2 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/dou_only.config @@ -0,0 +1,7 @@ +includeConfig './nextflow.config' + +process { + withName: 'DOTSEQ_DOTSEQ' { + ext.args = '--modules DOU' + } +} diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test b/modules/nf-core/dotseq/dotseq/tests/main.nf.test new file mode 100644 index 000000000000..4d3153a3e437 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test @@ -0,0 +1,127 @@ +nextflow_process { + + name "Test Process DOTSEQ_DOTSEQ" + script "../main.nf" + process "DOTSEQ_DOTSEQ" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "dotseq" + tag "dotseq/dotseq" + + test("human - count matrix + annotation") { + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.session_info[0][1]).getText().contains('DOTSeq') }, + { assert path(process.out.translation[0][1]).exists() }, + { assert path(process.out.dou[0][1]).exists() }, + { assert process.out.interaction_p_distribution_plot.size() > 0 }, + { assert process.out.volcano_plot.size() > 0 }, + { assert process.out.composite_plot.size() > 0 }, + { assert process.out.venn_plot.size() > 0 }, + { assert snapshot( + file(process.out.translation[0][1]).name, + file(process.out.dou[0][1]).name, + file(process.out.rdata[0][1]).name, + file(process.out.interaction_p_distribution_plot[0][1]).name, + file(process.out.volcano_plot[0][1]).name, + file(process.out.composite_plot[0][1]).name, + file(process.out.venn_plot[0][1]).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("human - count matrix + annotation - DOU module only") { + + config "./dou_only.config" + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.dou[0][1]).exists() }, + { assert process.out.translation.size() == 0 }, + { assert snapshot( + file(process.out.dou[0][1]).name, + file(process.out.rdata[0][1]).name, + process.out.versions, + path(process.out.versions[0]).yaml + ).match() } + ) + } + } + + test("human - count matrix + annotation - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'cycling_vs_interphase' ], + 'condition', + 'Interphase', + 'Mitotic_Cycling' + ] + input[1] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/samplesheet.csv", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/counts.tsv.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/riboseq_expression/dotseq/annotation.tsv.gz", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() } + ) + } + } +} diff --git a/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap new file mode 100644 index 000000000000..b2441c4917a5 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/main.nf.test.snap @@ -0,0 +1,63 @@ +{ + "human - count matrix + annotation - stub": { + "content": [ + [ + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" + ] + ], + "timestamp": "2026-05-22T12:30:00.000000000", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "human - count matrix + annotation": { + "content": [ + "cycling_vs_interphase.translation.dotseq.results.tsv", + "cycling_vs_interphase.dou.dotseq.results.tsv", + "cycling_vs_interphase.DOTSeqDataSets.rds", + "cycling_vs_interphase.interaction_p_distribution.png", + "cycling_vs_interphase.volcano.png", + "cycling_vs_interphase.composite.png", + "cycling_vs_interphase.venn.png", + [ + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" + ], + { + "DOTSEQ_DOTSEQ": { + "bioconductor-dotseq": "1.0.0", + "r-optparse": "1.8.2", + "r-readr": "2.2.0", + "r-dplyr": "1.2.1" + } + } + ], + "timestamp": "2026-05-22T12:30:00.000000000", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + }, + "human - count matrix + annotation - DOU module only": { + "content": [ + "cycling_vs_interphase.dou.dotseq.results.tsv", + "cycling_vs_interphase.DOTSeqDataSets.rds", + [ + "versions.yml:md5,5a3b9c79d3821d41e26d6147fe04421f" + ], + { + "DOTSEQ_DOTSEQ": { + "bioconductor-dotseq": "1.0.0", + "r-optparse": "1.8.2", + "r-readr": "2.2.0", + "r-dplyr": "1.2.1" + } + } + ], + "timestamp": "2026-05-22T12:30:00.000000000", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.04.1" + } + } +} diff --git a/modules/nf-core/dotseq/dotseq/tests/nextflow.config b/modules/nf-core/dotseq/dotseq/tests/nextflow.config new file mode 100644 index 000000000000..9534234e60a5 --- /dev/null +++ b/modules/nf-core/dotseq/dotseq/tests/nextflow.config @@ -0,0 +1,6 @@ +// TEMPORARY: point at the test-datasets PR branch until +// https://github.com/nf-core/test-datasets/pull/2072 merges. +// Revert this block once the canonical `modules` branch carries the fixtures. +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-dotseq-testdata/data/' +}