From ba41df6b5707df4b59b9e9ad775daeda959ba381 Mon Sep 17 00:00:00 2001 From: Jorisvansteenbrugge <7196110+Jorisvansteenbrugge@users.noreply.github.com> Date: Wed, 18 Feb 2026 13:33:43 +0100 Subject: [PATCH 1/5] add fusioninspector to fusion subworkflow --- conf/modules.config | 16 ++ modules.json | 5 + .../nf-core/fusioninspector/environment.yml | 12 ++ modules/nf-core/fusioninspector/main.nf | 66 ++++++ modules/nf-core/fusioninspector/meta.yml | 150 +++++++++++++ .../fusioninspector/tests/main.nf.test | 111 ++++++++++ .../fusioninspector/tests/main.nf.test.snap | 199 ++++++++++++++++++ .../fusioninspector/tests/nextflow.config | 5 + nextflow.config | 1 + subworkflows/local/bam_gene_fusion/main.nf | 26 ++- .../local/bam_gene_fusion/tests/main.nf.test | 14 +- .../fastq_trim_filter_align_dedup/main.nf | 18 +- workflows/dxnextflowrna.nf | 2 +- 13 files changed, 606 insertions(+), 19 deletions(-) create mode 100644 modules/nf-core/fusioninspector/environment.yml create mode 100644 modules/nf-core/fusioninspector/main.nf create mode 100644 modules/nf-core/fusioninspector/meta.yml create mode 100644 modules/nf-core/fusioninspector/tests/main.nf.test create mode 100644 modules/nf-core/fusioninspector/tests/main.nf.test.snap create mode 100644 modules/nf-core/fusioninspector/tests/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index f309638..9fd1ae5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -46,6 +46,22 @@ process { time = { 10.m + (1.ms * reads.sum { it.size() } / reads.count { it } / 1000) * task.attempt } } + withName: FUSIONINSPECTOR { + cpus = { 12 } + memory = { 40.GB * task.attempt } + time = { 4.h * task.attempt } + + + ext.args = { params.fusioninspector_limitSjdbInsertNsj != 1000000 ? 
"--STAR_xtra_params \"--limitSjdbInsertNsj ${params.fusioninspector_limitSjdbInsertNsj}\"" : '' } + ext.args2 = '--annotate --examine_coding_effect' + publishDir = [ + path: { "${params.outdir}/fusioninspector/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MULTIQC { clusterOptions = "${params.cluster_options}" ext.args = { params.multiqc_title ? "--title \"${params.multiqc_title}\"" : "--title \"${params.analysis_id}\"" } diff --git a/modules.json b/modules.json index d231cd1..fcd52fc 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,11 @@ "git_sha": "dc94b6ee04a05ddb9f7ae050712ff30a13149164", "installed_by": ["modules"] }, + "fusioninspector": { + "branch": "master", + "git_sha": "310a7a59c7f2362d25070e5928f3139f92377eaf", + "installed_by": ["modules"] + }, "fusionreport/detect": { "branch": "master", "git_sha": "e753770db613ce014b3c4bc94f6cba443427b726", diff --git a/modules/nf-core/fusioninspector/environment.yml b/modules/nf-core/fusioninspector/environment.yml new file mode 100644 index 0000000..e0e8ab3 --- /dev/null +++ b/modules/nf-core/fusioninspector/environment.yml @@ -0,0 +1,12 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fusion-inspector=2.10.0 + - bioconda::igv-reports=1.14.1 + - bioconda::perl-json-xs=4.03 + - bioconda::pysam=0.22.1 + - conda-forge::perl-carp-assert=0.21 + - conda-forge::pip==24.3.1 + - pip: + - intervaltree==3.1.0 diff --git a/modules/nf-core/fusioninspector/main.nf b/modules/nf-core/fusioninspector/main.nf new file mode 100644 index 0000000..5e25466 --- /dev/null +++ b/modules/nf-core/fusioninspector/main.nf @@ -0,0 +1,66 @@ +process FUSIONINSPECTOR { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/13/139b94a1f10c6e23a8c27eaed1e5a689db978a513d0ee155e74d35f0970814fe/data' : + 'community.wave.seqera.io/library/fusion-inspector_igv-reports_perl-json-xs_pysam_pruned:c6147971d107ab11'}" + + input: + tuple val(meta), path(reads), path(fusion_list) + tuple val(meta2), path(reference) + + output: + tuple val(meta), path("*FusionInspector.fusions.tsv"), emit: tsv , optional:true + tuple val(meta), path("fi_workdir/*.gtf") , emit: out_gtf , optional:true + tuple val(meta), path("*FusionInspector.log") , emit: log , optional:true + tuple val(meta), path("*html") , emit: html , optional:true + tuple val(meta), path("*abridged.tsv") , emit: abridged_tsv, optional:true + tuple val(meta), path("IGV_inputs") , emit: igv_inputs , optional:true + tuple val(meta), path("fi_workdir") , emit: fi_workdir , optional:true + tuple val(meta), path("chckpts_dir") , emit: chckpts_dir , optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def fasta = meta.single_end ? "--left_fq ${reads[0]}" : "--left_fq ${reads[0]} --right_fq ${reads[1]}" + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + """ + FusionInspector \\ + --fusions $fusion_list \\ + --genome_lib ${reference} \\ + $fasta \\ + --CPU ${task.cpus} \\ + -O . 
\\ + --out_prefix $prefix \\ + --vis $args $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + FusionInspector: \$(FusionInspector --version 2>&1 | grep -i 'version' | sed -e 's/FusionInspector version: //' -e 's/[[:space:]]//g') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch FusionInspector.log + touch ${prefix}.FusionInspector.fusions.abridged.tsv + touch ${prefix}.FusionInspector.fusions.tsv + touch ${prefix}.fusion_inspector_web.html + mkdir -p chckpts_dir + mkdir -p fi_workdir + touch fi_workdir/${prefix}.gtf + mkdir -p IGV_inputs + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + FusionInspector: \$(FusionInspector --version 2>&1 | grep -i 'version' | sed -e 's/FusionInspector version: //' -e 's/[[:space:]]//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fusioninspector/meta.yml b/modules/nf-core/fusioninspector/meta.yml new file mode 100644 index 0000000..1e94eee --- /dev/null +++ b/modules/nf-core/fusioninspector/meta.yml @@ -0,0 +1,150 @@ +name: fusioninspector +description: Validation of Fusion Transcript Predictions +keywords: + - fusioninspector + - fusion + - RNA-seq + - fastq +tools: + - fusioninspector: + description: Validation of Fusion Transcript Predictions + homepage: https://github.com/FusionInspector/FusionInspector + documentation: https://github.com/FusionInspector/FusionInspector/wiki + tool_dev_url: https://github.com/FusionInspector/FusionInspector + doi: 10.1101/2021.08.02.454639" + licence: ["BSD-3-Clause"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: FASTQ file + pattern: "*.{fastq*}" + ontologies: + - edam: "http://edamontology.org/format_1930" # FASTQ + - fusion_list: + type: file + description: Fusion targets list + pattern: "*.{txt}" + ontologies: + - edam: "http://edamontology.org/format_2330" # Textual format + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reference: + type: directory + description: Path to CTAT references + pattern: "*" + +output: + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*FusionInspector.fusions.tsv": + type: file + description: FusionInspector output TSV file + pattern: "*.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + out_gtf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fi_workdir/*.gtf: + type: file + description: GTF output file + pattern: "*.gtf" + ontologies: + - edam: "http://edamontology.org/format_2306" # GTF + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*FusionInspector.log": + type: file + description: FusionInspector log file + pattern: "*.log" + ontologies: + - edam: "http://edamontology.org/data_1678" # Log file + - edam: "http://edamontology.org/format_2330" # Plain text + html: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*html": + type: file + description: HTML output files + pattern: "*.html" + ontologies: + - edam: "http://edamontology.org/format_2331" # HTML + abridged_tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*abridged.tsv": + type: file + description: Abridged TSV output file + pattern: "*.tsv" + ontologies: + - edam: "http://edamontology.org/format_3475" # TSV + igv_inputs: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - IGV_inputs: + type: directory + description: IGV inputs directory + pattern: "IGV_inputs" + fi_workdir: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fi_workdir: + type: directory + description: FusionInspector work directory + pattern: "fi_workdir" + chckpts_dir: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - chckpts_dir: + type: directory + description: Checkpoints directory + pattern: "chckpts_dir" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@rannick" + - "@delfiterradas" + - "@sofiromano" + - "@alanmmobbs93" + - "@martings" diff --git a/modules/nf-core/fusioninspector/tests/main.nf.test b/modules/nf-core/fusioninspector/tests/main.nf.test new file mode 100644 index 0000000..01e186f --- /dev/null +++ b/modules/nf-core/fusioninspector/tests/main.nf.test @@ -0,0 +1,111 @@ +nextflow_process { + + name "Test Process FUSIONINSPECTOR" + script "../main.nf" + process "FUSIONINSPECTOR" + tag "modules" + tag "modules_local" + tag "fusioninspector" + tag "starfusion/build" + tag "modules_nfcore" + + test("FUSIONINSPECTOR - test") { + config './nextflow.config' + + setup { + run("STARFUSION_BUILD") { + script "../../starfusion/build/main.nf" + process { + """ + input[0] = [ + [ id:'minigenome_fasta' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/minigenome.fa') + ] + input[1] = [ + [ id:'minigenome_gtf' ], 
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/minigenome.gtf') + ] + + input[2] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/CTAT_HumanFusionLib.mini.dat.gz') + input[3] = "homo_sapiens" + input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/Pfam-A.hmm.gz') + input[5] = [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_dfam.hmm'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_dfam.hmm.h3f'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_dfam.hmm.h3i'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_dfam.hmm.h3m'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_dfam.hmm.h3p') + ] + input[6] = "https://data.broadinstitute.org/Trinity/CTAT_RESOURCE_LIB/AnnotFilterRule.pm" + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_rnaseq_1.fastq.gz'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_rnaseq_2.fastq.gz') + ], // reads + [ + file("https://github.com/FusionInspector/FusionInspector/raw/master/test/fusion_targets.A.txt") + ] + ] + input[1] = STARFUSION_BUILD.out.reference + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert file(process.out.tsv[0][1]).getText().contains("THRA--AC090627.1\t72\t83\t72.00\t80.15") }, + { assert file(process.out.abridged_tsv[0][1]).getText().contains("THRA--AC090627.1\t72\t83\t72.00\t80.15") }, + { assert snapshot( + file(process.out.tsv[0][1]).name, + process.out.out_gtf, + file(process.out.abridged_tsv[0][1]).name, + file(process.out.log[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("FUSIONINSPECTOR - 
test - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_rnaseq_1.fastq.gz'), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/test_starfusion_rnaseq_2.fastq.gz') + ], // reads + [ + file("https://github.com/FusionInspector/FusionInspector/raw/master/test/fusion_targets.A.txt") + ] + ] + input[1] = [[id:'minigenome_refs'],[]] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fusioninspector/tests/main.nf.test.snap b/modules/nf-core/fusioninspector/tests/main.nf.test.snap new file mode 100644 index 0000000..4a8d000 --- /dev/null +++ b/modules/nf-core/fusioninspector/tests/main.nf.test.snap @@ -0,0 +1,199 @@ +{ + "FUSIONINSPECTOR - test": { + "content": [ + "test.FusionInspector.fusions.tsv", + [ + [ + { + "id": "test", + "single_end": false + }, + "test.gtf:md5,d523061bd8b443a014d4cca9406ec772" + ] + ], + "test.FusionInspector.fusions.abridged.tsv", + "FusionInspector.log", + [ + "versions.yml:md5,7c9694f4a2d8edbd30fd9674566b764c" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-07T13:48:15.295143368" + }, + "FUSIONINSPECTOR - test - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.FusionInspector.fusions.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "FusionInspector.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fusion_inspector_web.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.FusionInspector.fusions.abridged.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "8": [ + "versions.yml:md5,7c9694f4a2d8edbd30fd9674566b764c" + ], + "abridged_tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.FusionInspector.fusions.abridged.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "chckpts_dir": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "fi_workdir": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "html": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fusion_inspector_web.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "igv_inputs": [ + [ + { + "id": "test", + "single_end": false + }, + [ + + ] + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "FusionInspector.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "out_gtf": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gtf:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.FusionInspector.fusions.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7c9694f4a2d8edbd30fd9674566b764c" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-07-18T20:42:22.142466729" + } +} \ No newline at end of file diff --git a/modules/nf-core/fusioninspector/tests/nextflow.config b/modules/nf-core/fusioninspector/tests/nextflow.config new file mode 100644 index 0000000..ca61431 --- /dev/null +++ b/modules/nf-core/fusioninspector/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 
'FUSIONINSPECTOR' { + ext.args = '--annotate --examine_coding_effect' + } +} diff --git a/nextflow.config b/nextflow.config index f69d66c..ecff672 100644 --- a/nextflow.config +++ b/nextflow.config @@ -36,6 +36,7 @@ params { arriba_protein = null arriba_known_fusions = null fusion_tools_cutoff = 1 + fusioninspector_limitSjdbInsertNsj = 1000000 // Dx_tracks references dx_tracks_path = null diff --git a/subworkflows/local/bam_gene_fusion/main.nf b/subworkflows/local/bam_gene_fusion/main.nf index 8a905e0..b657937 100644 --- a/subworkflows/local/bam_gene_fusion/main.nf +++ b/subworkflows/local/bam_gene_fusion/main.nf @@ -1,15 +1,16 @@ -include { STARFUSION_DETECT } from '../../../modules/nf-core/starfusion/detect/main' -include { ARRIBA_ARRIBA } from '../../../modules/nf-core/arriba/arriba/main' -include { FUSIONREPORT_DETECT } from '../../../modules/nf-core/fusionreport/detect/main' +include { STARFUSION_DETECT } from '../../../modules/nf-core/starfusion/detect/main' +include { ARRIBA_ARRIBA } from '../../../modules/nf-core/arriba/arriba/main' +include { FUSIONINSPECTOR } from '../../../modules/nf-core/fusioninspector/main' +include { FUSIONREPORT_DETECT } from '../../../modules/nf-core/fusionreport/detect/main' include { FUSIONREPORT_DOWNLOAD } from '../../../modules/nf-core/fusionreport/download/main' workflow BAM_GENE_FUSION { - take: - ch_star_junctions + ch_star_junctions //: Channel> ch_starfusion_ref ch_bam + ch_fastq ch_genome_fasta ch_genome_gtf arriba_blacklist @@ -49,6 +50,21 @@ workflow BAM_GENE_FUSION { params.fusion_tools_cutoff ) + ch_reads_fusions = ch_fastq + .map{ meta, reads -> [meta, reads] } + .join(FUSIONREPORT_DETECT.out.fusion_list + .map{meta, fusion_list -> [meta,[fusion_list]]}) + + ch_reads_fusions.view() + + ch_starfusion_ref.view() + + FUSIONINSPECTOR( + ch_reads_fusions, + ch_starfusion_ref.map{fusion_ref -> + [[id: "starfusion_index"], fusion_ref]} + ) + emit: starfusion_fusions = STARFUSION_DETECT.out.fusions diff --git 
a/subworkflows/local/bam_gene_fusion/tests/main.nf.test b/subworkflows/local/bam_gene_fusion/tests/main.nf.test index c90a068..83c1705 100644 --- a/subworkflows/local/bam_gene_fusion/tests/main.nf.test +++ b/subworkflows/local/bam_gene_fusion/tests/main.nf.test @@ -26,7 +26,6 @@ nextflow_workflow { [ id:'minigenome_gtf' ], file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/minigenome.gtf') ] - input[2] = file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/CTAT_HumanFusionLib.mini.dat.gz') input[3] = "homo_sapiens" input[4] = file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/Pfam-A.hmm.gz') @@ -43,7 +42,7 @@ nextflow_workflow { } } - test("Run starfusion with junctions") { + test("Run gene fusion subworkflow") { options "-stub" when{ workflow { @@ -59,18 +58,25 @@ nextflow_workflow { file("${TEST_DATA_BASE}/fraser_module/bam/sample1.bam.bai") ]) input[3] = Channel.of([ + [id:'sample1'], + [ + file("sample1_1.fastq.gz"), + file("sample1_2.fastq.gz") + ] + ]) + input[4] = Channel.of([ [ id:'minigenome_fasta' ], file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/minigenome.fa'), file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/minigenome.fa.fai') ]) - input[4] = Channel.of([ + input[5] = Channel.of([ [ id:'minigenome_gtf' ], file(params.nfcore_modules_testdata + 'genomics/homo_sapiens/genome/minigenome.gtf') ]) - input[5] = [] input[6] = [] input[7] = [] input[8] = [] + input[9] = [] """ } } diff --git a/subworkflows/local/fastq_trim_filter_align_dedup/main.nf b/subworkflows/local/fastq_trim_filter_align_dedup/main.nf index abbf412..61d5389 100644 --- a/subworkflows/local/fastq_trim_filter_align_dedup/main.nf +++ b/subworkflows/local/fastq_trim_filter_align_dedup/main.nf @@ -31,18 +31,17 @@ workflow FASTQ_TRIM_FILTER_ALIGN_DEDUP { ch_versions = ch_versions.mix(SORTMERNA_READS.out.versions.first()) - - - STAR_ALIGN( - SORTMERNA_READS.out.reads.map {meta, reads -> + ch_fastq = 
SORTMERNA_READS.out.reads + .map {meta, reads -> def new_id = meta.id.split('_')[0] [meta + [id: new_id], reads]} .groupTuple() - .map{ - meta, reads -> - def reads_flat = reads.flatten() - [meta, reads_flat] - }, + .map { meta, reads -> + def reads_flat = reads.flatten() + [meta, reads_flat]} + + STAR_ALIGN( + ch_fastq, ch_star_index, ch_gtf, star_ignore_sjdbgtf, @@ -91,6 +90,7 @@ workflow FASTQ_TRIM_FILTER_ALIGN_DEDUP { emit: trim_reads = TRIMGALORE.out.reads // channel: [ val(meta), path(fq.gz) ] trim_unpaired = TRIMGALORE.out.unpaired // channel: [ val(meta), path(fq.gz) ] + ch_fastq = ch_fastq trim_html = TRIMGALORE.out.html // channel: [ val(meta), path(html) ] trim_zip = TRIMGALORE.out.zip // channel: [ val(meta), path(zip) ] trim_log = TRIMGALORE.out.log // channel: [ val(meta), path(txt) ] diff --git a/workflows/dxnextflowrna.nf b/workflows/dxnextflowrna.nf index a9cfb25..91b5e0e 100644 --- a/workflows/dxnextflowrna.nf +++ b/workflows/dxnextflowrna.nf @@ -158,11 +158,11 @@ workflow DXNEXTFLOWRNA { if (params.run_gene_fusion){ ch_starfusion_ref = Channel.fromPath(params.starfusion_ref).collect() - BAM_GENE_FUSION( FASTQ_TRIM_FILTER_ALIGN_DEDUP.out.star_align_junction, ch_starfusion_ref, FASTQ_TRIM_FILTER_ALIGN_DEDUP.out.ch_bam_bai, + FASTQ_TRIM_FILTER_ALIGN_DEDUP.out.ch_fastq, ch_fasta_fai, ch_gtf, params.arriba_blacklist, From 373682660471ac3bbd92284bf08d4d6792df5b42 Mon Sep 17 00:00:00 2001 From: Jorisvansteenbrugge <7196110+Jorisvansteenbrugge@users.noreply.github.com> Date: Fri, 20 Feb 2026 11:56:31 +0100 Subject: [PATCH 2/5] test fusion vcf export --- conf/modules.config | 22 + modules/local/hgnc_download/main.nf | 38 ++ modules/local/vcf_collect/main.nf | 46 ++ .../resources/usr/bin/vcf_collect.py | 622 ++++++++++++++++++ subworkflows/local/bam_gene_fusion/main.nf | 26 +- 5 files changed, 749 insertions(+), 5 deletions(-) create mode 100644 modules/local/hgnc_download/main.nf create mode 100644 modules/local/vcf_collect/main.nf create mode 100755 
modules/local/vcf_collect/resources/usr/bin/vcf_collect.py diff --git a/conf/modules.config b/conf/modules.config index 9fd1ae5..95dd3bc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -411,4 +411,26 @@ process { ] } + + withName: HGNC_DOWNLOAD { + cpus = 1 + memory = { 1.GB } + time = { 5.m * task.attempt } + } + + withName: VCF_COLLECT { + cpus = 2 + memory = { 5.GB } + time = { 10.m * task.attempt } + + publishDir = [ + [ + path: "${params.outdir}/fusions/vcf/", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ] + } + + } diff --git a/modules/local/hgnc_download/main.nf b/modules/local/hgnc_download/main.nf new file mode 100644 index 0000000..ee14e2c --- /dev/null +++ b/modules/local/hgnc_download/main.nf @@ -0,0 +1,38 @@ +process HGNC_DOWNLOAD { + tag "hgnc" + label 'process_low' + + conda "bioconda::gnu-wget=1.18" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3b/3b54fa9135194c72a18d00db6b399c03248103f87e43ca75e4b50d61179994b3/data' : + 'community.wave.seqera.io/library/wget:1.21.4--8b0fcde81c17be5e' }" + + output: + path "hgnc_complete_set.txt" , emit: hgnc_ref + path "HGNC-DB-timestamp.txt" , emit: hgnc_date + path "versions.yml" , emit: versions + + + script: + """ + wget --no-check-certificate https://storage.googleapis.com/public-download-files/hgnc/tsv/tsv/hgnc_complete_set.txt + date +%Y-%m-%d/%H:%M > HGNC-DB-timestamp.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + """ + touch "hgnc_complete_set.txt" + touch "HGNC-DB-timestamp.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ + +} diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf new file mode 100644 index 0000000..e43a117 --- /dev/null +++ b/modules/local/vcf_collect/main.nf @@ -0,0 +1,46 @@ +process VCF_COLLECT { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'biocontainers/pandas:1.5.2' }" + + input: + tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report), path(fusionreport_csv) + tuple val(meta2), path(hgnc_ref) + tuple val(meta3), path(hgnc_date) + + output: + path "versions.yml" , emit: versions + tuple val(meta), path("*vcf") , emit: vcf + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --fusionreport_csv $fusionreport_csv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + vcf_collect: 0.1 + HGNC DB retrieval: \$(cat $hgnc_date) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_fusion_data.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + vcf_collect: 0.1 + END_VERSIONS + """ +} diff --git a/modules/local/vcf_collect/resources/usr/bin/vcf_collect.py b/modules/local/vcf_collect/resources/usr/bin/vcf_collect.py new file mode 100755 index 0000000..292c052 --- /dev/null +++ b/modules/local/vcf_collect/resources/usr/bin/vcf_collect.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 + +# Author: Annick Renevey, annick.renevey@scilifelab.se +# License: MIT +# +# This script is part of the rnafusion pipeline. +# For full license and authorship information, see the repository README. 
+#
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+import pandas as pd
+import ast
+import numpy as np
+import csv
+
+logger = logging.getLogger()
+
+
+def vcf_collect(
+    fusioninspector_in_file: str,
+    fusionreport_in_file: str,
+    gtf: str,
+    fusionreport_csv: str,
+    hgnc: str,
+    sample: str,
+    out_file,
+) -> None:
+    """
+    Process FusionInspector and FusionReport data,
+    merge with GTF from FusionInspector and HGNC database,
+    and write a VCF file (the result of write_vcf() is returned unchanged).
+
+    Args:
+        fusioninspector_in_file (str): Path to FusionInspector input file.
+        fusionreport_in_file (str): Path to Fusion-report input file.
+        gtf (str): Path to output GTF file from FusionInspector in TSV format.
+        fusionreport_csv (str): Path to Fusion-report CSV output file.
+        hgnc (str): Path to HGNC file.
+        sample (str): Sample name for the header.
+        out_file: Output VCF file path, handed to write_vcf() as-is.
+
+    Adapted from: https://github.com/J35P312/MegaFusion
+    """
+    merged_df = (
+        build_fusioninspector_dataframe(fusioninspector_in_file)
+        .join(read_build_fusionreport(fusionreport_in_file), how="outer", on="FUSION")
+        .reset_index()
+    )
+    hgnc_df = build_hgnc_dataframe(hgnc)
+    df_symbol = merged_df[merged_df["Left_ensembl_gene_id"].isna()]
+    df_not_symbol = merged_df[merged_df["Left_ensembl_gene_id"].notna()]
+
+    df_not_symbol = hgnc_df.merge(
+        df_not_symbol,
+        how="right",
+        left_on="ensembl_gene_id",
+        right_on="Left_ensembl_gene_id",
+    )
+    df_symbol = hgnc_df.merge(
+        df_symbol, how="right", left_on="symbol", right_on="GeneA"
+    )
+    df = pd.concat([df_not_symbol, df_symbol])
+    df = df.rename(columns={"hgnc_id": "Left_hgnc_id"})
+
+    df_symbol = df[df["Right_ensembl_gene_id"].isna()]
+    df_not_symbol = df[df["Right_ensembl_gene_id"].notna()]
+
+    df_not_symbol = hgnc_df.merge(
+        df_not_symbol,
+        how="right",
+        left_on="ensembl_gene_id",
+        right_on="Right_ensembl_gene_id",
+    )
+    df_symbol = hgnc_df.merge(
+        df_symbol, how="right", left_on="symbol", right_on="GeneB"
+    )
+    df = pd.concat([df_not_symbol, df_symbol])
+    df = df.rename(columns={"hgnc_id": "Right_hgnc_id"})
+
+    gtf_df = build_gtf_dataframe(gtf)
+    all_df = df.merge(
+        gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id"
+    )
+    all_df[["PosA", "orig_start", "orig_end"]] = (
+        all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
+    )
+
+    all_df = all_df[
+        (
+            (all_df["PosA"] >= all_df["orig_start"])
+            & (all_df["PosA"] <= all_df["orig_end"])
+        )
+        | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
+    ]
+
+    all_df["Left_transcript_version"] = all_df["CDS_LEFT_ID"].astype(str).str.split(".").str[-1]
+
+    all_df = all_df.replace("", np.nan)  # assignment form, as used for the right-hand side below
+    all_df = all_df.drop_duplicates()
+
+    all_df[["exon_number", "Left_transcript_version"]] = all_df[
+        ["exon_number", "Left_transcript_version"]
+    ].replace(0, np.nan)
+    # Within each PosA group, propagate known 'exon_number' / 'Left_transcript_version' values into the gaps
+    all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
+        lambda x: x.ffill().bfill()
+    )
+    all_df["Left_transcript_version"] = all_df.groupby("PosA")[
+        "Left_transcript_version"
+    ].transform(lambda x: x.ffill().bfill())
+
+    all_df = all_df.rename(columns={"exon_number": "Left_exon_number"})
+    all_df = all_df[
+        [
+            "FUSION",
+            "GeneA",
+            "GeneB",
+            "PosA",
+            "PosB",
+            "ChromosomeA",
+            "ChromosomeB",
+            "TOOLS_HITS",
+            "SCORE",
+            "FOUND_DB",
+            "FOUND_IN",
+            "JunctionReadCount",
+            "SpanningFragCount",
+            "FFPM",
+            "PROT_FUSION_TYPE",
+            "CDS_LEFT_ID",
+            "CDS_RIGHT_ID",
+            "Left_transcript_version",
+            "Left_exon_number",
+            "Left_hgnc_id",
+            "Right_hgnc_id",
+            "Strand1",
+            "Strand2",
+            "annots",
+        ]
+    ].drop_duplicates()
+    all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str")
+    all_df = all_df.merge(
+        gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id"
+    )
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[
+        ["PosB", "orig_start", "orig_end"]
+    ].fillna(0)
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[
+        ["PosB", "orig_start", "orig_end"]
+    ].astype(int)
+    all_df = all_df[
+        (
+            (all_df["PosB"] >= all_df["orig_start"])
+            & (all_df["PosB"] <= all_df["orig_end"])
+        )
+        | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
+    ]
+
+    all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
+    all_df = all_df.replace("", np.nan)
+
+    all_df["Right_transcript_version"] = all_df["CDS_RIGHT_ID"].astype(str).str.split(".").str[-1]
+
+
+    all_df[["exon_number", "Right_transcript_version"]] = all_df[
+        ["exon_number", "Right_transcript_version"]
+    ].replace(0, np.nan)
+    # Within each PosB group, propagate known 'exon_number' / 'Right_transcript_version' values into the gaps
+    all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
+        lambda x: x.ffill().bfill()
+    )
+    all_df["Right_transcript_version"] = all_df.groupby("PosB")[
+        "Right_transcript_version"
+    ].transform(lambda x: x.ffill().bfill())
+
+    all_df = all_df.rename(columns={"exon_number": "Right_exon_number"})
+
+    all_df = all_df[
+        [
+            "FUSION",
+            "GeneA",
+            "GeneB",
+            "PosA",
+            "PosB",
+            "ChromosomeA",
+            "ChromosomeB",
+            "TOOLS_HITS",
+            "SCORE",
+            "FOUND_DB",
+            "FOUND_IN",
+            "JunctionReadCount",
+            "SpanningFragCount",
+            "FFPM",
+            "PROT_FUSION_TYPE",
+            "CDS_LEFT_ID",
+            "CDS_RIGHT_ID",
+            "Left_transcript_version",
+            "Left_exon_number",
+            "Left_hgnc_id",
+            "Right_transcript_version",
+            "Right_exon_number",
+            "Right_hgnc_id",
+            "Strand1",
+            "Strand2",
+            "annots",
+        ]
+    ].drop_duplicates()
+    all_df = all_df.rename(columns={"FUSION": "Fusion"})
+    all_df = all_df.set_index("Fusion")
+
+    all_df = all_df.combine_first(read_fusionreport_csv(fusionreport_csv))
+
+    return write_vcf(column_manipulation(all_df), header_def(sample), out_file)
+
+
+def parse_args(argv=None):
+    """Define and immediately parse command line arguments (argv=None lets argparse read sys.argv)."""
+    parser = argparse.ArgumentParser(
+        description="Collect FusionInspector and fusion-report results into a single VCF file.",
+        epilog="Example: vcf_collect.py --fusioninspector FI.tsv --fusionreport index.html --fusioninspector_gtf FI.gtf.tsv --fusionreport_csv fusions.csv --hgnc hgnc_complete_set.txt --sample SAMPLE --out SAMPLE_fusion_data.vcf",
+    )
+    parser.add_argument(
+        "--fusioninspector",
+        metavar="FUSIONINSPECTOR",
+        type=Path,
+        help="FusionInspector output in TSV format.",
+    )
+    parser.add_argument(
+        "--fusionreport",
+        metavar="FUSIONREPORT",
+        type=Path,
+        help="Fusionreport output in index/html format.",
+    )
+    parser.add_argument(
+        "--fusionreport_csv",
+        metavar="FUSIONREPORT_CSV",
+        type=Path,
+        help="Fusionreport output in CSV format.",
+    )
+    parser.add_argument(
+        "--fusioninspector_gtf",
+        metavar="GTF",
+        type=Path,
+        help="FusionInspector GTF output.",
+    )
+    parser.add_argument(
+        "--hgnc",
+        metavar="HGNC",
+        type=Path,
+        help="HGNC database.",
+    )
+    parser.add_argument(
+        "--sample", metavar="SAMPLE", type=str, help="Sample name.", default="Sample"
+    )
+    parser.add_argument(
+        "--out",
+        metavar="OUT",
+        type=Path,
+        help="VCF output path.",
+    )
+    return parser.parse_args(argv)
+
+
+def header_def(sample: str) -> str:
+    """
+    Build the VCF header; `sample` becomes the last column name of the #CHROM line. NOTE(review): the ALT/INFO/FORMAT lines below are empty after '=' in this copy - confirm against upstream.
+    """
+    return '##fileformat=VCFv4.1\n\
+##ALT=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##INFO=\n\
+##FORMAT=\n\
+##FORMAT=\n\
+##FORMAT=\n\
+##FORMAT=\n\
+#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
+        sample
+    )
+
+
+def convert_to_list(annots_str: str) -> list:  # returns np.nan (not a list) when the value is not a Python literal
+    try:
+        return ast.literal_eval(annots_str)
+    except (SyntaxError, ValueError):
+        return np.nan
+
+
+def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
+    """
+    Read the tab-separated FusionInspector output, split breakpoint/gene columns, flatten 'annots', and set 'FUSION' as the index.
+    """
+    df = pd.read_csv(file, sep="\t")
+    df = df.rename(columns={"#FusionName": "FUSION"})
+    if not df.empty:
+        df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(
+            ":", expand=True
+        )
+        df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(
+            ":", expand=True
+        )
+        df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split(
+            "^", expand=True, regex=False
+        )
+        df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split(
+            "^", expand=True, regex=False
+        )
+        df["annots"] = (
+            df["annots"]
+            .apply(convert_to_list)
+            .apply(
+                lambda x: (
+                    ",".join(map(str, x))
+                    if isinstance(x, list)
+                    else str(x) if pd.notna(x) else ""
+                )
+            )
+        )
+    else:
+        for i in [
+            "ChromosomeA",
+            "Strand1",
+            "ChromosomeB",
+            "Strand2",
+            "LeftGeneName",
+            "Left_ensembl_gene_id",
+            "RightGeneName",
+            "Right_ensembl_gene_id",
+            "annots",
+        ]:
+            df[i] = ""
+        for j in [
+            "PosA",
+            "PosB",
+        ]:
+            df[j] = np.nan
+
+    return df.set_index(["FUSION"])
+
+
+def replace_value_with_column_name(
+    row: pd.Series, value_to_replace: str, column_name: str
+) -> str:
+    """
+    Return `column_name` if the row holds `value_to_replace` under that label, otherwise an empty string.
+    """
+    new_values = ""
+    for col_name, value in row.items():
+        if col_name == column_name:
+            if value == value_to_replace:
+                new_values = col_name
+            else:
+                new_values = ""
+    return new_values
+
+
+def concatenate_columns(row: pd.Series) -> str:
+    """
+    Concatenate non-empty values in a row into a single string separated by commas.
+    """
+    non_empty_values = [str(value) for value in row if value != ""]
+    return ",".join(non_empty_values)
+
+
+def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
+    """
+    Read and preprocess fusion-report data from a file, including handling missing tool columns,
+    getting the columns with each tool and create a new FOUND_IN column with all the tool hits.
+    Convert the list of databases in FOUND_DB into a joined string with a comma separator.
+ Make all column headers uppercase. + """ + with open(fusionreport_file) as f: + from_html = [ + line.split('rows": ')[1] for line in f if 'name="fusion_list' in line + ] + tmp = str(from_html)[2:] + tmp2 = tmp.split(', "tools": ')[0] + fusion_report = pd.DataFrame(ast.literal_eval(tmp2)) + if not "arriba" in fusion_report.columns: + fusion_report["arriba"] = "" + if not "fusioncatcher" in fusion_report.columns: + fusion_report["fusioncatcher"] = "" + if not "starfusion" in fusion_report.columns: + fusion_report["starfusion"] = "" + fusion_report["arriba"] = fusion_report[["arriba"]].apply( + replace_value_with_column_name, args=("true", "arriba"), axis=1 + ) + fusion_report["fusioncatcher"] = fusion_report[["fusioncatcher"]].apply( + replace_value_with_column_name, args=("true", "fusioncatcher"), axis=1 + ) + fusion_report["starfusion"] = fusion_report[["starfusion"]].apply( + replace_value_with_column_name, args=("true", "starfusion"), axis=1 + ) + fusion_report["FOUND_IN"] = fusion_report[ + ["arriba", "starfusion", "fusioncatcher"] + ].apply(concatenate_columns, axis=1) + fusion_report.columns = fusion_report.columns.str.upper() + fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply( + lambda x: ",".join(x) if len(x) > 0 else "" + ) + fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split( + "--", expand=True + ) + + return fusion_report[ + ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"] + ].set_index(["FUSION"]) + + +def read_fusionreport_csv(file: str) -> pd.DataFrame: + df = pd.read_csv(file) + columns_to_iterate = ["starfusion", "arriba", "fusioncatcher"] + for column in columns_to_iterate: + if column not in df.columns: + df[column] = "" + df[["starfusion", "arriba", "fusioncatcher"]] = df[ + ["starfusion", "arriba", "fusioncatcher"] + ].astype("str") + for index, row in df.iterrows(): + for column in columns_to_iterate: + cell_value = row[column] + + if "#" in cell_value: + df.at[index, column] = 
df.at[index, column].split(",")[0] + df.at[index, column] = df.at[index, column].replace("position: ", "") + df.at[index, "A"] = df.at[index, column].split("#")[0] + df.at[index, "B"] = df.at[index, column].split("#")[1] + df.at[index, "ChromosomeA"] = df.at[index, "A"].split(":")[0] + df.at[index, "PosA"] = df.at[index, "A"].split(":")[1] + if "+" in df.at[index, "A"] or "-" in df.at[index, "A"]: + df.at[index, "StrandA"] = df.at[index, "A"].split(":")[2] + else: + df.at[index, "StrandA"] = "" + + df.at[index, "ChromosomeB"] = df.at[index, "B"].split(":")[0] + df.at[index, "PosB"] = df.at[index, "B"].split(":")[1] + if "+" in df.at[index, "B"] or "-" in df.at[index, "B"]: + df.at[index, "StrandB"] = df.at[index, "B"].split(":")[2] + else: + df.at[index, "StrandB"] = "" + + break + df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True) + df = df.set_index("Fusion") + df.to_csv("tmp.csv") + return df[ + [ + "GeneA", + "GeneB", + "ChromosomeA", + "PosA", + "StrandA", + "ChromosomeB", + "PosB", + "StrandB", + ] + ] + + +def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: + """ + Manipulate and prepare DataFrame for VCF file creation. + """ + df["ALT"] = "" + df = df.reset_index() + df["FORMAT"] = "GT:DV:RV:FFPM" + df["ID"] = "." + df["QUAL"] = "." 
+ df["FILTER"] = "PASS" + df["REF"] = "N" + df["INFO"] = "" + df["Sample"] = "" + df["Strand1"] = df["Strand1"].astype(str) + df["JunctionReadCount"] = df["JunctionReadCount"].fillna(0).astype(int).astype(str) + df["SpanningFragCount"] = df["SpanningFragCount"].fillna(0).astype(int).astype(str) + df["FFPM"] = df["FFPM"].fillna(0).astype(float).astype(str) + df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(str) + df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(str) + df["Left_hgnc_id"] = df["Left_hgnc_id"].fillna(0).astype(int).astype(str) + df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str) + df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) + df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str) + df["Left_transcript_version"] = ( + pd.to_numeric(df["Left_transcript_version"], errors="coerce").fillna(0).astype(int).astype(str) + ) + df["Right_transcript_version"] = ( + pd.to_numeric(df["Right_transcript_version"], errors="coerce").fillna(0).astype(int).astype(str) + ) + df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str) + df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str) + df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan") + df["CDS_LEFT_ID"] = df["CDS_LEFT_ID"].replace(".", "nan") + df["CDS_RIGHT_ID"] = df["CDS_RIGHT_ID"].replace(".", "nan") + + for index, row in df.iterrows(): + if row["Strand1"] == "-" and row["Strand2"] == "-": + df.loc[index, "ALT"] = f'[{row["ChromosomeB"]}:{row["PosB"]}[N' + elif row["Strand1"] == "+" and row["Strand2"] == "-": + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' + elif row["Strand1"] == "-" and row["Strand2"] == "+": + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' + else: + df.loc[index, "ALT"] = f'N[{row["ChromosomeB"]}:{row["PosB"]}[' + + df.loc[index, "INFO"] = ( + 
f"SVTYPE=BND;CHRA={row['ChromosomeA']};CHRB={row['ChromosomeB']};GENEA={row['GeneA']};GENEB={row['GeneB']};" + f"POSA={row['PosA']};POSB={row['PosB']};ORIENTATION={row['Strand1']},{row['Strand2']};FOUND_DB={row['FOUND_DB']};" + f"FOUND_IN={row['FOUND_IN']};TOOL_HITS={row['TOOLS_HITS']};SCORE={row['SCORE']};FRAME_STATUS={row['PROT_FUSION_TYPE']};" + f"TRANSCRIPT_ID_A={row['CDS_LEFT_ID']};TRANSCRIPT_ID_B={row['CDS_RIGHT_ID']};" + f"TRANSCRIPT_VERSION_A={row['Left_transcript_version']};TRANSCRIPT_VERSION_B={row['Right_transcript_version']};" + f"HGNC_ID_A={row['Left_hgnc_id']};HGNC_ID_B={row['Right_hgnc_id']};" + f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};" + f"ANNOTATIONS={row['annots']}" + ) + df.loc[index, "Sample"] = ( + f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + ) + + return df + + +def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: + """ + Write a VCF file with a specified DataFrame, header, and output file path. + """ + df_to_print[ + [ + "ChromosomeA", + "PosA", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + "Sample", + ] + ].to_csv( + path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE + ) + + with open(out_file, "r+") as f: + content = f.read() + f.seek(0, 0) + f.write(header.rstrip("\r\n") + "\n" + content) + + +def build_hgnc_dataframe(file: str) -> pd.DataFrame: + """ + Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. + """ + df = pd.read_csv(file, sep="\t", low_memory=False) + df["hgnc_id"] = df["hgnc_id"].str.replace("HGNC:", "") + return df[["hgnc_id", "ensembl_gene_id", "symbol"]].dropna() + + +def build_gtf_dataframe(file: str) -> pd.DataFrame: + """ + Build a DataFrame from GTF file converted in TSV, extracting relevant columns. 
+ """ + df = pd.read_csv(file, sep="\t") + df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split( + "^", expand=True + ) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df[ + "orig_coord_info" + ].str.split(",", expand=True) + return df[ + ["Transcript_id", "exon_number", "orig_start", "orig_end"] + ] + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + if ( + not args.fusioninspector.is_file() + or not args.fusionreport.is_file() + or not args.fusioninspector_gtf + or not args.fusionreport_csv + or not args.hgnc + ): + logger.error( + f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!" + ) + sys.exit(2) + vcf_collect( + args.fusioninspector, + args.fusionreport, + args.fusioninspector_gtf, + args.fusionreport_csv, + args.hgnc, + args.sample, + args.out, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/subworkflows/local/bam_gene_fusion/main.nf b/subworkflows/local/bam_gene_fusion/main.nf index b657937..75d1539 100644 --- a/subworkflows/local/bam_gene_fusion/main.nf +++ b/subworkflows/local/bam_gene_fusion/main.nf @@ -3,7 +3,8 @@ include { ARRIBA_ARRIBA } from '../../../modules/nf-core/arriba/arriba/m include { FUSIONINSPECTOR } from '../../../modules/nf-core/fusioninspector/main' include { FUSIONREPORT_DETECT } from '../../../modules/nf-core/fusionreport/detect/main' include { FUSIONREPORT_DOWNLOAD } from '../../../modules/nf-core/fusionreport/download/main' - +include { VCF_COLLECT } from '../../../modules/local/vcf_collect/main' +include { HGNC_DOWNLOAD } from '../../../modules/local/hgnc_download/main' workflow BAM_GENE_FUSION { take: @@ -54,10 +55,6 @@ workflow BAM_GENE_FUSION { .map{ meta, reads -> [meta, reads] } .join(FUSIONREPORT_DETECT.out.fusion_list .map{meta, fusion_list -> [meta,[fusion_list]]}) - - ch_reads_fusions.view() - - ch_starfusion_ref.view() FUSIONINSPECTOR( ch_reads_fusions, @@ -65,6 +62,25 @@ 
workflow BAM_GENE_FUSION { [[id: "starfusion_index"], fusion_ref]} ) + HGNC_DOWNLOAD() + + FUSIONINSPECTOR.out.tsv + .join(FUSIONINSPECTOR.out.out_gtf) + .join(FUSIONREPORT_DETECT.out.report) + .join(FUSIONREPORT_DETECT.out.csv).view() + + + VCF_COLLECT( + FUSIONINSPECTOR.out.tsv + .join(FUSIONINSPECTOR.out.out_gtf) + .join(FUSIONREPORT_DETECT.out.report) + .join(FUSIONREPORT_DETECT.out.csv), + HGNC_DOWNLOAD.out.hgnc_ref + .map{ hgnc_ref -> [[id: hgnc_ref.getSimpleName()], hgnc_ref]}, + HGNC_DOWNLOAD.out.hgnc_date + .map{ hgnc_date -> [[id: hgnc_date.getSimpleName()], hgnc_date]} + ) + emit: starfusion_fusions = STARFUSION_DETECT.out.fusions From 44df2a5280e22c89c396caf61392c80604270790 Mon Sep 17 00:00:00 2001 From: Jorisvansteenbrugge <7196110+Jorisvansteenbrugge@users.noreply.github.com> Date: Fri, 20 Feb 2026 13:17:08 +0100 Subject: [PATCH 3/5] debug statements --- subworkflows/local/bam_gene_fusion/main.nf | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/subworkflows/local/bam_gene_fusion/main.nf b/subworkflows/local/bam_gene_fusion/main.nf index 75d1539..3ae9669 100644 --- a/subworkflows/local/bam_gene_fusion/main.nf +++ b/subworkflows/local/bam_gene_fusion/main.nf @@ -64,10 +64,12 @@ workflow BAM_GENE_FUSION { HGNC_DOWNLOAD() - FUSIONINSPECTOR.out.tsv - .join(FUSIONINSPECTOR.out.out_gtf) - .join(FUSIONREPORT_DETECT.out.report) - .join(FUSIONREPORT_DETECT.out.csv).view() + FUSIONINSPECTOR.out.tsv.view() + + + + FUSIONREPORT_DETECT.out.report.view() + VCF_COLLECT( From 04e6c9e5f04170099f354ed96267c9afc62b5068 Mon Sep 17 00:00:00 2001 From: Jorisvansteenbrugge <7196110+Jorisvansteenbrugge@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:18:06 +0100 Subject: [PATCH 4/5] add agat gff2tsv --- modules.json | 5 ++ .../agat/convertspgff2tsv/environment.yml | 5 ++ modules/nf-core/agat/convertspgff2tsv/main.nf | 45 ++++++++++++++ .../nf-core/agat/convertspgff2tsv/meta.yml | 54 +++++++++++++++++ .../agat/convertspgff2tsv/tests/main.nf.test 
| 59 +++++++++++++++++++ .../convertspgff2tsv/tests/main.nf.test.snap | 48 +++++++++++++++ 6 files changed, 216 insertions(+) create mode 100644 modules/nf-core/agat/convertspgff2tsv/environment.yml create mode 100644 modules/nf-core/agat/convertspgff2tsv/main.nf create mode 100644 modules/nf-core/agat/convertspgff2tsv/meta.yml create mode 100644 modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test create mode 100644 modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test.snap diff --git a/modules.json b/modules.json index fcd52fc..f653380 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "agat/convertspgff2tsv": { + "branch": "master", + "git_sha": "55ed5f4aabcdcd7a4cc44e1700f956756c7e8532", + "installed_by": ["modules"] + }, "arriba/arriba": { "branch": "master", "git_sha": "025a6f75e1f72e4ab60abb4bd65b3f289d4ad910", diff --git a/modules/nf-core/agat/convertspgff2tsv/environment.yml b/modules/nf-core/agat/convertspgff2tsv/environment.yml new file mode 100644 index 0000000..0e342bf --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::agat=1.5.1" diff --git a/modules/nf-core/agat/convertspgff2tsv/main.nf b/modules/nf-core/agat/convertspgff2tsv/main.nf new file mode 100644 index 0000000..7e7f946 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/main.nf @@ -0,0 +1,45 @@ +process AGAT_CONVERTSPGFF2TSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/03/033434db0bd6ba28660401e1059286f36641fd8ce55faa11973fe5eaf312adcd/data' : + 'community.wave.seqera.io/library/agat:1.5.1--ae3cd948ce5e9795' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + agat_convert_sp_gff2tsv.pl \\ + --gff ${gff} \\ + --output ${prefix}.tsv \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2tsv.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2tsv.pl --help | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/convertspgff2tsv/meta.yml b/modules/nf-core/agat/convertspgff2tsv/meta.yml new file mode 100644 index 0000000..4c80ee8 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/meta.yml @@ -0,0 +1,54 @@ +name: agat_convertspgff2tsv +description: | + Converts a GFF/GTF file into a TSV file +keywords: + - genome + - gff + - gtf + - conversion + - tsv +tools: + - agat: + description: "AGAT is a toolkit for manipulation and getting information from + GFF/GTF files" + homepage: "https://github.com/NBISweden/AGAT" + documentation: "https://agat.readthedocs.io/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] + identifier: biotools:AGAT +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gff: + type: file + description: Annotation file in GFF3/GTF format + pattern: "*.{gff, gtf}" + ontologies: + - edam: "http://edamontology.org/format_1975" # GFF3 + - edam: "http://edamontology.org/format_2306" # GTF +output: + tsv: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.tsv": + type: file + description: Annotation file in TSV format + pattern: "*.{gtf}" + ontologies: + - edam: http://edamontology.org/format_2306 # GTF + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML +authors: + - "@rannick" +maintainers: + - "@gallvp" diff --git a/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test b/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test new file mode 100644 index 0000000..6a2e894 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test @@ -0,0 +1,59 @@ +nextflow_process { + + name "Test Process AGAT_CONVERTSPGFF2TSV" + script "../main.nf" + process "AGAT_CONVERTSPGFF2TSV" + + tag "modules" + tag "modules_nfcore" + tag "agat" + tag "agat/convertspgff2tsv" + + test("sarscov2 - genome [gff3]") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - genome [gff3] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.gff3', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tsv.collect { file(it[1]).getName() } + + process.out.versions).match() } + ) + } + + 
} + +} diff --git a/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test.snap b/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test.snap new file mode 100644 index 0000000..581e56f --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/tests/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "sarscov2 - genome [gff3] - stub": { + "content": [ + [ + "test.tsv", + "versions.yml:md5,e17c06a74b2cfa52ea1ef4703dae4ee3" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.04.8" + }, + "timestamp": "2025-10-17T11:13:08.280583" + }, + "sarscov2 - genome [gff3]": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tsv:md5,8373d2035689d23694f87606116cdccd" + ] + ], + "1": [ + "versions.yml:md5,e17c06a74b2cfa52ea1ef4703dae4ee3" + ], + "tsv": [ + [ + { + "id": "test" + }, + "test.tsv:md5,8373d2035689d23694f87606116cdccd" + ] + ], + "versions": [ + "versions.yml:md5,e17c06a74b2cfa52ea1ef4703dae4ee3" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.04.8" + }, + "timestamp": "2025-10-17T11:13:02.775461" + } +} \ No newline at end of file From 4b457345de2a8bdd829a141fba25b6ee026e4987 Mon Sep 17 00:00:00 2001 From: Jorisvansteenbrugge <7196110+Jorisvansteenbrugge@users.noreply.github.com> Date: Fri, 20 Feb 2026 17:24:16 +0100 Subject: [PATCH 5/5] VCF collect export --- conf/modules.config | 3 ++- subworkflows/local/bam_gene_fusion/main.nf | 17 ++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 95dd3bc..3261deb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -55,7 +55,7 @@ process { ext.args = { params.fusioninspector_limitSjdbInsertNsj != 1000000 ? 
"--STAR_xtra_params \"--limitSjdbInsertNsj ${params.fusioninspector_limitSjdbInsertNsj}\"" : '' } ext.args2 = '--annotate --examine_coding_effect' publishDir = [ - path: { "${params.outdir}/fusioninspector/${meta.id}" }, + path: { "${params.outdir}/fusions/fusioninspector/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -246,6 +246,7 @@ process { } withName: FUSIONREPORT_DETECT { ext.args = {'--no-cosmic'} + ext.args2 = "--export csv" publishDir = [ path: { "${params.outdir}/fusions/fusion_report/${meta.id}/" }, diff --git a/subworkflows/local/bam_gene_fusion/main.nf b/subworkflows/local/bam_gene_fusion/main.nf index 3ae9669..ff0278f 100644 --- a/subworkflows/local/bam_gene_fusion/main.nf +++ b/subworkflows/local/bam_gene_fusion/main.nf @@ -1,4 +1,5 @@ include { STARFUSION_DETECT } from '../../../modules/nf-core/starfusion/detect/main' +include { AGAT_CONVERTSPGFF2TSV } from '../../../modules/nf-core/agat/convertspgff2tsv/main' include { ARRIBA_ARRIBA } from '../../../modules/nf-core/arriba/arriba/main' include { FUSIONINSPECTOR } from '../../../modules/nf-core/fusioninspector/main' include { FUSIONREPORT_DETECT } from '../../../modules/nf-core/fusionreport/detect/main' @@ -64,17 +65,15 @@ workflow BAM_GENE_FUSION { HGNC_DOWNLOAD() - FUSIONINSPECTOR.out.tsv.view() - - - - FUSIONREPORT_DETECT.out.report.view() - - + AGAT_CONVERTSPGFF2TSV( + FUSIONINSPECTOR.out.out_gtf + .filter { _meta, file -> file.exists() && file.size() > 0 } + ) VCF_COLLECT( - FUSIONINSPECTOR.out.tsv - .join(FUSIONINSPECTOR.out.out_gtf) + FUSIONINSPECTOR.out.abridged_tsv + .filter{ _meta, file -> file.exists() && file.size() > 0 } + .join(AGAT_CONVERTSPGFF2TSV.out.tsv) .join(FUSIONREPORT_DETECT.out.report) .join(FUSIONREPORT_DETECT.out.csv), HGNC_DOWNLOAD.out.hgnc_ref