Skip to content

Commit e340200

Browse files
New fasta_bgzip_index_dict_samtools subworkflow (#11783)
* Template 4.0.2
* New `fasta_bgzip_index_dict_samtools` subworkflow

  Here is a new `fasta_bgzip_index_dict_samtools` subworkflow that takes FASTA files as input (compressed or not) and returns them BGZF-compressed, with their flat and binary indexes, and their sequence dictionary. This subworkflow is especially useful when making CRAM files.
* Also collect the sequence size summary.
1 parent b871d05 commit e340200

5 files changed

Lines changed: 158 additions & 0 deletions

File tree

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
include { HTSLIB_BGZIPTABIX } from '../../../modules/nf-core/htslib/bgziptabix/main'
2+
include { SAMTOOLS_DICT } from '../../../modules/nf-core/samtools/dict/main'
3+
include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main'
4+
5+
workflow FASTA_BGZIP_INDEX_DICT_SAMTOOLS {

    take:
    ch_fasta // channel: [ val(meta), fasta ]

    main:

    //
    // Step 1: run HTSLIB_BGZIPTABIX in 'compress' mode on every FASTA.
    // Each [ meta, fasta ] tuple is padded with two empty slots to reach the
    // four-element shape passed to the module.
    // NOTE(review): the meaning of the two padded slots and of the two trailing
    // empty arguments is defined by the module, not here - confirm against its
    // input declaration.
    //
    ch_to_compress = ch_fasta.map { meta, fa -> [ meta, fa, [], [] ] }

    HTSLIB_BGZIPTABIX ( ch_to_compress, 'compress', [], [] )

    // The compressed FASTA feeds both samtools steps and the final join.
    ch_bgzf = HTSLIB_BGZIPTABIX.out.output

    //
    // Step 2: index the compressed FASTA with SAMTOOLS_FAIDX.
    // The tuple gets an empty third slot.
    // NOTE(review): the `true` flag presumably enables the `sizes` output that
    // is joined below - confirm against the module.
    //
    ch_to_index = ch_bgzf.map { meta, fa_gz -> [ meta, fa_gz, [] ] }

    SAMTOOLS_FAIDX ( ch_to_index, true )

    //
    // Step 3: build the sequence dictionary from the same compressed FASTA.
    //
    SAMTOOLS_DICT ( ch_bgzf )

    //
    // Step 4: gather everything into one tuple per meta.
    // The join order fixes the element order of the emitted tuple.
    //
    ch_with_fai   = ch_bgzf.join(SAMTOOLS_FAIDX.out.fai)
    ch_with_gzi   = ch_with_fai.join(SAMTOOLS_FAIDX.out.gzi)
    ch_with_sizes = ch_with_gzi.join(SAMTOOLS_FAIDX.out.sizes)
    ch_complete   = ch_with_sizes.join(SAMTOOLS_DICT.out.dict)

    emit:
    fasta_fai_gzi_dict = ch_complete // channel: [ val(meta), fasta.gz, fai, gzi, sizes, dict ]
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
2+
name: "fasta_bgzip_index_dict_samtools"
3+
description: Ensure each FASTA file is BGZF-compressed, then index it and produce a sequence size summary and a sequence dictionary.
4+
keywords:
5+
- bgzip
6+
- faidx
7+
- dict
8+
- fasta
9+
components:
10+
- htslib/bgziptabix
11+
- samtools/dict
12+
- samtools/faidx
13+
14+
input:
15+
- ch_fasta:
16+
type: file
17+
description: |
18+
FASTA file, compressed or not.
19+
Structure: [ val(meta), path(fasta) ]
20+
pattern: "*.{fa,fa.gz,fa.bz2,fa.xz,fasta,fasta.gz,fasta.bz2,fasta.xz}"
21+
22+
output:
23+
- fasta_fai_gzi_dict:
24+
type: file
25+
description: |
26+
Channel containing:
27+
- BGZF-compressed FASTA
28+
- FASTA index (.fai)
29+
- BGZF index (.gzi)
30+
- sequence size summary (.sizes)
31+
- sequence dictionary (.dict)
32+
Structure:
33+
[ val(meta), path(fasta_gz), path(fai), path(gzi), path(sizes), path(dict) ]
34+
pattern: "*"
35+
36+
authors:
37+
- "@charles-plessy"
38+
maintainers:
39+
- "@charles-plessy"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// nf-test specification for the FASTA_BGZIP_INDEX_DICT_SAMTOOLS subworkflow.
// It runs the subworkflow on a single uncompressed FASTA and compares all of
// its outputs against the stored snapshot.
nextflow_workflow {

    name "Test Subworkflow FASTA_BGZIP_INDEX_DICT_SAMTOOLS"
    script "../main.nf"
    workflow "FASTA_BGZIP_INDEX_DICT_SAMTOOLS"

    tag "subworkflows"
    tag "subworkflows_nfcore"
    tag "subworkflows/fasta_bgzip_index_dict_samtools"
    tag "htslib"
    tag "htslib/bgziptabix"
    tag "samtools"
    tag "samtools/dict"
    tag "samtools/faidx"

    // Test-only process configuration, loaded from the sibling nextflow.config.
    config "./nextflow.config"

    test("sarscov2 - fasta - genomes") {

        when {
            workflow {
                """
                input[0] = channel.of([
                    [ id:'genome_complete' ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
                ])
                """
            }
        }
        then {
            // Both checks live inside assertAll so that a failed run and a
            // snapshot mismatch are reported together instead of the first
            // failure hiding the second.
            assertAll(
                { assert workflow.success },
                { assert snapshot(
                    workflow.out
                    ).match() }
            )
        }
    }
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{
2+
"sarscov2 - fasta - genomes": {
3+
"content": [
4+
{
5+
"0": [
6+
[
7+
{
8+
"id": "genome_complete"
9+
},
10+
"genome_complete.gz:md5,6e9fe4042a72f2345f644f239272b7e6",
11+
"genome_complete.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5",
12+
"genome_complete.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474",
13+
"genome_complete.gz.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c",
14+
"genome_complete.gz.dict:md5,7259d9fba4f0029e294b70a7bf05af6a"
15+
]
16+
],
17+
"fasta_fai_gzi_dict": [
18+
[
19+
{
20+
"id": "genome_complete"
21+
},
22+
"genome_complete.gz:md5,6e9fe4042a72f2345f644f239272b7e6",
23+
"genome_complete.gz.fai:md5,9da2a56e2853dc8c0b86a9e7229c9fe5",
24+
"genome_complete.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474",
25+
"genome_complete.gz.sizes:md5,a57c401f27ae5133823fb09fb21c8a3c",
26+
"genome_complete.gz.dict:md5,7259d9fba4f0029e294b70a7bf05af6a"
27+
]
28+
]
29+
}
30+
],
31+
"timestamp": "2026-05-28T11:12:13.641917963",
32+
"meta": {
33+
"nf-test": "0.9.5",
34+
"nextflow": "25.10.4"
35+
}
36+
}
37+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
process {
    // SAMTOOLS_DICT is given explicit -u and -a values (a relative path to the
    // FASTA and the sample id). Without them the output contains temporary
    // folder names in the file path, so it would not be deterministic.
    withName: 'SAMTOOLS_DICT' {
        ext.args = { "-u ./${fasta} -a ${meta.id}" }
    }
}

0 commit comments

Comments
 (0)