Skip to content

Commit 52c4bf3

Browse files
committed
Add local module dshbio/fastatoparquet
1 parent e37c63d commit 52c4bf3

5 files changed

Lines changed: 114 additions & 0 deletions

File tree

conf/base.config

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,7 @@ process {
6262
withName: SEQKIT_STATS {
6363
ext.args = ' ' // turn off --all default argument
6464
}
65+
// Extra command-line arguments for the DSHBIO_FASTATOPARQUET process; the process
// reads them via task.ext.args and inserts them into its dsh-bio command line.
// NOTE(review): presumably tells dsh-bio to parse the input as protein sequences — confirm against dsh-bio docs.
withName: DSHBIO_FASTATOPARQUET {
66+
ext.args = '--alphabet protein'
67+
}
6568
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
# Conda channels listed for this environment.
channels:
4+
- conda-forge
5+
- bioconda
6+
# Single dependency: dsh-bio from the bioconda channel, pinned to version 3.0
# (the same version that appears in the module's container image tags).
dependencies:
7+
- bioconda::dsh-bio=3.0
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Nextflow process that runs `dsh-bio fasta-to-parquet3` on one FASTA input per
// task and emits the resulting `<prefix>.sequences.parquet` path together with a
// versions.yml file recording the dsh-bio version.
//
// Inputs:  tuple of (meta map, FASTA path); meta.id is used for the task tag and
//          as the default output prefix.
// Outputs: `parquet`  - tuple of (meta, path matching *.sequences.parquet)
//          `versions` - versions.yml
process DSHBIO_FASTATOPARQUET {
2+
tag "${meta.id}"
3+
label 'process_medium'
4+
5+
// Software is provided either by the module-local conda environment file or by a container.
conda "${moduleDir}/environment.yml"
6+
// Container choice: the Galaxy depot Singularity image is used when the engine is
// singularity and task.ext.singularity_pull_docker_container is not set; otherwise
// the biocontainers image name is used. Both are tagged 3.0--hdfd78af_0.
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7+
'https://depot.galaxyproject.org/singularity/dsh-bio:3.0--hdfd78af_0' :
8+
'biocontainers/dsh-bio:3.0--hdfd78af_0' }"
9+
10+
input:
11+
tuple val(meta), path(fasta)
12+
13+
output:
14+
tuple val(meta), path("*.sequences.parquet"), emit: parquet
15+
path "versions.yml" , emit: versions
16+
17+
// Run unless task.ext.when is explicitly set to a falsy value.
when:
18+
task.ext.when == null || task.ext.when
19+
20+
script:
21+
// Optional extra CLI arguments from configuration; empty string when unset.
def args = task.ext.args ?: ''
22+
// Output name prefix; defaults to the sample id from the meta map.
def prefix = task.ext.prefix ?: "${meta.id}"
23+
"""
24+
dsh-bio \\
25+
fasta-to-parquet3 \\
26+
$args \\
27+
-i $fasta \\
28+
-o ${prefix}.sequences.parquet
29+
30+
cat <<-END_VERSIONS > versions.yml
31+
"${task.process}":
32+
dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ')
33+
END_VERSIONS
34+
"""
35+
36+
// Stub: creates an empty output directory instead of running the conversion,
// but still calls dsh-bio to write versions.yml.
stub:
37+
// NOTE(review): args is defined here but not referenced in the stub command below.
def args = task.ext.args ?: ''
38+
def prefix = task.ext.prefix ?: "${meta.id}"
39+
"""
40+
mkdir -p ${prefix}.sequences.parquet
41+
42+
cat <<-END_VERSIONS > versions.yml
43+
"${task.process}":
44+
dshbio: \$(dsh-bio --version 2>&1 | grep -o 'dsh-bio-tools .*' | cut -f2 -d ' ')
45+
END_VERSIONS
46+
"""
47+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
3+
name: "dshbio_fastatoparquet"
4+
description: Convert DNA or protein sequences in FASTA format to Parquet format
5+
keywords:
6+
- fasta
7+
- parquet
8+
- sequence
9+
tools:
10+
- dshbio:
11+
description: |
12+
Reads, features, variants, assemblies, alignments, genomic range trees, pangenome
13+
graphs, and a bunch of random command line tools for bioinformatics. LGPL version 3
14+
or later.
15+
homepage: https://github.com/heuermh/dishevelled-bio
16+
documentation: https://github.com/heuermh/dishevelled-bio
17+
doi: "10.5281/zenodo.15027131"
18+
licence: ["LGPL-3.0-or-later"]
19+
20+
input:
21+
- - meta:
22+
type: map
23+
description: |
24+
Groovy Map containing sample information
25+
e.g. [ id:'test', single_end:false ]
26+
- fasta:
27+
type: file
28+
description: DNA or protein sequences in compressed FASTA format
29+
# Glob brace alternatives are comma-separated; a "|" inside braces is not an alternation operator.
pattern: "*.fasta.{gz,zst,bgz,bgzf,bzip2}"
30+
31+
output:
32+
- parquet:
33+
- meta:
34+
type: map
35+
description: |
36+
Groovy Map containing sample information
37+
e.g. [ id:'test', single_end:false ]
38+
- "*.sequences.parquet":
39+
type: directory
40+
description: |
41+
Directory of DNA or protein sequences in Parquet format with zstd compression
42+
pattern: "*.sequences.parquet"
43+
- versions:
44+
- versions.yml:
45+
type: file
46+
description: File containing software versions
47+
pattern: "versions.yml"
48+
49+
authors:
50+
- "@heuermh"
51+
maintainers:
52+
- "@heuermh"

workflows/proteinannotator.nf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS
44
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55
*/
6+
include { DSHBIO_FASTATOPARQUET } from '../modules/local/dshbio/fastatoparquet/main'
67
include { MULTIQC } from '../modules/nf-core/multiqc/main'
78
include { SEQKIT_STATS } from '../modules/nf-core/seqkit/stats/main'
89
include { paramsSummaryMap } from 'plugin/nf-schema'
@@ -34,6 +35,10 @@ workflow PROTEINANNOTATOR {
3435
SEQKIT_STATS(ch_samplesheet)
3536
ch_versions = ch_versions.mix(SEQKIT_STATS.out.versions)
3637

38+
// todo: move this to stats on input fasta subworkflow
39+
DSHBIO_FASTATOPARQUET(ch_samplesheet)
40+
ch_versions = ch_versions.mix(DSHBIO_FASTATOPARQUET.out.versions)
41+
3742
//
3843
// Collate and save software versions
3944
//

0 commit comments

Comments
 (0)