Skip to content

Commit df5a7fe

Browse files
committed
Add local module duckdb/aminoacidhistogram
1 parent 26ee27a commit df5a7fe

6 files changed

Lines changed: 84 additions & 7 deletions

File tree

conf/base.config

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,4 @@ process {
5959
errorStrategy = 'retry'
6060
maxRetries = 2
6161
}
62-
withName: SEQKIT_STATS {
63-
ext.args = ' ' // turn off --all default argument
64-
}
65-
withName: DSHBIO_FASTATOPARQUET {
66-
ext.args = '--alphabet protein'
67-
}
6862
}

conf/modules.config

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
----------------------------------------------------------------------------------------
1111
*/
1212

13+
nextflow.enable.moduleBinaries = true
14+
1315
process {
1416

1517
publishDir = [
@@ -26,5 +28,10 @@ process {
2628
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
2729
]
2830
}
29-
31+
withName: SEQKIT_STATS {
32+
ext.args = ' ' // turn off --all default argument
33+
}
34+
withName: DSHBIO_FASTATOPARQUET {
35+
ext.args = '--alphabet protein'
36+
}
3037
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# NOTE(review): presumably the environment.yml loaded by the DUCKDB_AMINOACIDHISTOGRAM
# process through its `conda "${moduleDir}/environment.yml"` directive — confirm the path.
3+
channels:
4+
- conda-forge
5+
6+
# Only dependency is the DuckDB command-line client, pinned to 1.0.0 from conda-forge.
dependencies:
7+
- conda-forge::duckdb-cli=1.0.0
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Builds a per-sample amino acid histogram from Parquet sequence data using the DuckDB CLI.
//
// Input : tuple of sample metadata (`meta`, tagged by meta.id) and a `parquet` path.
// Output: `histogram` — tuple of meta and the "*.histogram.tsv" file written by the SQL below;
//         `versions`  — versions.yml recording the DuckDB version string.
process DUCKDB_AMINOACIDHISTOGRAM {
2+
tag "${meta.id}"
3+
label 'process_medium'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container 'community.wave.seqera.io/library/duckdb-cli:1.0.0--a85d12a2a9de17c9'
7+
8+
input:
9+
tuple val(meta), path(parquet)
10+
11+
output:
12+
tuple val(meta), path("*.histogram.tsv"), emit: histogram
13+
path "versions.yml" , emit: versions
14+
15+
when:
16+
task.ext.when == null || task.ext.when
17+
18+
script:
19+
// Output file prefix: task.ext.prefix when set, otherwise the sample id.
def prefix = task.ext.prefix ?: "${meta.id}"
20+
// SQL pipeline, executed as one DuckDB command string:
//   p: reads every *.parquet file under the `parquet` path (it is used as a directory);
//   s: splits the `sequence` column on the empty separator and unnests the pieces as `aa`;
//   h: aggregates `aa` with histogram() and unnests the map entries as key/value structs;
//   e: loads amino_acid_properties.tsv from the task working directory;
// then joins h to e on one_letter_symbol (plain JOIN, so residues without a matching
// properties row are dropped) and copies the result to '<prefix>.histogram.tsv'
// with a header row and the delimiter given in the COPY options.
def sql = "INSTALL parquet; LOAD parquet; COPY (WITH p AS (SELECT * FROM read_parquet('${parquet}/*.parquet')), s AS (SELECT unnest(string_to_array(sequence, '')) AS aa FROM p), h AS (SELECT unnest(map_entries(histogram(aa))) AS kv FROM s), e AS (SELECT * from read_csv_auto('amino_acid_properties.tsv')) SELECT h.kv['key'] AS amino_acid, h.kv['value'] AS count, e.* FROM h JOIN e ON h.kv['key'] = e.one_letter_symbol) TO '${prefix}.histogram.tsv' (HEADER, DELIMITER '\t')"
21+
// The shell script first runs create_amino_acid_properties.sh, which must leave
// amino_acid_properties.tsv in the working directory before DuckDB reads it.
// NOTE(review): presumably resolved on PATH via Nextflow module binaries
// (nextflow.enable.moduleBinaries) — confirm the script lives under the module's resources.
"""
22+
create_amino_acid_properties.sh
23+
duckdb :memory: "$sql"
24+
25+
cat <<-END_VERSIONS > versions.yml
26+
"${task.process}":
27+
duckdb: \$( duckdb --version | cut -f 1 -d " " )
28+
END_VERSIONS
29+
"""
30+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/usr/bin/env bash
2+
3+
#
# Writes the amino acid properties lookup table to amino_acid_properties.tsv
# in the current working directory (one header row, one row per residue code).
#
# Negative hydropathy_index values are written with the ASCII hyphen-minus
# ("-4.5"), not the typographic minus sign U+2212 used on the source page,
# so CSV readers can infer the column as numeric instead of text.
#
# NOTE(review): several field values contain spaces ("Fixed cation",
# "Brønsted acid and base"), so columns are presumably tab-separated —
# confirm the separators are preserved when editing this file.
#
4+
# See https://en.wikipedia.org/wiki/Amino_acid
5+
6+
cat <<END_PROPERTIES > amino_acid_properties.tsv
7+
amino_acid three_letter_symbol one_letter_symbol class chemical_polarity net_charge hydropathy_index molecular_mass abundance_in_proteins standard_genetic_coding hydrophobic aromatic aliphatic small hydrophilic positively_charged negatively_charged
8+
Alanine Ala A Aliphatic Nonpolar Neutral 1.8 89.094 8.76 GCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
9+
Arginine Arg R Fixed cation Basic polar Positive -4.5 174.203 5.78 MGR, CGY FALSE FALSE FALSE FALSE TRUE TRUE FALSE
10+
Asparagine Asn N Amide Polar Neutral -3.5 132.119 3.93 AAY FALSE FALSE FALSE FALSE TRUE FALSE FALSE
11+
Aspartate Asp D Anion Brønsted base Negative -3.5 133.104 5.49 GAY FALSE FALSE FALSE FALSE TRUE FALSE TRUE
12+
Cysteine Cys C Thiol Brønsted acid Neutral 2.5 121.154 1.38 UGY FALSE FALSE FALSE FALSE FALSE FALSE FALSE
13+
Glutamine Gln Q Amide Polar Neutral -3.5 146.146 3.9 CAR FALSE FALSE FALSE FALSE TRUE FALSE FALSE
14+
Glutamate Glu E Anion Brønsted base Negative -3.5 147.131 6.32 GAR FALSE FALSE FALSE FALSE TRUE FALSE TRUE
15+
Glycine Gly G Aliphatic Nonpolar Neutral -0.4 75.067 7.03 GGN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
16+
Histidine His H Cationic Brønsted acid and base Positive, 10% Neutral, 90% -3.2 155.156 2.26 CAY FALSE TRUE FALSE FALSE TRUE TRUE FALSE
17+
Isoleucine Ile I Aliphatic Nonpolar Neutral 4.5 131.175 5.49 AUH TRUE FALSE TRUE FALSE FALSE FALSE FALSE
18+
Leucine Leu L Aliphatic Nonpolar Neutral 3.8 131.175 9.68 YUR, CUY TRUE FALSE TRUE FALSE FALSE FALSE FALSE
19+
Lysine Lys K Cation Brønsted acid Positive -3.9 146.189 5.19 AAR FALSE FALSE FALSE FALSE TRUE TRUE FALSE
20+
Methionine Met M Thioether Nonpolar Neutral 1.9 149.208 2.32 AUG TRUE FALSE TRUE FALSE FALSE FALSE FALSE
21+
Phenylalanine Phe F Aromatic Nonpolar Neutral 2.8 165.192 3.87 UUY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
22+
Proline Pro P Cyclic Nonpolar Neutral -1.6 115.132 5.02 CCN FALSE FALSE FALSE TRUE FALSE FALSE FALSE
23+
Serine Ser S Hydroxylic Polar Neutral -0.8 105.093 7.14 UCN, AGY FALSE FALSE FALSE TRUE TRUE FALSE FALSE
24+
Threonine Thr T Hydroxylic Polar Neutral -0.7 119.119 5.53 ACN FALSE FALSE FALSE FALSE TRUE FALSE FALSE
25+
Tryptophan Trp W Aromatic Nonpolar Neutral -0.9 204.228 1.25 UGG TRUE TRUE FALSE FALSE FALSE FALSE FALSE
26+
Tyrosine Tyr Y Aromatic Brønsted acid Neutral -1.3 181.191 2.91 UAY TRUE TRUE FALSE FALSE FALSE FALSE FALSE
27+
Valine Val V Aliphatic Nonpolar Neutral 4.2 117.148 6.73 GUN TRUE FALSE TRUE FALSE FALSE FALSE FALSE
28+
Selenocysteine Sec U 168.064
29+
Pyrrolysine Pyl O 255.313
30+
Any/unknown Xaa X
31+
Asparagine or aspartate Asx B
32+
Glutamine or glutamate Glx Z
33+
Leucine or isoleucine Xle J
34+
END_PROPERTIES

workflows/proteinannotator.nf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
55
*/
66
include { DSHBIO_FASTATOPARQUET } from '../modules/local/dshbio/fastatoparquet/main'
7+
include { DUCKDB_AMINOACIDHISTOGRAM } from '../modules/local/duckdb/aminoacidhistogram/main'
78
include { MULTIQC } from '../modules/nf-core/multiqc/main'
89
include { SEQKIT_STATS } from '../modules/nf-core/seqkit/stats/main'
910
include { paramsSummaryMap } from 'plugin/nf-schema'
@@ -34,6 +35,10 @@ workflow PROTEINANNOTATOR {
3435
DSHBIO_FASTATOPARQUET(ch_samplesheet)
3536
ch_versions = ch_versions.mix(DSHBIO_FASTATOPARQUET.out.versions)
3637

38+
// todo: move this to stats on input fasta subworkflow
39+
DUCKDB_AMINOACIDHISTOGRAM(DSHBIO_FASTATOPARQUET.out.parquet)
40+
ch_versions = ch_versions.mix(DUCKDB_AMINOACIDHISTOGRAM.out.versions)
41+
3742
//
3843
// Collate and save software versions
3944
//

0 commit comments

Comments
 (0)