pannotator/main.nf at main · sysbio-vo/pannotator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env nextflow
// Copyright (C) 2024 Genome Research Ltd.

/*
========================================================================================
    HELP
========================================================================================
*/

// def logo = NextflowTool.logo(workflow, params.monochrome_logs)

// log.info logo

// NextflowTool.commandLineParams(workflow.commandLine, log, params.monochrome_logs)


/**
 * Print the pipeline help message.
 *
 * Delegates to NextflowTool.help_message, passing the path to `schema.json`
 * in the pipeline project directory, an empty list, the
 * `params.monochrome_logs` setting and the Nextflow logger.
 * NOTE(review): the meaning of the empty-list argument is defined by
 * NextflowTool, which is not visible in this file — confirm before changing it.
 */
def printHelp() {
    // `projectDir` is the documented name of the workflow metadata property
    NextflowTool.help_message(
        "${workflow.projectDir}/schema.json",
        [],
        params.monochrome_logs,
        log
    )
}

// Derive a sample ID from a file name by removing the trailing run of
// ".<non-dot characters>" groups, e.g. "sample.cds.pkl" -> "sample".
// A name without such a suffix is returned unchanged.
def sampleIdFromName = { name ->
    def trailingExtensions = ~/(\.[^\.]+)+$/
    return name.replaceFirst(trailingExtensions, '')
}

/*
========================================================================================
    IMPORT MODULES/SUBWORKFLOWS
========================================================================================
*/

include { FIND_CDSS } from './subworkflows/find_cdss.nf'
include { ANNOTATE_PROTEINS } from './subworkflows/annotate_proteins.nf'
include { CLUSTER_PROTEOME } from './subworkflows/proteome_clustering.nf'
include { MERGE_ANNOTATIONS } from './modules/merge_annotations.nf'
include { DETECT_PSEUDOGENES } from './subworkflows/detect_pseudogenes.nf'
include { FIND_RNAS } from './modules/find_rnas.nf'
include { DOWNLOAD_BAKTA_DB } from './modules/helpers.nf'
include { SORF_EXTRA } from './modules/find_sorf_extra.nf'


/*
========================================================================================
    RUN MAIN WORKFLOW
========================================================================================
*/

/*
 * Entry workflow.
 *
 * Steps, in the order they are wired below:
 *   1. resolve the Bakta database (reuse params.bakta_db if it exists, else download it)
 *   2. run FIND_CDSS on every input file
 *   3. run CLUSTER_PROTEOME and ANNOTATE_PROTEINS on the representative sequences
 *   4. run MERGE_ANNOTATIONS on the CDS pickles and the bulk annotations
 *   5. run DETECT_PSEUDOGENES (only when params.bakta_db_type == 'full')
 *   6. run FIND_RNAS on every input file
 *   7. run SORF_EXTRA on per-sample (assembly, CDS pickle, RNA pickle) tuples
 */
workflow {
    if (params.help) {
        printHelp()
        exit 0
    }

    // TODO: even with the `cache` directive set to false the database is
    // stored twice - in the workdir and in the publishDir.
    // Use the bakta_db config parameter as input for subsequent processes
    // instead of the output of the DOWNLOAD_BAKTA_DB process,
    // and set the publishDir mode to `move` in that process.
    if ( file(params.bakta_db).exists() ) {
        bakta_db = Channel.of(file(params.bakta_db))
    } else {
        println "Downloading bakta db to ${params.bakta_db}"
        bakta_db = DOWNLOAD_BAKTA_DB(params.bakta_db_type)
    }

    // Input files: every file in params.indir whose name ends with params.infile_extension
    infiles = Channel.fromPath("${params.indir}/*${params.infile_extension}")

    // Pair every input file with the database
    infiles
        .combine(bakta_db)
        .set { infiles_and_bakta_db }

    // (sample ID, input file) tuples, used as join keys for SORF_EXTRA below
    ch_asm = infiles.map { asm -> tuple(sampleIdFromName(asm.name), asm) }

    //-----------------------------
    // CDS prediction
    //-----------------------------
    cds_outputs = FIND_CDSS(infiles_and_bakta_db)

    // One list holding every '.pkl' file emitted by FIND_CDSS
    cds_pkl_list_ch = cds_outputs
        .flatten()
        .filter { it.name.endsWith('.pkl') }
        .collect()

    //-----------------------------
    // Cluster + annotate
    //-----------------------------
    CLUSTER_PROTEOME(cds_outputs)
    CLUSTER_PROTEOME
        .out
        .map { all_seqs, clustering_tsv, rep_seq -> rep_seq } // keep only the third element
        .set { rep_proteins_ch }

    rep_proteins_ch
        .combine(bakta_db)
        .set { rep_proteins_and_bakta_db }

    ANNOTATE_PROTEINS(rep_proteins_and_bakta_db)

    //-----------------------------
    // Merge annotations
    //-----------------------------

    // NOTE: cache is not utilised if channel values are collected in a different order
    // TODO: sort collected values in cds_pkl_list_ch?
    MERGE_ANNOTATIONS(
        cds_pkl_list_ch,
        ANNOTATE_PROTEINS.out.bulk_annotations
    )


    if ( params.bakta_db_type == 'full' ) {
        // predict pseudogenes using annotated pickle objects

        // TODO: Nextflow caching doesn't work well with this approach:
        // if a single new sample is added, this whole subworkflow reruns
        MERGE_ANNOTATIONS.out.annotated_pickles
            .flatten()
            .map { it -> "${it}" } // TODO: is this workaround really necessary to collect the file paths (rather than the file contents) into the manifest?
            .collectFile( name: 'annotated_cds_manifest.txt', newLine: true )
            .set { manifest_file }

        manifest_file
            .combine(bakta_db)
            .set { manifest_file_and_bakta_db }

        DETECT_PSEUDOGENES(manifest_file_and_bakta_db)

        ch_cds_annot_pkl = DETECT_PSEUDOGENES.out
            .flatten()
    } else {
        // without the 'full' database the merged pickles are used as they are
        ch_cds_annot_pkl = MERGE_ANNOTATIONS.out.annotated_pickles
            .flatten()
    }


    //-----------------------------
    // RNA prediction
    //-----------------------------
    rna_outputs = FIND_RNAS(infiles_and_bakta_db)

    ch_rna_pkl = rna_outputs
        .flatten()
        .filter { it.name.endsWith('.pkl') }

    //-----------------------------
    // SORF extra search
    //-----------------------------
    // Key both pickle channels by sample ID so they can be joined with ch_asm.
    // NOTE(review): `join` silently drops keys missing from either side, so a
    // sample whose file names do not reduce to the same ID is skipped — confirm
    // the naming of the upstream outputs.
    ch_cds_keyed = ch_cds_annot_pkl.map { p -> tuple(sampleIdFromName(p.name), p) }
    ch_rna_keyed = ch_rna_pkl.map { p -> tuple(sampleIdFromName(p.name), p) }

    ch_sorf_in = ch_cds_keyed
        .join(ch_rna_keyed)
        .join(ch_asm)
        .map { sid, cds_pkl, rna_pkl, asm -> tuple(sid, asm, cds_pkl, rna_pkl) }
        .combine(bakta_db)

    SORF_EXTRA(ch_sorf_in)
}