Skip to content

Commit 7c02a3e

Browse files
committed
Add REGENIE chunked step 1 modules
1 parent 6859409 commit 7c02a3e

18 files changed

Lines changed: 1493 additions & 0 deletions
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- "bioconda::regenie=4.1.2"
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
// Runs a single REGENIE step 1 level-0 job (`--run-l0 <master>,<job_number>`)
// for one entry of a split master file.
process REGENIE_RUNL0 {
    tag "${meta.id}_${job_number}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data'
        : 'community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf'}"

    input:
    // Genotype fileset; only the basename of the primary file is handed to REGENIE.
    tuple val(meta), path(plink_genotype_file), path(plink_variant_file), path(plink_sample_file)
    // Split master file, the snplist staged next to it, and the job to run.
    tuple val(meta2), path(master), path(snplist), val(job_number)
    tuple val(meta3), path(pheno)
    // Covariates are optional: a falsy value drops `--covarFile` from the command.
    tuple val(meta4), path(covar)
    // Block size; a falsy value selects the default of 1000.
    val bsize

    output:
    tuple val(meta), path("*_l0_Y*"), emit: l0_predictions
    tuple val(meta), path("*.log"), emit: log
    tuple val("${task.process}"), val('regenie'), eval('regenie --version 2>&1 | sed -n "1{s/^v//;s/\\.gz$//;p}"'), topic: versions, emit: versions_regenie

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // Basename shared by the staged PLINK files; used as the --bed/--pgen prefix.
    def genotype_basename = plink_genotype_file.baseName
    def prefix = task.ext.prefix ?: genotype_basename
    // Each job gets its own --out prefix so outputs are distinguishable per job.
    def out_prefix = "${prefix}_job${job_number}"
    def is_pgen = plink_genotype_file.name.endsWith('.pgen')
    def genotype_opt = is_pgen ? '--pgen' : '--bed'
    def covar_opt = covar ? "--covarFile ${covar}" : ''
    def block_size = bsize ?: 1000
    """
    regenie \\
        --step 1 \\
        ${genotype_opt} ${genotype_basename} \\
        --phenoFile ${pheno} \\
        ${covar_opt} \\
        --bsize ${block_size} \\
        --gz \\
        --threads ${task.cpus} \\
        ${args} \\
        --out ${out_prefix} \\
        --run-l0 ${master},${job_number}
    """

    stub:
    def genotype_basename = plink_genotype_file.baseName
    def prefix = task.ext.prefix ?: genotype_basename
    def out_prefix = "${prefix}_job${job_number}"
    // Create one placeholder per declared file output (prediction + log).
    """
    touch ${out_prefix}_l0_Y1
    touch ${out_prefix}.log
    """
}
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
2+
name: "regenie_runl0"
3+
description: Run one REGENIE step 1 level-0 job from a split master file
4+
keywords:
5+
- regenie
6+
- gwas
7+
- association
8+
- genomics
9+
- parallel
10+
tools:
11+
- "regenie":
12+
description: "Regenie is a C++ program for whole genome regression modelling of large genome-wide association studies (GWAS)."
13+
homepage: "https://rgcgithub.github.io/regenie/"
14+
documentation: "https://rgcgithub.github.io/regenie/options/"
15+
tool_dev_url: "https://github.com/rgcgithub/regenie"
16+
doi: "10.1038/s41588-021-00870-7"
17+
licence: ["MIT"]
18+
identifier: "biotools:regenie"
19+
20+
input:
21+
- - meta:
22+
type: map
23+
description: |
24+
Groovy Map containing genotype information
25+
This map should contain only the identifier of the genotype analysis
26+
REGENIE consumes the staged basename of `plink_genotype_file` as the `--bed` or `--pgen` prefix, so the `.bed/.bim/.fam` or `.pgen/.pvar/.psam` files must share one basename
27+
e.g. `[ id:'cohort' ]`
28+
- plink_genotype_file:
29+
type: file
30+
description: PLINK primary genotype file in BED or PGEN format
31+
pattern: "*.{bed,pgen}"
32+
ontologies:
33+
- edam: "http://edamontology.org/format_3003" # BED
34+
- plink_variant_file:
35+
type: file
36+
description: PLINK variant metadata file in BIM or PVAR format
37+
pattern: "*.{bim,pvar,zst}"
38+
ontologies: []
39+
- plink_sample_file:
40+
type: file
41+
description: PLINK sample metadata file in FAM or PSAM format
42+
pattern: "*.{fam,psam}"
43+
ontologies: []
44+
- - meta2:
45+
type: map
46+
description: |
47+
Groovy Map containing split level-0 job information
48+
e.g. `[ id:'plink_simulated' ]`
49+
- master:
50+
type: file
51+
description: REGENIE split level-0 master file from `regenie/splitl0`
52+
pattern: "*.master"
53+
ontologies:
54+
- edam: "http://edamontology.org/format_2330" # Text
55+
- snplist:
56+
type: file
57+
description: Per-job variant list staged because the master file references it; the path is not passed explicitly to REGENIE
58+
pattern: "*_job*.snplist"
59+
ontologies:
60+
- edam: "http://edamontology.org/format_2330" # Text
61+
- job_number:
62+
type: integer
63+
description: Level-0 job number passed as the second value to `--run-l0`
64+
- - meta3:
65+
type: map
66+
description: |
67+
Groovy Map containing genotype/sample information associated with the phenotype file input
68+
Use the same phenotype file and phenotype-selection arguments for all `regenie/splitl0`, `regenie/runl0`, and `regenie/runl1` jobs in the same chunked step 1 analysis
69+
e.g. `[ id:'plink_simulated' ]`
70+
- pheno:
71+
type: file
72+
description: Phenotype file passed to `--phenoFile`
73+
pattern: "*.{phe,pheno,txt,tsv}"
74+
ontologies:
75+
- edam: "http://edamontology.org/format_3475" # TSV
76+
- - meta4:
77+
type: map
78+
description: |
79+
Groovy Map containing genotype/sample information associated with the covariate input
80+
Use compatible covariate inputs for all stages in the same chunked step 1 analysis
81+
e.g. `[ id:'plink_simulated' ]`
82+
- covar:
83+
type: file
84+
optional: true
85+
description: Optional covariate file passed to `--covarFile`; provide `[]` when absent
86+
pattern: "*.{covar,cov,txt,tsv}"
87+
ontologies:
88+
- edam: "http://edamontology.org/format_3475" # TSV
89+
- bsize:
90+
type: integer
91+
description: Optional block size passed to `--bsize`; pass `[]` to use the module default of `1000`
92+
93+
output:
94+
l0_predictions:
95+
- - meta:
96+
type: map
97+
description: |
98+
Groovy Map containing genotype information
99+
e.g. `[ id:'plink_simulated' ]`
100+
- "*_l0_Y*":
101+
type: file
102+
description: REGENIE level-0 prediction files for this job
103+
pattern: "*_l0_Y*"
104+
ontologies: []
105+
log:
106+
- - meta:
107+
type: map
108+
description: |
109+
Groovy Map containing genotype information
110+
e.g. `[ id:'plink_simulated' ]`
111+
- "*.log":
112+
type: file
113+
description: REGENIE run level-0 log file
114+
pattern: "*.log"
115+
ontologies:
116+
- edam: "http://edamontology.org/format_2330" # Text
117+
versions_regenie:
118+
- - "${task.process}":
119+
type: string
120+
description: The process the versions were collected from
121+
- "regenie":
122+
type: string
123+
description: The tool name
124+
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
125+
type: eval
126+
description: The command used to generate the version of the tool
127+
128+
topics:
129+
versions:
130+
- - ${task.process}:
131+
type: string
132+
description: The process the versions were collected from
133+
- regenie:
134+
type: string
135+
description: The tool name
136+
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
137+
type: eval
138+
description: The command used to generate the version of the tool
139+
140+
notes: |
141+
`task.ext.args` is passed directly to REGENIE and can be used for stage-consistent options such as `--phenoColList`, `--bt`, `--loocv`, or `--keep-l0`.
142+
The same phenotype file, phenotype-selection arguments, trait mode arguments such as `--bt`, and compatible genotype/covariate inputs must be used across `regenie/splitl0`, every matching `regenie/runl0` job, and `regenie/runl1`.
143+
authors:
144+
- "@lyh970817"
145+
maintainers:
146+
- "@lyh970817"
147+
containers:
148+
conda:
149+
linux_amd64:
150+
lock_file: "modules/nf-core/regenie/runl0/.conda-lock/linux_amd64-bd-5d361f9fcb2f85cf_1.txt"
151+
docker:
152+
linux_amd64:
153+
build_id: "bd-5d361f9fcb2f85cf_1"
154+
name: "community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf"
155+
scanId: "sc-cc9eb5ed5eb381dd_2"
156+
singularity:
157+
linux_amd64:
158+
build_id: "bd-7c121fb4ecd57890_1"
159+
name: "oras://community.wave.seqera.io/library/regenie:4.1.2--7c121fb4ecd57890"
160+
https: "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data"
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
// nf-test spec for REGENIE_RUNL0. REGENIE_SPLITL0 is run in `setup` so that its
// `master` and `snplists` outputs can be wired into input[1] of each test.
nextflow_process {

    name "Test Process REGENIE_RUNL0"
    config "./nextflow.config"
    script "../main.nf"
    process "REGENIE_RUNL0"

    tag "modules"
    tag "modules_nfcore"
    tag "regenie"
    tag "regenie/splitl0"
    tag "regenie/runl0"

    setup {
        run("REGENIE_SPLITL0") {
            script "../../splitl0/main.nf"
            process {
                """
                input[0] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
                ]

                input[1] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
                ]

                input[2] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
                ]

                input[3] = 100
                input[4] = 2
                """
            }
        }
    }

    test("homo_sapiens popgen - quantitative plink1 with covariates") {

        when {
            params {
                // NOTE(review): presumably mapped to ext.args in ./nextflow.config — confirm.
                module_args = '--phenoColList QuantitativeTrait'
            }
            process {
                """
                input[0] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
                ]

                input[1] = REGENIE_SPLITL0.out.master
                    .combine(REGENIE_SPLITL0.out.snplists)
                    .map { master_meta, master, snplist_meta, snplists ->
                        [ master_meta, master, snplists.find { snplist -> snplist.getFileName().toString().contains('_job1.snplist') }, 1 ]
                    }

                input[2] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
                ]

                input[3] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
                ]

                input[4] = 100
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert process.out.l0_predictions.size() == 1 },
                { assert process.out.log.size() == 1 },
                { assert process.out.l0_predictions.get(0).get(0).id == 'plink_simulated' },
                { assert process.out.log.get(0).get(0).id == 'plink_simulated' },
                {
                    // The path output may be a single value or a list; normalise to a list.
                    def emitted = process.out.l0_predictions.get(0).get(1)
                    def files = emitted instanceof List ? emitted : [emitted]
                    assert files.size() >= 1
                    assert files.every { f -> path(f).getFileName().toString().contains('_l0_Y') }
                },
                { assert path(process.out.log.get(0).get(1)).exists() },
                {
                    // Snapshot the meta map plus sorted file names only, not file contents.
                    def stablePredictions = process.out.l0_predictions.collect { entry ->
                        def files = entry[1] instanceof List ? entry[1] : [entry[1]]
                        def names = files.collect { f -> path(f).getFileName().toString() }.sort()
                        [entry[0], names]
                    }
                    assert snapshot(
                        stablePredictions,
                        process.out.findAll { key, val -> key.startsWith('versions') }
                    ).match()
                }
            )
        }

    }

    test("homo_sapiens popgen - plink1 - stub") {

        options "-stub"

        when {
            params {
                // NOTE(review): presumably mapped to ext.args in ./nextflow.config — confirm.
                module_args = '--phenoColList QuantitativeTrait'
            }
            process {
                """
                input[0] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
                ]

                input[1] = REGENIE_SPLITL0.out.master
                    .combine(REGENIE_SPLITL0.out.snplists)
                    .map { master_meta, master, snplist_meta, snplists ->
                        [ master_meta, master, snplists.find { snplist -> snplist.getFileName().toString().contains('_job1.snplist') }, 1 ]
                    }

                input[2] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
                ]

                input[3] = [
                    [ id:'plink_simulated' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
                ]

                input[4] = 100
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                {
                    // Snapshot meta maps with file names only for both file outputs.
                    def stablePredictions = process.out.l0_predictions.collect { entry ->
                        def files = entry[1] instanceof List ? entry[1] : [entry[1]]
                        def names = files.collect { f -> path(f).getFileName().toString() }.sort()
                        [entry[0], names]
                    }
                    def stableLogs = process.out.log.collect { entry ->
                        [entry[0], path(entry[1]).getFileName().toString()]
                    }
                    assert snapshot(
                        stablePredictions,
                        stableLogs,
                        process.out.findAll { key, val -> key.startsWith('versions') }
                    ).match()
                }
            )
        }

    }

}

0 commit comments

Comments
 (0)