Skip to content

Commit 08caec7

Browse files
committed
Add REGENIE splitl0 module
1 parent 6859409 commit 08caec7

6 files changed

Lines changed: 458 additions & 0 deletions

File tree

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
- "bioconda::regenie=4.1.2"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
process REGENIE_SPLITL0 { // Runs `regenie --step 1 --split-l0`, emitting a master file, per-job SNP lists and a log
2+
tag "${meta.id}"
3+
label 'process_medium'
4+
5+
conda "${moduleDir}/environment.yml"
6+
container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
7+
? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data'
8+
: 'community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf'}" // https image for Singularity unless singularity_pull_docker_container is set; otherwise the Wave Docker image
9+
10+
input:
11+
tuple val(meta), path(plink_genotype_file), path(plink_variant_file), path(plink_sample_file) // only plink_genotype_file is referenced in the script; the other two are just staged
12+
tuple val(meta2), path(pheno) // passed to --phenoFile
13+
tuple val(meta3), path(covar) // passed to --covarFile; a falsy value (e.g. []) omits the flag
14+
val bsize // passed to --bsize; a falsy value falls back to 1000
15+
val n_jobs // job count, second field of the --split-l0 argument
16+
17+
output:
18+
tuple val(meta), path("*.master"), emit: master
19+
tuple val(meta), path("*_job*.snplist"), emit: snplists
20+
tuple val(meta), path("*.log"), emit: log
21+
tuple val("${task.process}"), val('regenie'), eval('regenie --version 2>&1 | sed -n "1{s/^v//;s/\\.gz$//;p}"'), topic: versions, emit: versions_regenie // first line of the version output, minus a leading "v" and a trailing ".gz"
22+
23+
when:
24+
task.ext.when == null || task.ext.when
25+
26+
script:
27+
def args = task.ext.args ?: '' // extra REGENIE options, inserted verbatim before --out
28+
def input_prefix = plink_genotype_file.baseName // genotype file name minus its last extension; given to --bed/--pgen as the fileset prefix
29+
def prefix = task.ext.prefix ?: input_prefix // output prefix defaults to the input prefix, not meta.id
30+
def genotype_flag = plink_genotype_file.name.endsWith('.pgen') ? '--pgen' : '--bed' // any name not ending in .pgen is treated as BED
31+
def covar_arg = covar ? "--covarFile ${covar}" : '' // empty string leaves a blank continuation line in the command
32+
def bsize_arg = bsize ?: 1000
33+
"""
34+
regenie \\
35+
--step 1 \\
36+
${genotype_flag} ${input_prefix} \\
37+
--phenoFile ${pheno} \\
38+
${covar_arg} \\
39+
--bsize ${bsize_arg} \\
40+
--gz \\
41+
--threads ${task.cpus} \\
42+
${args} \\
43+
--out ${prefix} \\
44+
--split-l0 ${prefix},${n_jobs}
45+
"""
46+
47+
stub:
48+
def input_prefix = plink_genotype_file.baseName // same prefix derivation as the script block
49+
def prefix = task.ext.prefix ?: input_prefix
50+
def job_count = n_jobs as Integer
51+
def snplist_lines = (1..job_count).collect { job -> "touch ${prefix}_job${job}.snplist" }.join('\n') // one touch command per job, separated by real newlines
52+
def master_lines = (1..job_count).collect { job -> "${prefix}_job${job} ${prefix}_job${job}.snplist" }.join('\\n') // rows joined with a literal backslash-n, which printf below expands
53+
"""
54+
printf 'job snplist\\n${master_lines}\\n' > ${prefix}.master
55+
${snplist_lines}
56+
touch ${prefix}.log
57+
"""
58+
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
2+
name: "regenie_splitl0"
3+
description: Split REGENIE step 1 level-0 ridge-regression blocks into parallel jobs
4+
keywords:
5+
- regenie
6+
- gwas
7+
- association
8+
- genomics
9+
- parallel
10+
tools:
11+
- "regenie":
12+
description: "Regenie is a C++ program for whole genome regression modelling of large genome-wide association studies (GWAS)."
13+
homepage: "https://rgcgithub.github.io/regenie/"
14+
documentation: "https://rgcgithub.github.io/regenie/options/"
15+
tool_dev_url: "https://github.com/rgcgithub/regenie"
16+
doi: "10.1038/s41588-021-00870-7"
17+
licence: ["MIT"]
18+
identifier: "biotools:regenie"
19+
20+
input:
21+
- - meta:
22+
type: map
23+
description: |
24+
Groovy Map containing genotype information
25+
This map should contain only the identifier of the genotype analysis
26+
REGENIE consumes the staged basename of `plink_genotype_file` as the `--bed` or `--pgen` prefix, so the `.bed/.bim/.fam` or `.pgen/.pvar/.psam` files must share one basename
27+
e.g. `[ id:'cohort' ]`
28+
- plink_genotype_file:
29+
type: file
30+
description: PLINK primary genotype file in BED or PGEN format
31+
pattern: "*.{bed,pgen}"
32+
ontologies:
33+
- edam: "http://edamontology.org/format_3003" # BED
34+
- plink_variant_file:
35+
type: file
36+
description: PLINK variant metadata file in BIM or PVAR format
37+
pattern: "*.{bim,pvar,zst}"
38+
ontologies: []
39+
- plink_sample_file:
40+
type: file
41+
description: PLINK sample metadata file in FAM or PSAM format
42+
pattern: "*.{fam,psam}"
43+
ontologies: []
44+
- - meta2:
45+
type: map
46+
description: |
47+
Groovy Map containing genotype/sample information associated with the phenotype file input
48+
Use the same phenotype file and phenotype-selection arguments for all `regenie/splitl0`, `regenie/runl0`, and `regenie/runl1` jobs in the same chunked step 1 analysis
49+
e.g. `[ id:'plink_simulated' ]`
50+
- pheno:
51+
type: file
52+
description: Phenotype file passed to `--phenoFile`
53+
pattern: "*.{phe,pheno,txt,tsv}"
54+
ontologies:
55+
- edam: "http://edamontology.org/format_3475" # TSV
56+
- - meta3:
57+
type: map
58+
description: |
59+
Groovy Map containing genotype/sample information associated with the covariate input
60+
Use compatible covariate inputs for all stages in the same chunked step 1 analysis
61+
e.g. `[ id:'plink_simulated' ]`
62+
- covar:
63+
type: file
64+
optional: true
65+
description: Optional covariate file passed to `--covarFile`; provide `[]` when absent
66+
pattern: "*.{covar,cov,txt,tsv}"
67+
ontologies:
68+
- edam: "http://edamontology.org/format_3475" # TSV
69+
- bsize:
70+
type: integer
71+
description: Optional block size passed to `--bsize`; pass `[]` to use the module default of `1000`
72+
- n_jobs:
73+
type: integer
74+
description: Number of level-0 jobs requested with `--split-l0`
75+
76+
output:
77+
master:
78+
- - meta:
79+
type: map
80+
description: |
81+
Groovy Map containing genotype/sample information
82+
e.g. `[ id:'plink_simulated' ]`
83+
- "*.master":
84+
type: file
85+
description: REGENIE split level-0 master file
86+
pattern: "*.master"
87+
ontologies:
88+
- edam: "http://edamontology.org/format_2330" # Text
89+
snplists:
90+
- - meta:
91+
type: map
92+
description: |
93+
Groovy Map containing genotype/sample information
94+
e.g. `[ id:'plink_simulated' ]`
95+
- "*_job*.snplist":
96+
type: file
97+
description: REGENIE per-job variant list files referenced by the master file
98+
pattern: "*_job*.snplist"
99+
ontologies:
100+
- edam: "http://edamontology.org/format_2330" # Text
101+
log:
102+
- - meta:
103+
type: map
104+
description: |
105+
Groovy Map containing genotype/sample information
106+
e.g. `[ id:'plink_simulated' ]`
107+
- "*.log":
108+
type: file
109+
description: REGENIE split level-0 log file
110+
pattern: "*.log"
111+
ontologies:
112+
- edam: "http://edamontology.org/format_2330" # Text
113+
versions_regenie:
114+
- - "${task.process}":
115+
type: string
116+
description: The process the versions were collected from
117+
- "regenie":
118+
type: string
119+
description: The tool name
120+
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
121+
type: eval
122+
description: The command used to generate the version of the tool
123+
124+
topics:
125+
versions:
126+
- - ${task.process}:
127+
type: string
128+
description: The process the versions were collected from
129+
- regenie:
130+
type: string
131+
description: The tool name
132+
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
133+
type: eval
134+
description: The command used to generate the version of the tool
135+
136+
notes: |
137+
`task.ext.args` is passed directly to REGENIE and can be used for stage-consistent options such as `--phenoColList`, `--bt`, `--loocv`, or `--keep-l0`.
138+
The same phenotype file, phenotype-selection arguments, trait mode arguments such as `--bt`, and compatible genotype/covariate inputs must be used across `regenie/splitl0`, every matching `regenie/runl0` job, and `regenie/runl1`.
139+
authors:
140+
- "@lyh970817"
141+
maintainers:
142+
- "@lyh970817"
143+
containers:
144+
conda:
145+
linux_amd64:
146+
lock_file: "modules/nf-core/regenie/splitl0/.conda-lock/linux_amd64-bd-5d361f9fcb2f85cf_1.txt"
147+
docker:
148+
linux_amd64:
149+
build_id: "bd-5d361f9fcb2f85cf_1"
150+
name: "community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf"
151+
scanId: "sc-cc9eb5ed5eb381dd_2"
152+
singularity:
153+
linux_amd64:
154+
build_id: "bd-7c121fb4ecd57890_1"
155+
name: "oras://community.wave.seqera.io/library/regenie:4.1.2--7c121fb4ecd57890"
156+
https: "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data"
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
nextflow_process {
2+
3+
name "Test Process REGENIE_SPLITL0"
4+
config "./nextflow.config"
5+
script "../main.nf"
6+
process "REGENIE_SPLITL0"
7+
8+
tag "modules"
9+
tag "modules_nfcore"
10+
tag "regenie"
11+
tag "regenie/splitl0"
12+
13+
test("homo_sapiens popgen - quantitative plink1 with covariates") { // real run: PLINK1 bed/bim/fam plus phenotype and covariate files, bsize 100, 2 jobs
14+
15+
when {
16+
params {
17+
module_args = '--phenoColList QuantitativeTrait' // presumably mapped to ext.args by ./nextflow.config (not visible here) - TODO confirm
18+
}
19+
process {
20+
"""
21+
input[0] = [
22+
[ id:'plink_simulated' ],
23+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
24+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
25+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
26+
]
27+
28+
input[1] = [
29+
[ id:'plink_simulated' ],
30+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
31+
]
32+
33+
input[2] = [
34+
[ id:'plink_simulated' ],
35+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
36+
]
37+
38+
input[3] = 100
39+
input[4] = 2
40+
"""
41+
}
42+
}
43+
44+
then {
45+
assertAll(
46+
{ assert process.success },
47+
{ assert process.out.master.size() == 1 }, // each channel should emit exactly one tuple
48+
{ assert process.out.snplists.size() == 1 },
49+
{ assert process.out.log.size() == 1 },
50+
{ assert process.out.master.get(0).get(0).id == 'plink_simulated' }, // meta from input[0] is carried to every output
51+
{ assert process.out.snplists.get(0).get(0).id == 'plink_simulated' },
52+
{ assert process.out.log.get(0).get(0).id == 'plink_simulated' },
53+
{
54+
def master = path(process.out.master.get(0).get(1))
55+
def lines = master.text.readLines().findAll { it } // drop empty lines
56+
assert master.exists()
57+
assert lines.size() == 3 // one header line (two integers, checked next) plus one line per requested job
58+
assert lines[0] ==~ /\d+\s+\d+/
59+
assert lines.drop(1).every { line ->
60+
line.contains('plink_simulated_job') && !line.contains('/') // job entries use the output prefix and contain no path separator
61+
}
62+
},
63+
{
64+
def snplists = process.out.snplists.get(0).get(1)
65+
assert snplists.size() == 2 // matches input[4] = 2
66+
assert snplists.collect { path(it).getFileName().toString() }.sort() == [
67+
'plink_simulated_job1.snplist',
68+
'plink_simulated_job2.snplist'
69+
]
70+
},
71+
{ assert path(process.out.log.get(0).get(1)).exists() }, // log is only checked for existence; it is not part of the snapshot below
72+
{
73+
def stableMaster = process.out.master.collect { master -> // snapshot file names only, not file contents
74+
[master[0], path(master[1]).getFileName().toString()]
75+
}
76+
def stableSnplists = process.out.snplists.collect { snplist ->
77+
[snplist[0], snplist[1].collect { path(it).getFileName().toString() }.sort()]
78+
}
79+
assert snapshot(
80+
stableMaster,
81+
stableSnplists,
82+
process.out.findAll { key, val -> key.startsWith('versions') } // every output channel whose name starts with "versions"
83+
).match()
84+
}
85+
)
86+
}
87+
88+
}
89+
90+
test("homo_sapiens popgen - plink1 - stub") { // stub run with the same five inputs as the real-run test
91+
92+
options "-stub"
93+
94+
when {
95+
params {
96+
module_args = '--phenoColList QuantitativeTrait' // presumably mapped to ext.args by ./nextflow.config (not visible here); the stub block itself never reads task.ext.args
97+
}
98+
process {
99+
"""
100+
input[0] = [
101+
[ id:'plink_simulated' ],
102+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
103+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
104+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
105+
]
106+
107+
input[1] = [
108+
[ id:'plink_simulated' ],
109+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
110+
]
111+
112+
input[2] = [
113+
[ id:'plink_simulated' ],
114+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
115+
]
116+
117+
input[3] = 100
118+
input[4] = 2
119+
"""
120+
}
121+
}
122+
123+
then {
124+
assertAll(
125+
{ assert process.success },
126+
{
127+
def stableMaster = process.out.master.collect { master -> // snapshot file names only, not file contents
128+
[master[0], path(master[1]).getFileName().toString()]
129+
}
130+
def stableSnplists = process.out.snplists.collect { snplist ->
131+
[snplist[0], snplist[1].collect { path(it).getFileName().toString() }.sort()]
132+
}
133+
def stableLogs = process.out.log.collect { log -> // unlike the real-run test, the log file name is part of this snapshot
134+
[log[0], path(log[1]).getFileName().toString()]
135+
}
136+
assert snapshot(
137+
stableMaster,
138+
stableSnplists,
139+
stableLogs,
140+
process.out.findAll { key, val -> key.startsWith('versions') } // every output channel whose name starts with "versions"
141+
).match()
142+
}
143+
)
144+
}
145+
146+
}
147+
148+
}

0 commit comments

Comments
 (0)