Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions modules/nf-core/regenie/splitl0/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the regenie/splitl0 module (referenced from main.nf via `conda "${moduleDir}/environment.yml"`).
# Channels listed for resolving the environment.
channels:
- conda-forge
- bioconda
# Single pinned dependency: regenie 4.1.2, the same version carried by the container tags in main.nf.
dependencies:
- "bioconda::regenie=4.1.2"
58 changes: 58 additions & 0 deletions modules/nf-core/regenie/splitl0/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// REGENIE_SPLITL0
//
// Runs `regenie --step 1` with `--split-l0 <prefix>,<n_jobs>` on a PLINK
// BED or PGEN fileset and emits the resulting master file, the per-job
// snplist files and the log.
//
// Inputs:
//   meta, plink_genotype_file, plink_variant_file, plink_sample_file
//       Genotype fileset. Only the basename of `plink_genotype_file` is handed
//       to regenie (as the `--bed` / `--pgen` prefix); the variant and sample
//       files are staged alongside it.
//   meta2, pheno   Phenotype file passed to `--phenoFile`.
//   meta3, covar   Optional covariate file passed to `--covarFile`; a falsy
//                  value (e.g. `[]`) omits the flag.
//   bsize          Block size for `--bsize`; a falsy value falls back to 1000.
//   n_jobs         Number of jobs requested from `--split-l0`; must be a
//                  positive integer.
//
// Outputs: `master`, `snplists`, `log`, plus the regenie version tuple on the
// `versions` topic (also emitted as `versions_regenie`).
process REGENIE_SPLITL0 {
    tag "${meta.id}"
    label 'process_medium'

    conda "${moduleDir}/environment.yml"
    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data'
        : 'community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf'}"

    input:
    tuple val(meta), path(plink_genotype_file), path(plink_variant_file), path(plink_sample_file)
    tuple val(meta2), path(pheno)
    tuple val(meta3), path(covar)
    val bsize
    val n_jobs

    output:
    tuple val(meta), path("*.master"), emit: master
    tuple val(meta), path("*_job*.snplist"), emit: snplists
    tuple val(meta), path("*.log"), emit: log
    tuple val("${task.process}"), val('regenie'), eval('regenie --version 2>&1 | sed -n "1{s/^v//;s/\\.gz$//;p}"'), topic: versions, emit: versions_regenie

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // regenie takes the fileset prefix, not the individual file names
    def input_prefix = plink_genotype_file.baseName
    def prefix = task.ext.prefix ?: input_prefix
    def genotype_flag = plink_genotype_file.name.endsWith('.pgen') ? '--pgen' : '--bed'
    def covar_arg = covar ? "--covarFile ${covar}" : ''
    def bsize_arg = bsize ?: 1000
    // Fail early on a missing or non-positive job count instead of rendering
    // an incomplete `--split-l0 <prefix>,` argument.
    def job_count = (n_jobs ?: 0) as Integer
    if (job_count < 1) {
        error("REGENIE_SPLITL0: n_jobs must be a positive integer, got '${n_jobs}'")
    }
    """
    regenie \\
        --step 1 \\
        ${genotype_flag} ${input_prefix} \\
        --phenoFile ${pheno} \\
        ${covar_arg} \\
        --bsize ${bsize_arg} \\
        --gz \\
        --threads ${task.cpus} \\
        ${args} \\
        --out ${prefix} \\
        --split-l0 ${prefix},${job_count}
    """

    stub:
    def input_prefix = plink_genotype_file.baseName
    def prefix = task.ext.prefix ?: input_prefix
    // Guard the range below: in Groovy `(1..0)` is the reverse range [1, 0],
    // which would silently create `_job1` and `_job0` placeholder files.
    def job_count = (n_jobs ?: 0) as Integer
    if (job_count < 1) {
        error("REGENIE_SPLITL0: n_jobs must be a positive integer, got '${n_jobs}'")
    }
    // One `touch` command per job, separated by real newlines
    def snplist_lines = (1..job_count).collect { job -> "touch ${prefix}_job${job}.snplist" }.join('\n')
    // Master rows are joined with a literal backslash-n that printf expands
    def master_lines = (1..job_count).collect { job -> "${prefix}_job${job} ${prefix}_job${job}.snplist" }.join('\\n')
    """
    printf 'job snplist\\n${master_lines}\\n' > ${prefix}.master
    ${snplist_lines}
    touch ${prefix}.log
    """
}
156 changes: 156 additions & 0 deletions modules/nf-core/regenie/splitl0/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
# Module identity: name, one-line description and search keywords for `regenie_splitl0`.
name: "regenie_splitl0"
description: Split REGENIE step 1 level-0 ridge-regression blocks into parallel jobs
keywords:
- regenie
- gwas
- association
- genomics
- parallel
# Upstream tool reference: project links, DOI, licence and bio.tools identifier.
tools:
- "regenie":
description: "Regenie is a C++ program for whole genome regression modelling of large genome-wide association studies (GWAS)."
homepage: "https://rgcgithub.github.io/regenie/"
documentation: "https://rgcgithub.github.io/regenie/options/"
tool_dev_url: "https://github.com/rgcgithub/regenie"
doi: "10.1038/s41588-021-00870-7"
licence: ["MIT"]
identifier: "biotools:regenie"

# Input channels, listed in the same order as the `input:` section of main.nf:
# genotype tuple, phenotype tuple, covariate tuple, then the `bsize` and `n_jobs` values.
input:
- - meta:
type: map
description: |
Groovy Map containing genotype information
Keep only the genotype analysis identifier in this map
REGENIE consumes the staged basename of `plink_genotype_file` as the `--bed` or `--pgen` prefix, so the `.bed/.bim/.fam` or `.pgen/.pvar/.psam` files must share one basename
e.g. `[ id:'cohort' ]`
- plink_genotype_file:
type: file
description: PLINK primary genotype file in BED or PGEN format
pattern: "*.{bed,pgen}"
ontologies:
- edam: "http://edamontology.org/format_3003" # BED
# Companion variant metadata file staged next to the primary genotype file.
- plink_variant_file:
type: file
description: PLINK variant metadata file in BIM or PVAR format
pattern: "*.{bim,pvar,zst}"
ontologies: []
# Companion sample metadata file staged next to the primary genotype file.
- plink_sample_file:
type: file
description: PLINK sample metadata file in FAM or PSAM format
pattern: "*.{fam,psam}"
ontologies: []
# Second tuple: the phenotype file.
- - meta2:
type: map
description: |
Groovy Map containing genotype/sample information associated with the phenotype file input
Use the same phenotype file and phenotype-selection arguments for all `regenie/splitl0`, `regenie/runl0`, and `regenie/runl1` jobs in the same chunked step 1 analysis
e.g. `[ id:'plink_simulated' ]`
- pheno:
type: file
description: Phenotype file passed to `--phenoFile`
pattern: "*.{phe,pheno,txt,tsv}"
ontologies:
- edam: "http://edamontology.org/format_3475" # TSV
# Third tuple: the optional covariate file.
- - meta3:
type: map
description: |
Groovy Map containing genotype/sample information associated with the covariate input
Use compatible covariate inputs for all stages in the same chunked step 1 analysis
e.g. `[ id:'plink_simulated' ]`
- covar:
type: file
optional: true
description: Optional covariate file passed to `--covarFile`; provide `[]` when absent
pattern: "*.{covar,cov,txt,tsv}"
ontologies:
- edam: "http://edamontology.org/format_3475" # TSV
# Plain value inputs (declared with `val` in main.nf, not wrapped in a meta tuple).
- bsize:
type: integer
description: Optional block size passed to `--bsize`; pass `[]` to use the module default of `1000`
- n_jobs:
type: integer
description: Number of level-0 jobs requested with `--split-l0`

# Output channels; each key matches an `emit:` name in main.nf.
output:
# `master`: the single `*.master` file.
master:
- - meta:
type: map
description: |
Groovy Map containing genotype/sample information
e.g. `[ id:'plink_simulated' ]`
- "*.master":
type: file
description: REGENIE split level-0 master file
pattern: "*.master"
ontologies:
- edam: "http://edamontology.org/format_2330" # Text
# `snplists`: all files matching `*_job*.snplist`.
snplists:
- - meta:
type: map
description: |
Groovy Map containing genotype/sample information
e.g. `[ id:'plink_simulated' ]`
- "*_job*.snplist":
type: file
description: REGENIE per-job variant list files referenced by the master file
pattern: "*_job*.snplist"
ontologies:
- edam: "http://edamontology.org/format_2330" # Text
# `log`: the `*.log` file.
log:
- - meta:
type: map
description: |
Groovy Map containing genotype information
e.g. `[ id:'plink_simulated' ]`
- "*.log":
type: file
description: REGENIE split level-0 log file
pattern: "*.log"
ontologies:
- edam: "http://edamontology.org/format_2330" # Text
# `versions_regenie`: (process name, tool name, evaluated version command) tuple.
versions_regenie:
- - "${task.process}":
type: string
description: The process the versions were collected from
- "regenie":
type: string
description: The tool name
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
type: eval
description: The command used to generate the version of the tool

# Topic channels: the version tuple is declared with `topic: versions` in main.nf,
# so the same three elements are described here under the `versions` topic.
topics:
versions:
- - ${task.process}:
type: string
description: The process the versions were collected from
- regenie:
type: string
description: The tool name
- 'regenie --version 2>&1 | sed -n "1{s/^v//;s/\.gz$//;p}"':
type: eval
description: The command used to generate the version of the tool

# Usage notes on `task.ext.args` and on keeping inputs consistent across the split-l0 / run-l0 / run-l1 stages.
notes: |
`task.ext.args` is passed directly to REGENIE and can be used for stage-consistent options such as `--phenoColList`, `--bt`, `--loocv`, or `--keep-l0`.
The same phenotype file, phenotype-selection arguments, trait mode arguments such as `--bt`, and compatible genotype/covariate inputs must be used across `regenie/splitl0`, every matching `regenie/runl0` job, and `regenie/runl1`.
authors:
- "@lyh970817"
maintainers:
- "@lyh970817"
# Container provenance: conda lock file plus docker and singularity build identifiers.
# The docker `name` and the singularity `https` URL are the same two images referenced
# by the `container` directive in main.nf.
containers:
conda:
linux_amd64:
lock_file: "modules/nf-core/regenie/splitl0/.conda-lock/linux_amd64-bd-5d361f9fcb2f85cf_1.txt"
docker:
linux_amd64:
build_id: "bd-5d361f9fcb2f85cf_1"
name: "community.wave.seqera.io/library/regenie:4.1.2--5d361f9fcb2f85cf"
scanId: "sc-cc9eb5ed5eb381dd_2"
singularity:
linux_amd64:
build_id: "bd-7c121fb4ecd57890_1"
name: "oras://community.wave.seqera.io/library/regenie:4.1.2--7c121fb4ecd57890"
https: "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/7a/7a05bf71ea09adc5ebf9f0c656c9b326c0f16ba8e4966914972e58313469a466/data"
148 changes: 148 additions & 0 deletions modules/nf-core/regenie/splitl0/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
nextflow_process {

name "Test Process REGENIE_SPLITL0"
config "./nextflow.config"
script "../main.nf"
process "REGENIE_SPLITL0"

tag "modules"
tag "modules_nfcore"
tag "regenie"
tag "regenie/splitl0"

// Real (non-stub) run on the PLINK1 test fileset with a covariate file,
// bsize 100 and two requested jobs. Checks channel sizes, meta ids, the shape
// of the master file, the snplist file names, and snapshots the stable parts
// of the output (file names and versions only).
test("homo_sapiens popgen - quantitative plink1 with covariates") {

    when {
        params {
            module_args = '--phenoColList QuantitativeTrait'
        }
        process {
            """
            input[0] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
            ]

            input[1] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
            ]

            input[2] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
            ]

            input[3] = 100
            input[4] = 2
            """
        }
    }

    then {
        assertAll(
            { assert process.success },
            { assert process.out.master.size() == 1 },
            { assert process.out.snplists.size() == 1 },
            { assert process.out.log.size() == 1 },
            { assert process.out.master.get(0).get(0).id == 'plink_simulated' },
            { assert process.out.snplists.get(0).get(0).id == 'plink_simulated' },
            { assert process.out.log.get(0).get(0).id == 'plink_simulated' },
            {
                def master = path(process.out.master.get(0).get(1))
                // Check existence before reading, so a missing file is reported
                // as an assertion failure rather than an I/O exception.
                assert master.exists()
                // Blank lines are ignored when counting
                def lines = master.text.readLines().findAll { it }
                // Two jobs were requested: one leading line plus one line per job
                assert lines.size() == 3
                // Leading line: two whitespace-separated integers
                assert lines[0] ==~ /\d+\s+\d+/
                // Job lines reference the job prefix and contain no directory separator
                assert lines.drop(1).every { line ->
                    line.contains('plink_simulated_job') && !line.contains('/')
                }
            },
            {
                def snplists = process.out.snplists.get(0).get(1)
                assert snplists.size() == 2
                assert snplists.collect { path(it).getFileName().toString() }.sort() == [
                    'plink_simulated_job1.snplist',
                    'plink_simulated_job2.snplist'
                ]
            },
            { assert path(process.out.log.get(0).get(1)).exists() },
            {
                // Snapshot file names only, not file contents
                def stableMaster = process.out.master.collect { master ->
                    [master[0], path(master[1]).getFileName().toString()]
                }
                def stableSnplists = process.out.snplists.collect { snplist ->
                    [snplist[0], snplist[1].collect { path(it).getFileName().toString() }.sort()]
                }
                assert snapshot(
                    stableMaster,
                    stableSnplists,
                    process.out.findAll { key, val -> key.startsWith('versions') }
                ).match()
            }
        )
    }

}

// Stub run (`-stub`) with the same inputs as the real test. Only checks that
// the process succeeds and snapshots output file names plus versions.
test("homo_sapiens popgen - plink1 - stub") {

    options "-stub"

    when {
        params {
            module_args = '--phenoColList QuantitativeTrait'
        }
        process {
            """
            input[0] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true),
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true),
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
            ]

            input[1] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_quantitative_phenoname.phe', checkIfExists: true)
            ]

            input[2] = [
                [ id:'plink_simulated' ],
                file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_covariates.txt', checkIfExists: true)
            ]

            input[3] = 100
            input[4] = 2
            """
        }
    }

    then {
        assertAll(
            { assert process.success },
            {
                // Reduce a staged path to its bare file name
                def nameOf = { staged -> path(staged).getFileName().toString() }

                def masterNames = process.out.master.collect { entry ->
                    [entry[0], nameOf(entry[1])]
                }
                def snplistNames = process.out.snplists.collect { entry ->
                    def names = entry[1].collect { item -> nameOf(item) }
                    [entry[0], names.sort()]
                }
                def logNames = process.out.log.collect { entry ->
                    [entry[0], nameOf(entry[1])]
                }
                def versionChannels = process.out.findAll { key, val -> key.startsWith('versions') }

                assert snapshot(
                    masterNames,
                    snplistNames,
                    logNames,
                    versionChannels
                ).match()
            }
        )
    }

}

}
Loading