Normalize gcta/remlldms LDMS setup

lyh970817 · lyh970817 · commit 9198ed58ecdd · 2026-05-16T18:30:17.000+08:00
diff --git a/modules/nf-core/gcta/remlldms/main.nf b/modules/nf-core/gcta/remlldms/main.nf
@@ -2,28 +2,28 @@ process GCTA_REMLLDMS {
     tag "gcta_reml_ldms_${meta.id}_${meta2.id}"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'docker://community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9' :
-        'community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9' }"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/46/46b0d05f0daa47561d87d2a9cac5e51edc2c78e26f1bbab439c688386241a274/data'
+        : 'community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9'}"
 
     input:
-    tuple val(meta), path(phenotypes_file)
+    tuple val(meta), path(phenotypes_file), val(mpheno), val(prevalence)
     tuple val(meta2), path(mgrm_file), path(grm_files)
     tuple val(meta3), path(quant_covariates_file)
     tuple val(meta4), path(cat_covariates_file)
-    val(mpheno)
 
     output:
     tuple val(meta), path("*.hsq"), emit: reml_results
-    tuple val("${task.process}"), val("gcta"), eval("gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//'"), emit: versions_gcta, topic: versions
+    tuple val("${task.process}"), val("gcta"), eval("gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'"), emit: versions_gcta, topic: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def mpheno_value = (mpheno == null || (mpheno instanceof Collection && mpheno.isEmpty())) ? 1 : mpheno
+    def mpheno_value = mpheno ?: 1
     def mpheno_param = "--mpheno ${mpheno_value}"
+    def prevalence_param = prevalence ? "--prevalence ${prevalence}" : ''
     def qcovar_param = quant_covariates_file ? "--qcovar ${quant_covariates_file}" : ''
     def covar_param = cat_covariates_file ? "--covar ${cat_covariates_file}" : ''
     def extra_args = task.ext.args ?: ''
@@ -32,10 +32,12 @@ process GCTA_REMLLDMS {
     set -euo pipefail
 
     gcta \\
+        --reml \\
         --reml-no-constrain \\
         --mgrm ${mgrm_file} \\
         --pheno ${phenotypes_file} \\
         ${mpheno_param} \\
+        ${prevalence_param} \\
         ${qcovar_param} \\
         ${covar_param} \\
         --out "${prefix}" \\
diff --git a/modules/nf-core/gcta/remlldms/meta.yml b/modules/nf-core/gcta/remlldms/meta.yml
@@ -3,15 +3,22 @@ name: "gcta_remlldms"
 description: Run REML-LDMS heritability estimation with an MGRM manifest
 keywords:
   - gcta
+  - genome-wide complex trait analysis
   - reml
+  - restricted maximum likelihood
   - ldms
+  - linkage disequilibrium and minor allele frequency stratification
+  - grm
+  - genetic relationship matrix
   - genetics
 tools:
   - "gcta":
       description: "Genome-wide Complex Trait Analysis (GCTA) estimates genetic relationships, variance components, and association statistics from genome-wide data."
       homepage: "https://yanglab.westlake.edu.cn/software/gcta/"
       documentation: "https://yanglab.westlake.edu.cn/software/gcta/static/gcta_doc_latest.pdf"
       tool_dev_url: "https://yanglab.westlake.edu.cn/software/gcta/"
+      licence: ["GPL-3.0-only"]
+      identifier: "biotools:gcta"
 input:
   - - meta:
         type: map
@@ -24,6 +31,17 @@ input:
         pattern: "*.{phe,pheno,txt,tsv}"
         ontologies:
           - edam: "http://edamontology.org/format_3475"
+    - mpheno:
+        type: integer
+        description: |
+          Phenotype column selector passed to `--mpheno`.
+          Pass `[]` or `null` to use the default first phenotype column
+          (`--mpheno 1`).
+    - prevalence:
+        type: float
+        description: |
+          Population prevalence (a proportion, e.g. `0.1`) passed to `--prevalence` for case-control traits.
+          Pass `[]` or `null` for quantitative traits; the `--prevalence` flag is then omitted.
   - - meta2:
         type: map
         description: |
@@ -37,8 +55,8 @@ input:
           - edam: "http://edamontology.org/format_2330"
     - grm_files:
         type: file
-        description: GRM sidecar files referenced by `mgrm_file`
-        pattern: "*"
+        description: GRM binary files (`.grm.id`, `.grm.bin`, `.grm.N.bin`) referenced by `mgrm_file`
+        pattern: "*.grm.*"
         ontologies: []
   - - meta3:
         type: map
@@ -62,11 +80,6 @@ input:
         pattern: "*.{covar,cov,txt,tsv}"
         ontologies:
           - edam: "http://edamontology.org/format_3475"
-  - mpheno:
-      type: integer
-      description: |
-        Phenotype column selector passed to `--mpheno`.
-        Pass `1` explicitly for the default first phenotype column.
 output:
   reml_results:
     - - meta:
@@ -87,7 +100,7 @@ output:
       - "gcta":
           type: string
           description: The tool name
-      - "gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//'":
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
           type: eval
           description: The command used to retrieve the GCTA version
 topics:
@@ -98,10 +111,10 @@ topics:
       - gcta:
           type: string
           description: The tool name
-      - gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//':
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
           type: eval
           description: The command used to retrieve the GCTA version
 authors:
-  - "@andongni"
+  - "@lyh970817"
 maintainers:
-  - "@andongni"
+  - "@lyh970817"
diff --git a/modules/nf-core/gcta/remlldms/tests/main.nf.test b/modules/nf-core/gcta/remlldms/tests/main.nf.test
@@ -40,6 +40,20 @@ nextflow_process {
             }
         }
 
+        run("GAWK", alias: "GAWK_BINARY_PHENOTYPE") {
+            script "../../../gawk/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id:'BinaryTrait' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated_binary_phenoname.phe', checkIfExists: true)
+                ]
+                input[1] = Channel.of('FNR == 1 { next } { print \$1, \$2, \$3 + 1 }').collectFile(name:'binary_phenotypes.awk')
+                input[2] = false
+                """
+            }
+        }
+
         run("GAWK", alias: "GAWK_QUANTITATIVE_COVARIATES") {
             script "../../../gawk/main.nf"
             process {
@@ -68,14 +82,16 @@ nextflow_process {
             }
         }
 
-        run("GCTA_MAKEGRMPART", alias: "GCTA_MAKEGRM_LDMS1") {
+        run("GCTA_MAKEGRMPART", alias: "GCTA_MAKEGRMPART_LDMS1") {
             script "../../makegrmpart/main.nf"
             process {
                 """
                 file('plink_simulated.mbfile').text = 'plink_simulated\\n'
 
                 input[0] = [
-                    [ id:'plink_simulated_ldms1', part_gcta_job:1, nparts_gcta:1 ],
+                    [ id:'plink_simulated_ldms1' ],
+                    1,
+                    1,
                     file('plink_simulated.mbfile'),
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true)
@@ -98,17 +114,16 @@ nextflow_process {
         when {
             process {
                 """
-                input[0] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitMpheno2' ], pheno] }
+                input[0] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitMpheno2' ], pheno, 2, []] }
                 input[1] = Channel
                     .of('plink_simulated_ldms1.part_1_1')
                     .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output
                 input[3] = GAWK_CATEGORICAL_COVARIATES.out.output
-                input[4] = 2
                 """
             }
         }
@@ -133,17 +148,16 @@ nextflow_process {
         when {
             process {
                 """
-                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output
+                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [meta, pheno, 1, []] }
                 input[1] = Channel
                     .of('plink_simulated_ldms1.part_1_1')
                     .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output
                 input[3] = GAWK_CATEGORICAL_COVARIATES.out.output
-                input[4] = 1
                 """
             }
         }
@@ -168,17 +182,16 @@ nextflow_process {
         when {
             process {
                 """
-                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output
+                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [meta, pheno, 1, null] }
                 input[1] = Channel
                     .of('plink_simulated_ldms1.part_1_1')
                     .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = [[ id:'covariates_quant' ], []]
                 input[3] = [[ id:'covariates_cat' ], []]
-                input[4] = 1
                 """
             }
         }
@@ -198,22 +211,55 @@ nextflow_process {
         }
     }
 
+    test("homo_sapiens gsmr - binary phenotype with ldms mgrm and prevalence") {
+        config "./nextflow.config"
+        when {
+            process {
+                """
+                input[0] = GAWK_BINARY_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'BinaryTraitPrevalence' ], pheno, 1, 0.1] }
+                input[1] = Channel
+                    .of('plink_simulated_ldms1.part_1_1')
+                    .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
+                    }
+                input[2] = [[ id:'covariates_quant' ], []]
+                input[3] = [[ id:'covariates_cat' ], []]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.reml_results.size() == 1 },
+                { assert process.out.reml_results.get(0).get(0).id == "BinaryTraitPrevalence" },
+                {
+                    assert snapshot(
+                        process.out.reml_results,
+                        process.out.findAll { key, val -> key.startsWith('versions') }
+                    ).match()
+                }
+            )
+        }
+    }
+
     test("homo_sapiens gsmr - ldms mgrm mpheno defaults to first phenotype when empty") {
         config "./nextflow.config"
         when {
             process {
                 """
-                input[0] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitMphenoDefault' ], pheno] }
+                input[0] = GAWK_MULTI_PHENOTYPES.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitMphenoDefault' ], pheno, [], []] }
                 input[1] = Channel
                     .of('plink_simulated_ldms1.part_1_1')
                     .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output
                 input[3] = GAWK_CATEGORICAL_COVARIATES.out.output
-                input[4] = []
                 """
             }
         }
@@ -238,17 +284,16 @@ nextflow_process {
         when {
             process {
                 """
-                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitInvalidLdms' ], pheno] }
+                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [[ id:'QuantitativeTraitInvalidLdms' ], pheno, 1, []] }
                 input[1] = Channel
                     .of('plink_simulated_ldms_missing.part_1_1')
                     .collectFile(name:'plink_simulated_ldms_broken.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = GAWK_QUANTITATIVE_COVARIATES.out.output
                 input[3] = GAWK_CATEGORICAL_COVARIATES.out.output
-                input[4] = 1
                 """
             }
         }
@@ -268,17 +313,16 @@ nextflow_process {
         when {
             process {
                 """
-                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output
+                input[0] = GAWK_QUANTITATIVE_PHENOTYPE.out.output.map { meta, pheno -> [meta, pheno, 1, []] }
                 input[1] = Channel
                     .of('plink_simulated_ldms1.part_1_1')
                     .collectFile(name:'plink_simulated_ldms.mgrm', newLine: true)
-                    .combine(GCTA_MAKEGRM_LDMS1.out.grm_files)
-                    .map { mgrm_file, meta, grm_id, grm_bin, grm_n_bin ->
-                        [[ id:'plink_simulated_ldms' ], mgrm_file, [grm_id, grm_bin, grm_n_bin]]
+                    .combine(GCTA_MAKEGRMPART_LDMS1.out.grm_files)
+                    .map { mgrm_file, meta, grm_files, nparts_gcta, part_gcta_job ->
+                        [[ id:'plink_simulated_ldms' ], mgrm_file, grm_files]
                     }
                 input[2] = [[ id:'covariates_quant' ], []]
                 input[3] = [[ id:'covariates_cat' ], []]
-                input[4] = 1
                 """
             }
         }
diff --git a/modules/nf-core/gcta/remlldms/tests/main.nf.test.snap b/modules/nf-core/gcta/remlldms/tests/main.nf.test.snap
@@ -128,5 +128,31 @@
             "nextflow": "25.10.4"
         },
         "timestamp": "2026-03-21T18:45:48.772452882"
+    },
+    "homo_sapiens gsmr - binary phenotype with ldms mgrm and prevalence": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "BinaryTraitPrevalence"
+                    },
+                    "BinaryTraitPrevalence.hsq:md5,7abad219d974e43713fc65d61f62fd30"
+                ]
+            ],
+            {
+                "versions_gcta": [
+                    [
+                        "GCTA_REMLLDMS",
+                        "gcta",
+                        "1.94.1"
+                    ]
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.4"
+        },
+        "timestamp": "2026-05-13T16:08:04.492576073"
     }
 }