Normalize gcta/addgrms GRM inputs

lyh970817 · lyh970817 · commit c3226e09ec68 · 2026-05-16T18:28:33.000+08:00
diff --git a/modules/nf-core/gcta/addgrms/main.nf b/modules/nf-core/gcta/addgrms/main.nf
@@ -2,31 +2,32 @@ process GCTA_ADDGRMS {
     tag "${meta.id}"
     label 'process_medium'
     conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'docker://community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9' :
-        'community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9' }"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/46/46b0d05f0daa47561d87d2a9cac5e51edc2c78e26f1bbab439c688386241a274/data'
+        : 'community.wave.seqera.io/library/gcta:1.94.1--9bc35dc424fcf6e9'}"
 
     input:
     tuple val(meta), path(mgrm_file), path(grm_files)
 
     output:
-    tuple val(meta), path("*.grm.id"), path("*.grm.bin"), path("*.grm.N.bin"), emit: combined_grm
-    tuple val("${task.process}"), val("gcta"), eval("gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//'"), emit: versions_gcta, topic: versions
+    tuple val(meta), path("*.grm.*"), emit: combined_grm
+    tuple val("${task.process}"), val("gcta"), eval("gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'"), emit: versions_gcta, topic: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def extra_args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """
+
     gcta \\
         --mgrm ${mgrm_file} \\
         --make-grm \\
         --out ${prefix} \\
         --thread-num ${task.cpus} \\
-        ${args}
+        ${extra_args}
     """
 
     stub:
diff --git a/modules/nf-core/gcta/addgrms/meta.yml b/modules/nf-core/gcta/addgrms/meta.yml
@@ -3,14 +3,17 @@ name: "gcta_addgrms"
 description: Combine multiple GRMs listed in an MGRM manifest into a single dense GRM
 keywords:
   - gcta
+  - genome-wide complex trait analysis
   - grm
+  - genetic relationship matrix
   - genetics
 tools:
   - "gcta":
       description: "Genome-wide Complex Trait Analysis (GCTA) estimates genetic relationships, variance components, and association statistics from genome-wide data."
       homepage: "https://yanglab.westlake.edu.cn/software/gcta/"
       documentation: "https://yanglab.westlake.edu.cn/software/gcta/static/gcta_doc_latest.pdf"
       tool_dev_url: "https://yanglab.westlake.edu.cn/software/gcta/"
+      licence: ["GPL-3.0-only"]
       identifier: "biotools:gcta"
 
 input:
@@ -38,20 +41,10 @@ output:
           description: |
             Groovy map containing combined GRM metadata
             e.g. `[ id:'plink_simulated' ]`
-      - "*.grm.id":
+      - "*.grm.*":
           type: file
-          description: Combined GRM sample identifier file
-          pattern: "*.grm.id"
-          ontologies: []
-      - "*.grm.bin":
-          type: file
-          description: Combined GRM binary matrix file
-          pattern: "*.grm.bin"
-          ontologies: []
-      - "*.grm.N.bin":
-          type: file
-          description: Combined GRM sample-count matrix file
-          pattern: "*.grm.N.bin"
+          description: Combined dense GRM file set (.grm.id, .grm.bin and .grm.N.bin)
+          pattern: "*.grm.{id,bin,N.bin}"
           ontologies: []
   versions_gcta:
     - - "${task.process}":
@@ -60,7 +53,7 @@ output:
       - "gcta":
           type: string
           description: The tool name
-      - "gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//'":
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
           type: eval
           description: The command used to retrieve the GCTA version
 
@@ -72,11 +65,11 @@ topics:
       - gcta:
           type: string
           description: The tool name
-      - gcta --version 2>&1 | grep 'version v' | tr -s ' ' | cut -d' ' -f3 | sed 's/^v//':
+      - "gcta --version | sed -En 's/^[*] version v([0-9.]*).*/\\1/p'":
           type: eval
           description: The command used to retrieve the GCTA version
 
 authors:
-  - "@andongni"
+  - "@lyh970817"
 maintainers:
-  - "@andongni"
+  - "@lyh970817"
diff --git a/modules/nf-core/gcta/addgrms/tests/main.nf.test b/modules/nf-core/gcta/addgrms/tests/main.nf.test
@@ -9,57 +9,37 @@ nextflow_process {
     tag "gcta"
     tag "gcta/addgrms"
     tag "gcta/makegrmpart"
-    tag "gawk"
 
     setup {
-        run("GAWK", alias: "GAWK_VARIANTS_LDMS1") {
-            script "../../../gawk/main.nf"
-            process {
-                """
-                input[0] = [
-                    [ id:'plink_simulated_ldms1_variants' ],
-                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
-                ]
-                input[1] = Channel.of('NR <= 110 { print \$2 }').collectFile(name:'ldms1_variants.awk')
-                input[2] = false
-                """
-            }
-        }
-
-        run("GAWK", alias: "GAWK_VARIANTS_LDMS2") {
-            script "../../../gawk/main.nf"
-            process {
-                """
-                input[0] = [
-                    [ id:'plink_simulated_ldms2_variants' ],
-                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
-                ]
-                input[1] = Channel.of('NR > 110 { print \$2 }').collectFile(name:'ldms2_variants.awk')
-                input[2] = false
-                """
-            }
-        }
-
         run("GCTA_MAKEGRMPART", alias: "GCTA_MAKEGRMPART_LDMS1") {
             script "../../makegrmpart/main.nf"
             process {
                 """
-                file('plink_simulated.mbfile').text = 'plink_simulated\\n'
+                file('plink_simulated_ldms1.mbfile').text = 'plink_simulated\\n'
+
+                def bimFile = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
+                def extractSnps = bimFile.readLines()
+                    .take(10)
+                    .collect { row -> row.trim().split(/\\s+/)[1] }
+                    .join('\\n') + '\\n'
+                file('plink_simulated_ldms1.snps.txt').text = extractSnps
 
                 input[0] = [
-                    [ id:'plink_simulated_ldms1', part_gcta_job:1, nparts_gcta:1 ],
-                    file('plink_simulated.mbfile'),
+                    [ id:'plink_simulated_ldms1' ],
+                    1,
+                    1,
+                    file('plink_simulated_ldms1.mbfile'),
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true)
                     ],
                     [
-                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
+                        bimFile
                     ],
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
                     ]
                 ]
-                input[1] = GAWK_VARIANTS_LDMS1.out.output
+                input[1] = [[ id:'plink_simulated_ldms1' ], file('plink_simulated_ldms1.snps.txt')]
                 """
             }
         }
@@ -68,22 +48,32 @@ nextflow_process {
             script "../../makegrmpart/main.nf"
             process {
                 """
-                file('plink_simulated.mbfile').text = 'plink_simulated\\n'
+                file('plink_simulated_ldms2.mbfile').text = 'plink_simulated\\n'
+
+                def bimFile = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
+                def extractSnps = bimFile.readLines()
+                    .drop(10)
+                    .take(10)
+                    .collect { row -> row.trim().split(/\\s+/)[1] }
+                    .join('\\n') + '\\n'
+                file('plink_simulated_ldms2.snps.txt').text = extractSnps
 
                 input[0] = [
-                    [ id:'plink_simulated_ldms2', part_gcta_job:1, nparts_gcta:1 ],
-                    file('plink_simulated.mbfile'),
+                    [ id:'plink_simulated_ldms2' ],
+                    1,
+                    1,
+                    file('plink_simulated_ldms2.mbfile'),
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bed', checkIfExists: true)
                     ],
                     [
-                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.bim', checkIfExists: true)
+                        bimFile
                     ],
                     [
                         file(params.modules_testdata_base_path + 'genomics/homo_sapiens/popgen/plink_simulated.fam', checkIfExists: true)
                     ]
                 ]
-                input[1] = GAWK_VARIANTS_LDMS2.out.output
+                input[1] = [[ id:'plink_simulated_ldms2' ], file('plink_simulated_ldms2.snps.txt')]
                 """
             }
         }
@@ -101,9 +91,9 @@ nextflow_process {
 
                 grm_files = GCTA_MAKEGRMPART_LDMS1.out.grm_files
                     .mix(GCTA_MAKEGRMPART_LDMS2.out.grm_files)
-                    .map { meta, grm_id, grm_bin, grm_n_bin -> [grm_id, grm_bin, grm_n_bin] }
+                    .map { meta, grm_files, nparts_gcta, part_gcta_job -> grm_files }
                     .collect()
-                    .map { rows -> rows.sort { left, right -> left[0].name <=> right[0].name }.flatten() }
+                    .map { bundles -> bundles.flatten().sort { it.name } }
 
                 input[0] = mgrm_file
                     .combine(grm_files)
@@ -117,7 +107,13 @@ nextflow_process {
                 { assert process.success },
                 { assert process.out.combined_grm.size() == 1 },
                 { assert process.out.combined_grm.get(0).get(0).id == "plink_simulated_ldms" },
-                { assert file(process.out.combined_grm.get(0).get(1)).name == "plink_simulated_ldms.grm.id" },
+                {
+                    assert process.out.combined_grm.get(0).get(1).collect { file(it).name }.toSet() == [
+                        'plink_simulated_ldms.grm.id',
+                        'plink_simulated_ldms.grm.bin',
+                        'plink_simulated_ldms.grm.N.bin'
+                    ] as Set
+                },
                 {
                     assert snapshot(
                         process.out.combined_grm,
@@ -141,9 +137,9 @@ nextflow_process {
 
                 grm_files = GCTA_MAKEGRMPART_LDMS1.out.grm_files
                     .mix(GCTA_MAKEGRMPART_LDMS2.out.grm_files)
-                    .map { meta, grm_id, grm_bin, grm_n_bin -> [grm_id, grm_bin, grm_n_bin] }
+                    .map { meta, grm_files, nparts_gcta, part_gcta_job -> grm_files }
                     .collect()
-                    .map { rows -> rows.sort { left, right -> left[0].name <=> right[0].name }.flatten() }
+                    .map { bundles -> bundles.flatten().sort { it.name } }
 
                 input[0] = mgrm_file
                     .combine(grm_files)
diff --git a/modules/nf-core/gcta/addgrms/tests/main.nf.test.snap b/modules/nf-core/gcta/addgrms/tests/main.nf.test.snap
@@ -6,9 +6,11 @@
                     {
                         "id": "plink_simulated_ldms"
                     },
-                    "plink_simulated_ldms.grm.id:md5,d41d8cd98f00b204e9800998ecf8427e",
-                    "plink_simulated_ldms.grm.bin:md5,d41d8cd98f00b204e9800998ecf8427e",
-                    "plink_simulated_ldms.grm.N.bin:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    [
+                        "plink_simulated_ldms.grm.N.bin:md5,d41d8cd98f00b204e9800998ecf8427e",
+                        "plink_simulated_ldms.grm.bin:md5,d41d8cd98f00b204e9800998ecf8427e",
+                        "plink_simulated_ldms.grm.id:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
                 ]
             ],
             {
@@ -25,7 +27,7 @@
             "nf-test": "0.9.3",
             "nextflow": "25.10.4"
         },
-        "timestamp": "2026-03-21T00:42:02.641342626"
+        "timestamp": "2026-05-15T21:10:46.231316108"
     },
     "homo_sapiens popgen - merge dense GRMs from mgrm": {
         "content": [
@@ -34,9 +36,11 @@
                     {
                         "id": "plink_simulated_ldms"
                     },
-                    "plink_simulated_ldms.grm.id:md5,4f9aa36c44a417ff6d7caa9841e66ad9",
-                    "plink_simulated_ldms.grm.bin:md5,59a9d628e3fb4b9488244048c952b2ca",
-                    "plink_simulated_ldms.grm.N.bin:md5,acaa43bbbf2253d392537a178ecf09a4"
+                    [
+                        "plink_simulated_ldms.grm.N.bin:md5,804f8e1799c8b2d4d3df1b52a2a463c6",
+                        "plink_simulated_ldms.grm.bin:md5,850235911329bf9ab68f03e25bbc1ef1",
+                        "plink_simulated_ldms.grm.id:md5,4f9aa36c44a417ff6d7caa9841e66ad9"
+                    ]
                 ]
             ],
             {
@@ -53,6 +57,6 @@
             "nf-test": "0.9.3",
             "nextflow": "25.10.4"
         },
-        "timestamp": "2026-03-21T00:41:50.805078215"
+        "timestamp": "2026-05-15T22:52:43.953267272"
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -6,9 +6,11 @@`
`6`	`6`	`{`
`7`	`7`	`"id": "plink_simulated_ldms"`
`8`	`8`	`},`
`9`		`- "plink_simulated_ldms.grm.id:md5,d41d8cd98f00b204e9800998ecf8427e",`
`10`		`- "plink_simulated_ldms.grm.bin:md5,d41d8cd98f00b204e9800998ecf8427e",`
`11`		`- "plink_simulated_ldms.grm.N.bin:md5,d41d8cd98f00b204e9800998ecf8427e"`
	`9`	`+ [`
	`10`	`+ "plink_simulated_ldms.grm.N.bin:md5,d41d8cd98f00b204e9800998ecf8427e",`
	`11`	`+ "plink_simulated_ldms.grm.bin:md5,d41d8cd98f00b204e9800998ecf8427e",`
	`12`	`+ "plink_simulated_ldms.grm.id:md5,d41d8cd98f00b204e9800998ecf8427e"`
	`13`	`+ ]`
`12`	`14`	`]`
`13`	`15`	`],`
`14`	`16`	`{`
`@@ -25,7 +27,7 @@`
`25`	`27`	`"nf-test": "0.9.3",`
`26`	`28`	`"nextflow": "25.10.4"`
`27`	`29`	`},`
`28`		`- "timestamp": "2026-03-21T00:42:02.641342626"`
	`30`	`+ "timestamp": "2026-05-15T21:10:46.231316108"`
`29`	`31`	`},`
`30`	`32`	`"homo_sapiens popgen - merge dense GRMs from mgrm": {`
`31`	`33`	`"content": [`
`@@ -34,9 +36,11 @@`
`34`	`36`	`{`
`35`	`37`	`"id": "plink_simulated_ldms"`
`36`	`38`	`},`
`37`		`- "plink_simulated_ldms.grm.id:md5,4f9aa36c44a417ff6d7caa9841e66ad9",`
`38`		`- "plink_simulated_ldms.grm.bin:md5,59a9d628e3fb4b9488244048c952b2ca",`
`39`		`- "plink_simulated_ldms.grm.N.bin:md5,acaa43bbbf2253d392537a178ecf09a4"`
	`39`	`+ [`
	`40`	`+ "plink_simulated_ldms.grm.N.bin:md5,804f8e1799c8b2d4d3df1b52a2a463c6",`
	`41`	`+ "plink_simulated_ldms.grm.bin:md5,850235911329bf9ab68f03e25bbc1ef1",`
	`42`	`+ "plink_simulated_ldms.grm.id:md5,4f9aa36c44a417ff6d7caa9841e66ad9"`
	`43`	`+ ]`
`40`	`44`	`]`
`41`	`45`	`],`
`42`	`46`	`{`
`@@ -53,6 +57,6 @@`
`53`	`57`	`"nf-test": "0.9.3",`
`54`	`58`	`"nextflow": "25.10.4"`
`55`	`59`	`},`
`56`		`- "timestamp": "2026-03-21T00:41:50.805078215"`
	`60`	`+ "timestamp": "2026-05-15T22:52:43.953267272"`
`57`	`61`	`}`
`58`	`62`	`}`