From 8aa7c1f69a8f0a83f207ad8763e8f5db7f06ebfb Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 26 Jun 2026 14:36:33 +0100 Subject: [PATCH 1/2] fix(custom/orfnormalise): treat ribotish GenomePos start as 0-based ribotish `predict` emits 0-based half-open `GenomePos` coordinates, but the parser subtracted 1 from the start, treating it as 1-based. That shifted every ribotish-derived ORF 5' by one base: a reading-frame shift on + strand records (premature in-frame stops, and frame-2 codon positions in any downstream codon-aware step) and a 3' overhang on - strand. Verified against the genome - the corrected start lands on the ATG and translates to a clean full-length ORF. Also: - migrate the test fixtures off the deleted `pinin4fjords/test-datasets` `add-orf-prediction-fixtures` branch to `modules_testdata_base_path` (nf-core/test-datasets), which the test now depends on to run at all. - add a codon-boundary regression guard asserting every emitted ORF spans a whole number of codons, which catches this class of coordinate bug for all callers. - regenerate the orfnormalise and orftable_fasta_gtf_buildorfcatalogue snapshots (only the ribotish-derived records change). Co-Authored-By: Claude Opus 4.8 --- .../orfnormalise/templates/orfnormalise.py | 3 +- .../custom/orfnormalise/tests/main.nf.test | 22 +++++++--- .../orfnormalise/tests/main.nf.test.snap | 32 +++++++------- .../tests/main.nf.test.snap | 44 +++++++++---------- 4 files changed, 56 insertions(+), 45 deletions(-) diff --git a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py index f5b8cba8d0f7..c9ee8189a4f8 100644 --- a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py +++ b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py @@ -517,7 +517,8 @@ def _parse_ribotish_genpos(s): m = _RIBOTISH_GENPOS_RE.match(s.strip()) if not m: return None - return m.group(1), int(m.group(2)) - 1, int(m.group(3)), m.group(4) + # GenomePos is 0-based half-open (BED-style). + return m.group(1), int(m.group(2)), int(m.group(3)), m.group(4) def parse_ribotish(path, transcripts, fields): diff --git a/modules/nf-core/custom/orfnormalise/tests/main.nf.test b/modules/nf-core/custom/orfnormalise/tests/main.nf.test index 7e8265721f48..07aac788f344 100644 --- a/modules/nf-core/custom/orfnormalise/tests/main.nf.test +++ b/modules/nf-core/custom/orfnormalise/tests/main.nf.test @@ -16,7 +16,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'sample1'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true), 'ribocode' ]) input[1] = channel.of([ @@ -39,6 +39,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out).match() }, + // every ORF must span a whole number of codons (frame guard) + { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } }, { assert rows.size() > 0 }, { assert rows.every { it[aa].toInteger() > 0 } }, { assert rows.every { it[score] && it[score] != '' } }, @@ -57,7 +59,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'sample1'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true), 'ribotish' ]) input[1] = channel.of([ @@ -80,6 +82,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out).match() }, + // every ORF must span a whole number of codons (frame guard) + { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } }, { assert rows.size() > 0 }, { assert rows.every { it[aa].toInteger() > 0 } }, { assert rows.every { it[score] && it[score] != '' } }, @@ -95,7 +99,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'sample1'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true), 'ribotricer' ]) input[1] = channel.of([ @@ -123,6 +127,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out).match() }, + // every ORF must span a whole number of codons (frame guard) + { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } }, { assert rows.size() > 0 }, { assert rows.every { it[aa].toInteger() > 0 } }, { assert rows.every { it[score] && it[score] != '' } }, @@ -139,7 +145,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'sample1'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true), 'rpbp' ]) input[1] = channel.of([ @@ -162,6 +168,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out).match() }, + // every ORF must span a whole number of codons (frame guard) + { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } }, { assert rows.size() > 0 }, { assert rows.every { it[aa].toInteger() > 0 } }, { assert rows.every { it[score] && it[score] != '' && it[score].toDouble() > 0 } }, @@ -179,7 +187,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'cohort'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true), 'price' ]) input[1] = channel.of([ @@ -206,6 +214,8 @@ nextflow_process { assertAll( { assert process.success }, { assert snapshot(process.out).match() }, + // every ORF must span a whole number of codons (frame guard) + { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } }, { assert rows.size() > 0 }, { assert rows.every { it[aa].toInteger() > 0 } }, { assert rows.collect { it[cls] }.toSet().any { it != 'other' } }, @@ -222,7 +232,7 @@ nextflow_process { """ input[0] = channel.of([ [id: 'sample1'], - file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true), 'ribotish' ]) input[1] = channel.of([ diff --git a/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap b/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap index 46734d33c011..e6dd7fce24e3 100644 --- a/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap +++ b/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap @@ -42,11 +42,11 @@ ] } ], - "timestamp": "2026-06-12T13:07:41.301245", "meta": { "nf-test": "0.9.5", "nextflow": "25.10.4" - } + }, + "timestamp": "2026-06-12T13:07:41.301245" }, "homo_sapiens [chr20] - rpbp": { "content": [ @@ -91,11 +91,11 @@ ] } ], - "timestamp": "2026-06-12T13:07:53.372563", "meta": { "nf-test": "0.9.5", "nextflow": "25.10.4" - } + }, + "timestamp": "2026-06-12T13:07:53.372563" }, "homo_sapiens [chr20] - ribocode": { "content": [ @@ -140,11 +140,11 @@ ] } ], - "timestamp": "2026-06-12T13:07:17.199267", "meta": { "nf-test": "0.9.5", "nextflow": "25.10.4" - } + }, + "timestamp": "2026-06-12T13:07:17.199267" }, "homo_sapiens [chr20] - ribotish": { "content": [ @@ -154,7 +154,7 @@ { "id": "sample1" }, - "sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534" + "sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e" ] ], "1": [ @@ -162,7 +162,7 @@ { "id": "sample1" }, - "sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8" + "sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902" ] ], "2": [ @@ -173,7 +173,7 @@ { "id": "sample1" }, - "sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534" + "sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e" ] ], "tsv": [ @@ -181,7 +181,7 @@ { "id": "sample1" }, - "sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8" + "sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902" ] ], "versions": [ @@ -189,11 +189,11 @@ ] } ], - "timestamp": "2026-06-12T13:07:28.989952", "meta": { - "nf-test": "0.9.5", - "nextflow": "25.10.4" - } + "nf-test": "0.9.3", + "nextflow": "25.04.8" + }, + "timestamp": "2026-06-26T13:34:16.887667785" }, "homo_sapiens [chr19+chr22] - price": { "content": [ @@ -238,10 +238,10 @@ ] } ], - "timestamp": "2026-06-12T13:07:59.997149", "meta": { "nf-test": "0.9.5", "nextflow": "25.10.4" - } + }, + "timestamp": "2026-06-12T13:07:59.997149" } } \ No newline at end of file diff --git a/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap b/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap index 60b1de48e11b..be93544dbf99 100644 --- a/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap +++ b/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap @@ -7,7 +7,7 @@ { "id": "cohort" }, - "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "1": [ @@ -15,7 +15,7 @@ { "id": "cohort" }, - "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "2": [ @@ -31,7 +31,7 @@ { "id": "cohort" }, - "cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3" + "cohort.catalogue.fasta:md5,70aadcfb2bb15da2dd941536784ed202" ] ], "4": [ @@ -47,7 +47,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "6": [ @@ -55,7 +55,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "7": [ @@ -71,7 +71,7 @@ { "id": "cohort" }, - "cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3" + "cohort.catalogue.fasta:md5,70aadcfb2bb15da2dd941536784ed202" ] ], "catalogue_bed12": [ @@ -79,7 +79,7 @@ { "id": "cohort" }, - "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "catalogue_tsv": [ @@ -87,7 +87,7 @@ { "id": "cohort" }, - "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "consensus_bed12": [ @@ -95,7 +95,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "consensus_orf_to_gene_tsv": [ @@ -111,7 +111,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "multiqc": [ @@ -136,7 +136,7 @@ "nf-test": "0.9.3", "nextflow": "25.04.8" }, - "timestamp": "2026-06-25T15:38:17.504637551" + "timestamp": "2026-06-26T13:34:49.442717716" }, "homo_sapiens [chr20] - ribotish + ribocode - collapse disabled": { "content": [ @@ -146,7 +146,7 @@ { "id": "cohort" }, - "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "1": [ @@ -154,7 +154,7 @@ { "id": "cohort" }, - "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "2": [ @@ -170,7 +170,7 @@ { "id": "cohort" }, - "cohort.catalogue.aa.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0" + "cohort.catalogue.aa.fasta:md5,fc2245be6a462ebeb3ab653d7329e685" ] ], "4": [ @@ -186,7 +186,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "6": [ @@ -194,7 +194,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "7": [ @@ -210,7 +210,7 @@ { "id": "cohort" }, - "cohort.catalogue.aa.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0" + "cohort.catalogue.aa.fasta:md5,fc2245be6a462ebeb3ab653d7329e685" ] ], "catalogue_bed12": [ @@ -218,7 +218,7 @@ { "id": "cohort" }, - "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "catalogue_tsv": [ @@ -226,7 +226,7 @@ { "id": "cohort" }, - "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "consensus_bed12": [ @@ -234,7 +234,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7" + "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f" ] ], "consensus_orf_to_gene_tsv": [ @@ -250,7 +250,7 @@ { "id": "cohort" }, - "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc" + "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230" ] ], "multiqc": [ @@ -275,6 +275,6 @@ "nf-test": "0.9.3", "nextflow": "25.04.8" }, - "timestamp": "2026-06-25T20:02:00.203522045" + "timestamp": "2026-06-26T13:34:56.775003133" } } \ No newline at end of file From 6d272e8758025825d6a941156cf973ddf72e48b9 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 30 Jun 2026 15:03:42 +0100 Subject: [PATCH 2/2] refactor(custom/orfnormalise): drop dead ribotish Blocks branch ribotish `predict` reports a single genomic span (GenomePos) and no per-exon `Blocks` column, so `parse_intervals(row.get("Blocks", ...))` never fired - exon structure is always recovered from the GenomePos span intersected with the transcript's exons. Remove the dead branch so ribotish always uses that verified path. This also removes a latent foot-gun: `parse_intervals` is 1-based (correct for ribotricer), so had ribotish ever emitted a 0-based `Blocks` column it would have reintroduced the off-by-one this PR fixes. Output is unchanged (the branch was dead); snapshots match without regen. Co-Authored-By: Claude Opus 4.8 --- .../orfnormalise/templates/orfnormalise.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py index c9ee8189a4f8..6fb35d7ee1ab 100644 --- a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py +++ b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py @@ -552,15 +552,16 @@ def parse_ribotish(path, transcripts, fields): continue chrom, start, end, strand = gp - blocks = parse_intervals(row.get("Blocks", ""), seps=(":", "-")) - if not blocks: - tx = transcripts.get(tid) - if tx is None: + # ribotish predict reports one genomic span (GenomePos), not + # per-exon blocks, so the exon structure is recovered by + # intersecting that span with the transcript's exons. + tx = transcripts.get(tid) + if tx is None: + blocks = [(start, end)] + else: + blocks = [(max(start, gs), min(end, ge)) for gs, ge in tx.exons if min(end, ge) > max(start, gs)] + if not blocks: blocks = [(start, end)] - else: - blocks = [(max(start, gs), min(end, ge)) for gs, ge in tx.exons if min(end, ge) > max(start, gs)] - if not blocks: - blocks = [(start, end)] rows.append( {