From 8aa7c1f69a8f0a83f207ad8763e8f5db7f06ebfb Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jonathan.manning@seqera.io>
Date: Fri, 26 Jun 2026 14:36:33 +0100
Subject: [PATCH 1/2] fix(custom/orfnormalise): treat ribotish GenomePos start
 as 0-based

ribotish `predict` emits 0-based half-open `GenomePos` coordinates, but the
parser subtracted 1 from the start, treating it as 1-based. That shifted every
ribotish-derived ORF 5' by one base: a reading-frame shift on + strand records
(premature in-frame stops, and frame-2 codon positions in any downstream
codon-aware step) and a 3' overhang on - strand. Verified against the genome -
the corrected start lands on the ATG and translates to a clean full-length ORF.

Also:
- migrate the test fixtures off the deleted `pinin4fjords/test-datasets`
  `add-orf-prediction-fixtures` branch to `modules_testdata_base_path`
  (nf-core/test-datasets), which the test now depends on to run at all.
- add a codon-boundary regression guard asserting every emitted ORF spans a
  whole number of codons, which catches this class of coordinate bug for all
  callers.
- regenerate the orfnormalise and orftable_fasta_gtf_buildorfcatalogue
  snapshots (only the ribotish-derived records change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../orfnormalise/templates/orfnormalise.py    |  3 +-
 .../custom/orfnormalise/tests/main.nf.test    | 22 +++++++---
 .../orfnormalise/tests/main.nf.test.snap      | 32 +++++++-------
 .../tests/main.nf.test.snap                   | 44 +++++++++----------
 4 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
index f5b8cba8d0f7..c9ee8189a4f8 100644
--- a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
+++ b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
@@ -517,7 +517,8 @@ def _parse_ribotish_genpos(s):
     m = _RIBOTISH_GENPOS_RE.match(s.strip())
     if not m:
         return None
-    return m.group(1), int(m.group(2)) - 1, int(m.group(3)), m.group(4)
+    # GenomePos is 0-based half-open (BED-style).
+    return m.group(1), int(m.group(2)), int(m.group(3)), m.group(4)
 
 
 def parse_ribotish(path, transcripts, fields):
diff --git a/modules/nf-core/custom/orfnormalise/tests/main.nf.test b/modules/nf-core/custom/orfnormalise/tests/main.nf.test
index 7e8265721f48..07aac788f344 100644
--- a/modules/nf-core/custom/orfnormalise/tests/main.nf.test
+++ b/modules/nf-core/custom/orfnormalise/tests/main.nf.test
@@ -16,7 +16,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'sample1'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
                     'ribocode'
                 ])
                 input[1] = channel.of([
@@ -39,6 +39,8 @@ nextflow_process {
             assertAll(
                 { assert process.success },
                 { assert snapshot(process.out).match() },
+                // every ORF must span a whole number of codons (frame guard)
+                { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
                 { assert rows.size() > 0 },
                 { assert rows.every { it[aa].toInteger() > 0 } },
                 { assert rows.every { it[score] && it[score] != '' } },
@@ -57,7 +59,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'sample1'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
                     'ribotish'
                 ])
                 input[1] = channel.of([
@@ -80,6 +82,8 @@ nextflow_process {
             assertAll(
                 { assert process.success },
                 { assert snapshot(process.out).match() },
+                // every ORF must span a whole number of codons (frame guard)
+                { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
                 { assert rows.size() > 0 },
                 { assert rows.every { it[aa].toInteger() > 0 } },
                 { assert rows.every { it[score] && it[score] != '' } },
@@ -95,7 +99,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'sample1'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
                     'ribotricer'
                 ])
                 input[1] = channel.of([
@@ -123,6 +127,8 @@ nextflow_process {
             assertAll(
                 { assert process.success },
                 { assert snapshot(process.out).match() },
+                // every ORF must span a whole number of codons (frame guard)
+                { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
                 { assert rows.size() > 0 },
                 { assert rows.every { it[aa].toInteger() > 0 } },
                 { assert rows.every { it[score] && it[score] != '' } },
@@ -139,7 +145,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'sample1'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
                     'rpbp'
                 ])
                 input[1] = channel.of([
@@ -162,6 +168,8 @@ nextflow_process {
             assertAll(
                 { assert process.success },
                 { assert snapshot(process.out).match() },
+                // every ORF must span a whole number of codons (frame guard)
+                { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
                 { assert rows.size() > 0 },
                 { assert rows.every { it[aa].toInteger() > 0 } },
                 { assert rows.every { it[score] && it[score] != '' && it[score].toDouble() > 0 } },
@@ -179,7 +187,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'cohort'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
                     'price'
                 ])
                 input[1] = channel.of([
@@ -206,6 +214,8 @@ nextflow_process {
             assertAll(
                 { assert process.success },
                 { assert snapshot(process.out).match() },
+                // every ORF must span a whole number of codons (frame guard)
+                { assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
                 { assert rows.size() > 0 },
                 { assert rows.every { it[aa].toInteger() > 0 } },
                 { assert rows.collect { it[cls] }.toSet().any { it != 'other' } },
@@ -222,7 +232,7 @@ nextflow_process {
                 """
                 input[0] = channel.of([
                     [id: 'sample1'],
-                    file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
                     'ribotish'
                 ])
                 input[1] = channel.of([
diff --git a/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap b/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap
index 46734d33c011..e6dd7fce24e3 100644
--- a/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap
+++ b/modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap
@@ -42,11 +42,11 @@
                 ]
             }
         ],
-        "timestamp": "2026-06-12T13:07:41.301245",
         "meta": {
             "nf-test": "0.9.5",
             "nextflow": "25.10.4"
-        }
+        },
+        "timestamp": "2026-06-12T13:07:41.301245"
     },
     "homo_sapiens [chr20] - rpbp": {
         "content": [
@@ -91,11 +91,11 @@
                 ]
             }
         ],
-        "timestamp": "2026-06-12T13:07:53.372563",
         "meta": {
             "nf-test": "0.9.5",
             "nextflow": "25.10.4"
-        }
+        },
+        "timestamp": "2026-06-12T13:07:53.372563"
     },
     "homo_sapiens [chr20] - ribocode": {
         "content": [
@@ -140,11 +140,11 @@
                 ]
             }
         ],
-        "timestamp": "2026-06-12T13:07:17.199267",
         "meta": {
             "nf-test": "0.9.5",
             "nextflow": "25.10.4"
-        }
+        },
+        "timestamp": "2026-06-12T13:07:17.199267"
     },
     "homo_sapiens [chr20] - ribotish": {
         "content": [
@@ -154,7 +154,7 @@
                         {
                             "id": "sample1"
                         },
-                        "sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
+                        "sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
                     ]
                 ],
                 "1": [
@@ -162,7 +162,7 @@
                         {
                             "id": "sample1"
                         },
-                        "sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
+                        "sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
                     ]
                 ],
                 "2": [
@@ -173,7 +173,7 @@
                         {
                             "id": "sample1"
                         },
-                        "sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
+                        "sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
                     ]
                 ],
                 "tsv": [
@@ -181,7 +181,7 @@
                         {
                             "id": "sample1"
                         },
-                        "sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
+                        "sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
                     ]
                 ],
                 "versions": [
@@ -189,11 +189,11 @@
                 ]
             }
         ],
-        "timestamp": "2026-06-12T13:07:28.989952",
         "meta": {
-            "nf-test": "0.9.5",
-            "nextflow": "25.10.4"
-        }
+            "nf-test": "0.9.3",
+            "nextflow": "25.04.8"
+        },
+        "timestamp": "2026-06-26T13:34:16.887667785"
     },
     "homo_sapiens [chr19+chr22] - price": {
         "content": [
@@ -238,10 +238,10 @@
                 ]
             }
         ],
-        "timestamp": "2026-06-12T13:07:59.997149",
         "meta": {
             "nf-test": "0.9.5",
             "nextflow": "25.10.4"
-        }
+        },
+        "timestamp": "2026-06-12T13:07:59.997149"
     }
 }
\ No newline at end of file
diff --git a/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap b/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap
index 60b1de48e11b..be93544dbf99 100644
--- a/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap
+++ b/subworkflows/nf-core/orftable_fasta_gtf_buildorfcatalogue/tests/main.nf.test.snap
@@ -7,7 +7,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "1": [
@@ -15,7 +15,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "2": [
@@ -31,7 +31,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3"
+                        "cohort.catalogue.fasta:md5,70aadcfb2bb15da2dd941536784ed202"
                     ]
                 ],
                 "4": [
@@ -47,7 +47,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "6": [
@@ -55,7 +55,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "7": [
@@ -71,7 +71,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.fasta:md5,362312506423e6861f84108f226a6ad3"
+                        "cohort.catalogue.fasta:md5,70aadcfb2bb15da2dd941536784ed202"
                     ]
                 ],
                 "catalogue_bed12": [
@@ -79,7 +79,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "catalogue_tsv": [
@@ -87,7 +87,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "consensus_bed12": [
@@ -95,7 +95,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "consensus_orf_to_gene_tsv": [
@@ -111,7 +111,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "multiqc": [
@@ -136,7 +136,7 @@
             "nf-test": "0.9.3",
             "nextflow": "25.04.8"
         },
-        "timestamp": "2026-06-25T15:38:17.504637551"
+        "timestamp": "2026-06-26T13:34:49.442717716"
     },
     "homo_sapiens [chr20] - ribotish + ribocode - collapse disabled": {
         "content": [
@@ -146,7 +146,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "1": [
@@ -154,7 +154,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "2": [
@@ -170,7 +170,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.aa.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0"
+                        "cohort.catalogue.aa.fasta:md5,fc2245be6a462ebeb3ab653d7329e685"
                     ]
                 ],
                 "4": [
@@ -186,7 +186,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "6": [
@@ -194,7 +194,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "7": [
@@ -210,7 +210,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.aa.fasta:md5,8cbe5d11cbc9cfe0ef67d875b2632fe0"
+                        "cohort.catalogue.aa.fasta:md5,fc2245be6a462ebeb3ab653d7329e685"
                     ]
                 ],
                 "catalogue_bed12": [
@@ -218,7 +218,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "catalogue_tsv": [
@@ -226,7 +226,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "consensus_bed12": [
@@ -234,7 +234,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.bed12:md5,59b9b0c4ceb890b58b8bcde4fecb2ac7"
+                        "cohort.catalogue.consensus.bed12:md5,00ba4985c25afd02b9aa9cc22ff0874f"
                     ]
                 ],
                 "consensus_orf_to_gene_tsv": [
@@ -250,7 +250,7 @@
                         {
                             "id": "cohort"
                         },
-                        "cohort.catalogue.consensus.tsv:md5,3f078bae5f266aed6d218eda26dc78fc"
+                        "cohort.catalogue.consensus.tsv:md5,3e38e89a3b236f49b3c2f80bef451230"
                     ]
                 ],
                 "multiqc": [
@@ -275,6 +275,6 @@
             "nf-test": "0.9.3",
             "nextflow": "25.04.8"
         },
-        "timestamp": "2026-06-25T20:02:00.203522045"
+        "timestamp": "2026-06-26T13:34:56.775003133"
     }
 }
\ No newline at end of file

From 6d272e8758025825d6a941156cf973ddf72e48b9 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jonathan.manning@seqera.io>
Date: Tue, 30 Jun 2026 15:03:42 +0100
Subject: [PATCH 2/2] refactor(custom/orfnormalise): drop dead ribotish Blocks
 branch

ribotish `predict` reports a single genomic span (GenomePos) and no per-exon
`Blocks` column, so `parse_intervals(row.get("Blocks", ...))` never fired -
exon structure is always recovered from the GenomePos span intersected with
the transcript's exons. Remove the dead branch so ribotish always uses that
verified path. This also removes a latent foot-gun: `parse_intervals` is
1-based (correct for ribotricer), so had ribotish ever emitted a 0-based
`Blocks` column it would have reintroduced the off-by-one this PR fixes.

Output is unchanged (the branch was dead); snapshots match without regen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../orfnormalise/templates/orfnormalise.py      | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
index c9ee8189a4f8..6fb35d7ee1ab 100644
--- a/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
+++ b/modules/nf-core/custom/orfnormalise/templates/orfnormalise.py
@@ -552,15 +552,16 @@ def parse_ribotish(path, transcripts, fields):
                 continue
             chrom, start, end, strand = gp
 
-            blocks = parse_intervals(row.get("Blocks", ""), seps=(":", "-"))
-            if not blocks:
-                tx = transcripts.get(tid)
-                if tx is None:
+            # ribotish predict reports one genomic span (GenomePos), not
+            # per-exon blocks, so the exon structure is recovered by
+            # intersecting that span with the transcript's exons.
+            tx = transcripts.get(tid)
+            if tx is None:
+                blocks = [(start, end)]
+            else:
+                blocks = [(max(start, gs), min(end, ge)) for gs, ge in tx.exons if min(end, ge) > max(start, gs)]
+                if not blocks:
                     blocks = [(start, end)]
-                else:
-                    blocks = [(max(start, gs), min(end, ge)) for gs, ge in tx.exons if min(end, ge) > max(start, gs)]
-                    if not blocks:
-                        blocks = [(start, end)]
 
             rows.append(
                 {