Skip to content

Commit 5243c7b

Browse files
pinin4fjords and claude committed
fix(custom/orfnormalise): treat ribotish GenomePos start as 0-based
ribotish `predict` emits 0-based half-open `GenomePos` coordinates, but the parser subtracted 1 from the start, treating it as 1-based. That shifted every ribotish-derived ORF 5' by one base: a reading-frame shift on + strand records (premature in-frame stops, and frame-2 codon positions in any downstream codon-aware step) and a 3' overhang on - strand records. Verified against the genome: the corrected start lands on the ATG and translates to a clean full-length ORF.

Also:
- migrate the test fixtures off the deleted `add-orf-prediction-fixtures` branch of `pinin4fjords/test-datasets` to `modules_testdata_base_path` (nf-core/test-datasets); the tests cannot run at all without this.
- add a codon-boundary regression guard asserting that every emitted ORF spans a whole number of codons, which catches this class of coordinate bug for all callers.
- regenerate the orfnormalise and orftable_fasta_gtf_buildorfcatalogue snapshots (only the ribotish-derived records change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 3c81fd4 commit 5243c7b

4 files changed

Lines changed: 68 additions & 45 deletions

File tree

modules/nf-core/custom/orfnormalise/templates/orfnormalise.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,10 @@ def _parse_ribotish_genpos(s):
517517
m = _RIBOTISH_GENPOS_RE.match(s.strip())
518518
if not m:
519519
return None
520-
return m.group(1), int(m.group(2)) - 1, int(m.group(3)), m.group(4)
520+
# ribotish GenomePos is 0-based half-open (BED-style), so the start is used
521+
# as-is. Subtracting 1 would shift the ORF 5' by one base, breaking the
522+
# reading frame on + strand records and adding a 3' overhang on - strand.
523+
return m.group(1), int(m.group(2)), int(m.group(3)), m.group(4)
521524

522525

523526
def parse_ribotish(path, transcripts, fields):

modules/nf-core/custom/orfnormalise/tests/main.nf.test

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ nextflow_process {
1616
"""
1717
input[0] = channel.of([
1818
[id: 'sample1'],
19-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
19+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
2020
'ribocode'
2121
])
2222
input[1] = channel.of([
@@ -39,6 +39,10 @@ nextflow_process {
3939
assertAll(
4040
{ assert process.success },
4141
{ assert snapshot(process.out).match() },
42+
// Every emitted ORF must span a whole number of codons; a
43+
// blockSizes sum not divisible by 3 signals a coordinate /
44+
// reading-frame error in the caller's coordinate parser.
45+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
4246
{ assert rows.size() > 0 },
4347
{ assert rows.every { it[aa].toInteger() > 0 } },
4448
{ assert rows.every { it[score] && it[score] != '' } },
@@ -57,7 +61,7 @@ nextflow_process {
5761
"""
5862
input[0] = channel.of([
5963
[id: 'sample1'],
60-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
64+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
6165
'ribotish'
6266
])
6367
input[1] = channel.of([
@@ -80,6 +84,10 @@ nextflow_process {
8084
assertAll(
8185
{ assert process.success },
8286
{ assert snapshot(process.out).match() },
87+
// Every emitted ORF must span a whole number of codons; a
88+
// blockSizes sum not divisible by 3 signals a coordinate /
89+
// reading-frame error in the caller's coordinate parser.
90+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
8391
{ assert rows.size() > 0 },
8492
{ assert rows.every { it[aa].toInteger() > 0 } },
8593
{ assert rows.every { it[score] && it[score] != '' } },
@@ -95,7 +103,7 @@ nextflow_process {
95103
"""
96104
input[0] = channel.of([
97105
[id: 'sample1'],
98-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
106+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
99107
'ribotricer'
100108
])
101109
input[1] = channel.of([
@@ -123,6 +131,10 @@ nextflow_process {
123131
assertAll(
124132
{ assert process.success },
125133
{ assert snapshot(process.out).match() },
134+
// Every emitted ORF must span a whole number of codons; a
135+
// blockSizes sum not divisible by 3 signals a coordinate /
136+
// reading-frame error in the caller's coordinate parser.
137+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
126138
{ assert rows.size() > 0 },
127139
{ assert rows.every { it[aa].toInteger() > 0 } },
128140
{ assert rows.every { it[score] && it[score] != '' } },
@@ -139,7 +151,7 @@ nextflow_process {
139151
"""
140152
input[0] = channel.of([
141153
[id: 'sample1'],
142-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
154+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
143155
'rpbp'
144156
])
145157
input[1] = channel.of([
@@ -162,6 +174,10 @@ nextflow_process {
162174
assertAll(
163175
{ assert process.success },
164176
{ assert snapshot(process.out).match() },
177+
// Every emitted ORF must span a whole number of codons; a
178+
// blockSizes sum not divisible by 3 signals a coordinate /
179+
// reading-frame error in the caller's coordinate parser.
180+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
165181
{ assert rows.size() > 0 },
166182
{ assert rows.every { it[aa].toInteger() > 0 } },
167183
{ assert rows.every { it[score] && it[score] != '' && it[score].toDouble() > 0 } },
@@ -179,7 +195,7 @@ nextflow_process {
179195
"""
180196
input[0] = channel.of([
181197
[id: 'cohort'],
182-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
198+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
183199
'price'
184200
])
185201
input[1] = channel.of([
@@ -206,6 +222,10 @@ nextflow_process {
206222
assertAll(
207223
{ assert process.success },
208224
{ assert snapshot(process.out).match() },
225+
// Every emitted ORF must span a whole number of codons; a
226+
// blockSizes sum not divisible by 3 signals a coordinate /
227+
// reading-frame error in the caller's coordinate parser.
228+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
209229
{ assert rows.size() > 0 },
210230
{ assert rows.every { it[aa].toInteger() > 0 } },
211231
{ assert rows.collect { it[cls] }.toSet().any { it != 'other' } },
@@ -222,7 +242,7 @@ nextflow_process {
222242
"""
223243
input[0] = channel.of([
224244
[id: 'sample1'],
225-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
245+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
226246
'ribotish'
227247
])
228248
input[1] = channel.of([

modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@
4242
]
4343
}
4444
],
45-
"timestamp": "2026-06-12T13:07:41.301245",
4645
"meta": {
4746
"nf-test": "0.9.5",
4847
"nextflow": "25.10.4"
49-
}
48+
},
49+
"timestamp": "2026-06-12T13:07:41.301245"
5050
},
5151
"homo_sapiens [chr20] - rpbp": {
5252
"content": [
@@ -91,11 +91,11 @@
9191
]
9292
}
9393
],
94-
"timestamp": "2026-06-12T13:07:53.372563",
9594
"meta": {
9695
"nf-test": "0.9.5",
9796
"nextflow": "25.10.4"
98-
}
97+
},
98+
"timestamp": "2026-06-12T13:07:53.372563"
9999
},
100100
"homo_sapiens [chr20] - ribocode": {
101101
"content": [
@@ -140,11 +140,11 @@
140140
]
141141
}
142142
],
143-
"timestamp": "2026-06-12T13:07:17.199267",
144143
"meta": {
145144
"nf-test": "0.9.5",
146145
"nextflow": "25.10.4"
147-
}
146+
},
147+
"timestamp": "2026-06-12T13:07:17.199267"
148148
},
149149
"homo_sapiens [chr20] - ribotish": {
150150
"content": [
@@ -154,15 +154,15 @@
154154
{
155155
"id": "sample1"
156156
},
157-
"sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
157+
"sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
158158
]
159159
],
160160
"1": [
161161
[
162162
{
163163
"id": "sample1"
164164
},
165-
"sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
165+
"sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
166166
]
167167
],
168168
"2": [
@@ -173,27 +173,27 @@
173173
{
174174
"id": "sample1"
175175
},
176-
"sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
176+
"sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
177177
]
178178
],
179179
"tsv": [
180180
[
181181
{
182182
"id": "sample1"
183183
},
184-
"sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
184+
"sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
185185
]
186186
],
187187
"versions": [
188188
"versions.yml:md5,e7fc396424c69f0969daec4a45c3de70"
189189
]
190190
}
191191
],
192-
"timestamp": "2026-06-12T13:07:28.989952",
193192
"meta": {
194-
"nf-test": "0.9.5",
195-
"nextflow": "25.10.4"
196-
}
193+
"nf-test": "0.9.3",
194+
"nextflow": "25.04.8"
195+
},
196+
"timestamp": "2026-06-26T13:34:16.887667785"
197197
},
198198
"homo_sapiens [chr19+chr22] - price": {
199199
"content": [
@@ -238,10 +238,10 @@
238238
]
239239
}
240240
],
241-
"timestamp": "2026-06-12T13:07:59.997149",
242241
"meta": {
243242
"nf-test": "0.9.5",
244243
"nextflow": "25.10.4"
245-
}
244+
},
245+
"timestamp": "2026-06-12T13:07:59.997149"
246246
}
247247
}

0 commit comments

Comments
 (0)