Skip to content

Commit 8aa7c1f

Browse files
pinin4fjords and claude committed
fix(custom/orfnormalise): treat ribotish GenomePos start as 0-based
ribotish `predict` emits 0-based half-open `GenomePos` coordinates, but the parser subtracted 1 from the start, treating it as 1-based. That shifted every ribotish-derived ORF 5' by one base: a reading-frame shift on + strand records (premature in-frame stops, and frame-2 codon positions in any downstream codon-aware step) and a 3' overhang on - strand records. Verified against the genome: the corrected start lands on the ATG and translates to a clean full-length ORF.

Also:
- migrate the test fixtures off the deleted `pinin4fjords/test-datasets` `add-orf-prediction-fixtures` branch to `modules_testdata_base_path` (nf-core/test-datasets), which the test now depends on to run at all.
- add a codon-boundary regression guard asserting every emitted ORF spans a whole number of codons, which catches this class of coordinate bug for all callers.
- regenerate the orfnormalise and orftable_fasta_gtf_buildorfcatalogue snapshots (only the ribotish-derived records change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 3c81fd4 commit 8aa7c1f

4 files changed

Lines changed: 56 additions & 45 deletions

File tree

modules/nf-core/custom/orfnormalise/templates/orfnormalise.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -517,7 +517,8 @@ def _parse_ribotish_genpos(s):
517517
m = _RIBOTISH_GENPOS_RE.match(s.strip())
518518
if not m:
519519
return None
520-
return m.group(1), int(m.group(2)) - 1, int(m.group(3)), m.group(4)
520+
# GenomePos is 0-based half-open (BED-style).
521+
return m.group(1), int(m.group(2)), int(m.group(3)), m.group(4)
521522

522523

523524
def parse_ribotish(path, transcripts, fields):

modules/nf-core/custom/orfnormalise/tests/main.nf.test

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ nextflow_process {
1616
"""
1717
input[0] = channel.of([
1818
[id: 'sample1'],
19-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
19+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribocode.txt', checkIfExists: true),
2020
'ribocode'
2121
])
2222
input[1] = channel.of([
@@ -39,6 +39,8 @@ nextflow_process {
3939
assertAll(
4040
{ assert process.success },
4141
{ assert snapshot(process.out).match() },
42+
// every ORF must span a whole number of codons (frame guard)
43+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
4244
{ assert rows.size() > 0 },
4345
{ assert rows.every { it[aa].toInteger() > 0 } },
4446
{ assert rows.every { it[score] && it[score] != '' } },
@@ -57,7 +59,7 @@ nextflow_process {
5759
"""
5860
input[0] = channel.of([
5961
[id: 'sample1'],
60-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
62+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
6163
'ribotish'
6264
])
6365
input[1] = channel.of([
@@ -80,6 +82,8 @@ nextflow_process {
8082
assertAll(
8183
{ assert process.success },
8284
{ assert snapshot(process.out).match() },
85+
// every ORF must span a whole number of codons (frame guard)
86+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
8387
{ assert rows.size() > 0 },
8488
{ assert rows.every { it[aa].toInteger() > 0 } },
8589
{ assert rows.every { it[score] && it[score] != '' } },
@@ -95,7 +99,7 @@ nextflow_process {
9599
"""
96100
input[0] = channel.of([
97101
[id: 'sample1'],
98-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
102+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotricer.tsv', checkIfExists: true),
99103
'ribotricer'
100104
])
101105
input[1] = channel.of([
@@ -123,6 +127,8 @@ nextflow_process {
123127
assertAll(
124128
{ assert process.success },
125129
{ assert snapshot(process.out).match() },
130+
// every ORF must span a whole number of codons (frame guard)
131+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
126132
{ assert rows.size() > 0 },
127133
{ assert rows.every { it[aa].toInteger() > 0 } },
128134
{ assert rows.every { it[score] && it[score] != '' } },
@@ -139,7 +145,7 @@ nextflow_process {
139145
"""
140146
input[0] = channel.of([
141147
[id: 'sample1'],
142-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
148+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.rpbp.predicted-orfs.bed.gz', checkIfExists: true),
143149
'rpbp'
144150
])
145151
input[1] = channel.of([
@@ -162,6 +168,8 @@ nextflow_process {
162168
assertAll(
163169
{ assert process.success },
164170
{ assert snapshot(process.out).match() },
171+
// every ORF must span a whole number of codons (frame guard)
172+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
165173
{ assert rows.size() > 0 },
166174
{ assert rows.every { it[aa].toInteger() > 0 } },
167175
{ assert rows.every { it[score] && it[score] != '' && it[score].toDouble() > 0 } },
@@ -179,7 +187,7 @@ nextflow_process {
179187
"""
180188
input[0] = channel.of([
181189
[id: 'cohort'],
182-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
190+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/cohort.price.orfs.tsv', checkIfExists: true),
183191
'price'
184192
])
185193
input[1] = channel.of([
@@ -206,6 +214,8 @@ nextflow_process {
206214
assertAll(
207215
{ assert process.success },
208216
{ assert snapshot(process.out).match() },
217+
// every ORF must span a whole number of codons (frame guard)
218+
{ assert path(process.out.bed12[0][1]).text.readLines().findAll { it.trim() }.every { it.split('\t')[10].split(',').findAll { s -> s }.collect { s -> s.toInteger() }.sum() % 3 == 0 } },
209219
{ assert rows.size() > 0 },
210220
{ assert rows.every { it[aa].toInteger() > 0 } },
211221
{ assert rows.collect { it[cls] }.toSet().any { it != 'other' } },
@@ -222,7 +232,7 @@ nextflow_process {
222232
"""
223233
input[0] = channel.of([
224234
[id: 'sample1'],
225-
file('https://raw.githubusercontent.com/pinin4fjords/test-datasets/add-orf-prediction-fixtures/data/genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
235+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/riboseq_expression/orf_predictions/sample1.ribotish.pred.txt', checkIfExists: true),
226236
'ribotish'
227237
])
228238
input[1] = channel.of([

modules/nf-core/custom/orfnormalise/tests/main.nf.test.snap

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -42,11 +42,11 @@
4242
]
4343
}
4444
],
45-
"timestamp": "2026-06-12T13:07:41.301245",
4645
"meta": {
4746
"nf-test": "0.9.5",
4847
"nextflow": "25.10.4"
49-
}
48+
},
49+
"timestamp": "2026-06-12T13:07:41.301245"
5050
},
5151
"homo_sapiens [chr20] - rpbp": {
5252
"content": [
@@ -91,11 +91,11 @@
9191
]
9292
}
9393
],
94-
"timestamp": "2026-06-12T13:07:53.372563",
9594
"meta": {
9695
"nf-test": "0.9.5",
9796
"nextflow": "25.10.4"
98-
}
97+
},
98+
"timestamp": "2026-06-12T13:07:53.372563"
9999
},
100100
"homo_sapiens [chr20] - ribocode": {
101101
"content": [
@@ -140,11 +140,11 @@
140140
]
141141
}
142142
],
143-
"timestamp": "2026-06-12T13:07:17.199267",
144143
"meta": {
145144
"nf-test": "0.9.5",
146145
"nextflow": "25.10.4"
147-
}
146+
},
147+
"timestamp": "2026-06-12T13:07:17.199267"
148148
},
149149
"homo_sapiens [chr20] - ribotish": {
150150
"content": [
@@ -154,15 +154,15 @@
154154
{
155155
"id": "sample1"
156156
},
157-
"sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
157+
"sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
158158
]
159159
],
160160
"1": [
161161
[
162162
{
163163
"id": "sample1"
164164
},
165-
"sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
165+
"sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
166166
]
167167
],
168168
"2": [
@@ -173,27 +173,27 @@
173173
{
174174
"id": "sample1"
175175
},
176-
"sample1.normalised.bed12:md5,83e35410ce60f5289a4956016ea81534"
176+
"sample1.normalised.bed12:md5,1b3cb2249df360840aa062001653a85e"
177177
]
178178
],
179179
"tsv": [
180180
[
181181
{
182182
"id": "sample1"
183183
},
184-
"sample1.normalised.tsv:md5,b0ec626bdb9a80f7ae8450067fe059c8"
184+
"sample1.normalised.tsv:md5,5b52abd1e50bab3722b50e4bbf379902"
185185
]
186186
],
187187
"versions": [
188188
"versions.yml:md5,e7fc396424c69f0969daec4a45c3de70"
189189
]
190190
}
191191
],
192-
"timestamp": "2026-06-12T13:07:28.989952",
193192
"meta": {
194-
"nf-test": "0.9.5",
195-
"nextflow": "25.10.4"
196-
}
193+
"nf-test": "0.9.3",
194+
"nextflow": "25.04.8"
195+
},
196+
"timestamp": "2026-06-26T13:34:16.887667785"
197197
},
198198
"homo_sapiens [chr19+chr22] - price": {
199199
"content": [
@@ -238,10 +238,10 @@
238238
]
239239
}
240240
],
241-
"timestamp": "2026-06-12T13:07:59.997149",
242241
"meta": {
243242
"nf-test": "0.9.5",
244243
"nextflow": "25.10.4"
245-
}
244+
},
245+
"timestamp": "2026-06-12T13:07:59.997149"
246246
}
247247
}

0 commit comments

Comments
 (0)