Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
f35e2f5
feat(custom/bed12codonpositions): add BED12 to BED6 codon-position ex…
pinin4fjords May 21, 2026
b1f80f3
fix(custom/bed12codonpositions): stabilise versions.yml across runtimes
pinin4fjords May 21, 2026
ee1d68c
fix(custom/bed12codonpositions): align stub versions.yml with script
pinin4fjords May 21, 2026
eba4ff4
fix(custom/bed12codonpositions): pin exact runtime via Wave container
pinin4fjords May 21, 2026
bea7278
feat(custom/bed12codonpositions): preserve score, mRNA-order output, …
pinin4fjords May 21, 2026
4d6d755
Merge branch 'master' into custom-bed12codonpositions
pinin4fjords May 21, 2026
deadb0c
docs(custom/bed12codonpositions): tighten module description and scri…
pinin4fjords May 21, 2026
ea505b4
Merge branch 'custom-bed12codonpositions' of github.com:pinin4fjords/…
pinin4fjords May 21, 2026
73cfad1
Merge branch 'master' into custom-bed12codonpositions
pinin4fjords May 21, 2026
70d4e98
Use single nextflow.config with module_args for bed12codonpositions t…
pinin4fjords Jun 12, 2026
53b8b51
Merge branch 'master' into custom-bed12codonpositions
pinin4fjords Jun 12, 2026
8e3ed72
Address review feedback
pinin4fjords Jun 14, 2026
b279374
Merge remote-tracking branch 'pinin4fjords/custom-bed12codonpositions…
pinin4fjords Jun 14, 2026
51cf1ef
Add regenerated snapshot for bed12codonpositions tests [skip ci]
pinin4fjords Jun 14, 2026
2685fc4
ci: trigger CI for snapshot update
pinin4fjords Jun 14, 2026
8fdf9e7
Merge branch 'master' into custom-bed12codonpositions
pinin4fjords Jun 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions modules/nf-core/custom/bed12codonpositions/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the custom/bed12codonpositions module.
# Every dependency is pinned to an exact version.
channels:
  - conda-forge
  - bioconda
dependencies:
  - conda-forge::pandas=2.3.0
  - conda-forge::python=3.12.11
  - conda-forge::pyyaml=6.0.2
49 changes: 49 additions & 0 deletions modules/nf-core/custom/bed12codonpositions/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Runs the bed12codonpositions.py template on one BED12 file and emits
// "${prefix}.bed" together with a versions.yml describing the runtime.
process CUSTOM_BED12CODONPOSITIONS {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    // Singularity/Apptainer use the HTTPS image blob unless
    // ext.singularity_pull_docker_container is set; otherwise the Wave
    // container reference is used.
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/0f/0f1019bd22c111267bcb670fdb128829776f0ca6adfa7b0e2d126f91577d08e3/data' :
        'community.wave.seqera.io/library/python_pandas_pyyaml:75514f9f977be607' }"

    input:
    tuple val(meta), path(bed12)

    output:
    tuple val(meta), path("${prefix}.bed"), emit: bed
    path "versions.yml"                   , emit: versions, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Assigned without `def`: the output declaration above references
    // `prefix`, and the template references both `prefix` and `args`.
    prefix = task.ext.prefix ?: "${meta.id}"
    args   = task.ext.args ?: ''
    template 'bed12codonpositions.py'

    stub:
    // Creates an empty output file and writes the same python/pandas
    // version keys that the real template writes.
    prefix = task.ext.prefix ?: "${meta.id}"
    """
    touch ${prefix}.bed

    python - <<END
    import platform
    import pandas
    import yaml

    with open("versions.yml", "w") as fh:
        yaml.safe_dump(
            {
                "${task.process}": {
                    "python": platform.python_version(),
                    "pandas": pandas.__version__,
                }
            },
            fh,
            default_flow_style=False,
            sort_keys=False,
        )
    END
    """
}
72 changes: 72 additions & 0 deletions modules/nf-core/custom/bed12codonpositions/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "custom_bed12codonpositions"
description: |
  Expand a BED12 into a BED6 of in-frame mRNA positions, projected back
  to genomic coordinates. Default behaviour emits one row per codon (the
  5' nucleotide); --step / --width / --frame control the stride, span
  and offset on the spliced mRNA. Useful for codon-level work on
  spliced features (e.g. ribo-seq P-site counts per codon, frame /
  periodicity QC, novel-ORF tiling).
keywords:
  - bed12
  - bed6
  - codon
  - splicing
  - coordinates
tools:
  - "bed12codonpositions":
      description: |
        Python helper that expands a BED12 into per-codon BED6
        positions along the spliced feature, with configurable frame,
        step and span width via `ext.args`.
      tool_dev_url: "https://github.com/nf-core/modules/blob/master/modules/nf-core/custom/bed12codonpositions/main.nf"
      licence: ["MIT"]
      identifier: ""
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample/feature-set information
          e.g. `[ id:'catalogue' ]`
    - bed12:
        type: file
        description: |
          BED12 file with one record per multi-block feature. blockStarts
          are offsets from column 2 (start); blockSizes are in nt; blocks
          must be in ascending genomic-coordinate order.
        pattern: "*.{bed,bed12}"
        ontologies:
          - edam: http://edamontology.org/format_3586 # bed12
output:
  bed:
    - - meta:
          type: map
          description: |
            Groovy Map matching the input meta.
      - ${prefix}.bed:
          type: file
          description: |
            BED6 file with one row per in-frame mRNA position projected
            back to genomic coordinates. Columns are chrom, start, end,
            name (from BED12 column 4), score (preserved from BED12
            column 5), strand. Sorted in mRNA-traversal order, which
            means descending genomic order on '-' strand records.
          pattern: "*.bed"
          ontologies:
            - edam: http://edamontology.org/format_3003 # BED
  versions:
    - versions.yml:
        type: file
        description: File containing software versions
        pattern: "versions.yml"
        ontologies:
          - edam: http://edamontology.org/format_3750 # YAML
topics:
  versions:
    # Same versions.yml file as output.versions, published on the
    # `versions` topic, so it is described as a file here too.
    - versions.yml:
        type: file
        description: File containing software versions
authors:
  - "@pinin4fjords"
maintainers:
  - "@pinin4fjords"
Comment thread
pinin4fjords marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env python3

# Written by Jonathan Manning and released under the MIT license.

"""Expand a BED12 into a BED6 of in-frame mRNA positions.

Walks each record's blocks in mRNA order (5'→3'), emits every --step-th
mRNA position starting at --frame, and projects them back to genomic
coordinates. Rows are written in mRNA-traversal order, so '-' strand
records come out in descending genomic order.
"""

import argparse
import platform
import sys

import pandas as pd
import yaml

# Names assigned to the twelve BED12 columns, in file order; passed to
# pandas.read_csv as `names` because the input is read with header=None.
BED12_COLUMNS = [
"chrom",
"start",
"end",
"name",
"score",
"strand",
"thickStart",
"thickEnd",
"itemRgb",
"blockCount",
"blockSizes",
"blockStarts",
]


def parse_block_field(value):
    """Turn a comma-separated BED12 block column into a list of ints.

    ``value`` is stringified first, so a bare integer is accepted as well
    as text such as ``"10,20,30,"``. Empty tokens (for example the one
    produced by a trailing comma) are ignored.
    """
    numbers = []
    for token in str(value).rstrip(",").split(","):
        if token == "":
            continue
        numbers.append(int(token))
    return numbers


def mrna_to_genomic_runs(blocks, strand, mrna_start, mrna_end):
    """Map the half-open mRNA interval [mrna_start, mrna_end) to the genome.

    ``blocks`` is a sequence of (genomic_start, genomic_end) pairs. They are
    walked as given for strand "+" and in reverse for strand "-"; any other
    strand yields an empty list. The result holds one BED-style
    (g_start, g_end) pair per block the interval overlaps, in traversal
    order.
    """
    if strand not in ("+", "-"):
        return []

    forward = strand == "+"
    walk = list(blocks) if forward else list(reversed(blocks))

    projected = []
    consumed = 0  # mRNA offset at which the current block begins
    for g_left, g_right in walk:
        first = consumed
        last = first + (g_right - g_left)
        consumed = last

        # Clip the requested span to this block's stretch of the mRNA.
        clip_lo = max(mrna_start, first)
        clip_hi = min(mrna_end, last)
        if clip_lo < clip_hi:
            near = clip_lo - first
            far = clip_hi - first
            if forward:
                run = (g_left + near, g_left + far)
            else:
                # On "-" the mRNA runs from the block's genomic end downwards.
                run = (g_right - far, g_right - near)
            projected.append(run)

    return projected


def emit_rows(row, frame, step, width, keep_duplicates):
    """Expand one BED12 record into BED6 rows.

    Parameters:
        row: mapping-like record indexed by the BED12 column names
            (chrom, start, name, score, strand, blockCount, blockSizes,
            blockStarts are read).
        frame: mRNA offset of the first emitted position.
        step: stride between emitted positions on the mRNA.
        width: length in nt of each emitted mRNA span.
        keep_duplicates: when false, a (chrom, start, end, name, strand)
            combination is emitted at most once per record.

    Returns:
        A list of (chrom, g_start, g_end, name, score, strand) tuples in
        mRNA-traversal order. The list is empty, and a warning is written
        to stderr, when the block fields disagree with blockCount or when
        the strand is neither "+" nor "-".
    """
    block_sizes = parse_block_field(row["blockSizes"])
    block_starts = parse_block_field(row["blockStarts"])
    block_count = int(row["blockCount"])
    if len(block_sizes) != block_count or len(block_starts) != block_count:
        sys.stderr.write(
            f"warning: skipping {row['name']!r}: blockCount={row['blockCount']} but "
            f"blockSizes has {len(block_sizes)} entries and blockStarts has {len(block_starts)}\\n"
        )
        return []

    strand = row["strand"]
    if strand not in ("+", "-"):
        # mrna_to_genomic_runs yields nothing for such records; say so once
        # instead of silently dropping the record after walking every position.
        sys.stderr.write(f"warning: skipping {row['name']!r}: unsupported strand {strand!r}\\n")
        return []

    # Absolute genomic (start, end) per block, in ascending genomic order.
    blocks = sorted((row["start"] + off, row["start"] + off + sz) for sz, off in zip(block_sizes, block_starts))
    total_len = sum(be - bs for bs, be in blocks)
    chrom = row["chrom"]
    name = row["name"]
    score = row["score"]

    rows = []
    seen = set()
    for mrna_pos in range(frame, total_len, step):
        # Spans that would run past the 3' end of the mRNA are not emitted.
        if mrna_pos + width > total_len:
            break
        for g_start, g_end in mrna_to_genomic_runs(blocks, strand, mrna_pos, mrna_pos + width):
            key = (chrom, g_start, g_end, name, strand)
            if not keep_duplicates and key in seen:
                continue
            seen.add(key)
            rows.append((chrom, g_start, g_end, name, score, strand))
    return rows


# Command-line style options; the option string itself is supplied through
# the "${args}" template placeholder rather than sys.argv.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
    "--frame",
    type=int,
    default=0,
    help="mRNA offset of the first position to emit (default: 0).",
)
parser.add_argument(
    "--step",
    type=int,
    default=3,
    help="Stride between successive emitted positions on the mRNA (default: 3).",
)
parser.add_argument(
    "--width",
    type=int,
    default=1,
    help="Width in nucleotides of each emitted span on the mRNA (default: 1). "
    "Spans that cross a block boundary are split into one BED row per block.",
)
parser.add_argument(
    "--keep-duplicates",
    action="store_true",
    help="Keep duplicate (chrom, start, end, name, strand) rows arising from "
    "the same record (e.g. when --width >= --step).",
)
parsed_args = parser.parse_args("${args}".split() if "${args}".strip() else [])

# Range checks, evaluated in this order; the first violated one aborts the run.
for violated, message in (
    (parsed_args.step <= 0, "--step must be positive"),
    (parsed_args.width <= 0, "--width must be positive"),
    (parsed_args.frame < 0, "--frame must be non-negative"),
):
    if violated:
        raise SystemExit(message)

# Read the BED12 table. Lines starting with "#" are skipped by pandas.
# `score` is read as text so that it is copied to the output verbatim.
bed = pd.read_csv(
    "${bed12}",
    sep="\\t",
    comment="#",
    header=None,
    names=BED12_COLUMNS,
    dtype={"chrom": str, "name": str, "score": str, "strand": str},
)
# Drop UCSC "track" / "browser" header lines, which land in the chrom column.
bed = bed[~bed["chrom"].astype(str).str.startswith(("track", "browser"))]
# A header line has no tab-separated coordinate fields, so its row is padded
# with NaN and pandas infers float64 for the numeric columns. Cast them back
# once the header rows are gone, otherwise coordinates are written as e.g.
# "100.0" instead of "100".
bed = bed.astype({"start": "int64", "end": "int64", "blockCount": "int64"})

# Expand every record; rows stay in input order, then mRNA-traversal order.
out_rows = []
for _, rec in bed.iterrows():
    out_rows.extend(
        emit_rows(
            rec,
            parsed_args.frame,
            parsed_args.step,
            parsed_args.width,
            parsed_args.keep_duplicates,
        )
    )

out = pd.DataFrame(out_rows, columns=["chrom", "start", "end", "name", "score", "strand"])
out.to_csv("${prefix}.bed", sep="\\t", header=False, index=False)

# Record the interpreter and pandas versions under the process name.
with open("versions.yml", "w") as fh:
    yaml.safe_dump(
        {
            "${task.process}": {
                "python": platform.python_version(),
                "pandas": pd.__version__,
            }
        },
        fh,
        default_flow_style=False,
        sort_keys=False,
    )
Loading