-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathgen_data.py
More file actions
152 lines (125 loc) · 4.52 KB
/
gen_data.py
File metadata and controls
152 lines (125 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Generate the Round 3 dataset: a FASTA-like file with known pattern hits.
Run from anywhere:
uv run rounds/3_dna/gen_data.py # default ~512 MB
uv run rounds/3_dna/gen_data.py --size-mb 100
Output:
rounds/3_dna/data/genome.fasta
rounds/3_dna/data/truth.json
rounds/3_dna/data/fixture_genome.fasta
rounds/3_dna/data/fixture_truth.json
"""
from __future__ import annotations
import argparse
import json
import os
import random
from pathlib import Path
DATA_DIR = Path(__file__).parent / "data"
ALPHABET = b"ACGT"
PATTERN = b"AGTCCGTA" # the pattern attendees will search for
LINE_WIDTH = 80
# Map every byte 0–255 to one of ACGT via the low 2 bits. Pre-computed once so
# random sequence generation can run entirely in C: `rng.randbytes(n)` produces
# raw bytes, then `.translate(_BYTE_TO_ACGT)` maps them to the DNA alphabet.
_BYTE_TO_ACGT = bytes(b"ACGT"[b & 3] for b in range(256))
def _random_seq(rng: random.Random, length: int) -> bytearray:
return bytearray(rng.randbytes(length).translate(_BYTE_TO_ACGT))
def _inject_pattern(seq: bytearray, rng: random.Random, pattern: bytes, count: int) -> list[int]:
"""Overwrite ``count`` slots in ``seq`` with ``pattern`` at random offsets."""
placed: list[int] = []
attempts = 0
while len(placed) < count and attempts < count * 10:
attempts += 1
offset = rng.randrange(0, len(seq) - len(pattern))
# avoid stacking patterns on top of each other for stable ground truth
if any(abs(offset - p) < len(pattern) for p in placed):
continue
seq[offset : offset + len(pattern)] = pattern
placed.append(offset)
placed.sort()
return placed
def _write_fasta(
out_path: Path,
truth_path: Path,
target_size: int,
seed: int,
pattern: bytes,
inject_every_n: int,
) -> None:
rng = random.Random(seed)
truth: list[tuple[str, list[int]]] = []
written = 0
record_idx = 0
with out_path.open("wb") as f:
while written < target_size:
record_idx += 1
record_id = f"seq_{record_idx:06d}"
seq_len = rng.randint(5_000, 100_000)
seq = _random_seq(rng, seq_len)
positions: list[int] = []
if record_idx % inject_every_n == 0:
positions = _inject_pattern(
seq, rng, pattern, count=rng.randint(1, 4)
)
# Also scan for accidental occurrences of the pattern.
seq_bytes = bytes(seq)
scan = 0
while True:
idx = seq_bytes.find(pattern, scan)
if idx == -1:
break
if idx not in positions:
positions.append(idx)
scan = idx + 1
positions.sort()
if positions:
truth.append((record_id, positions))
header = f">{record_id}\n".encode("ascii")
f.write(header)
written += len(header)
for offset in range(0, len(seq), LINE_WIDTH):
f.write(seq_bytes[offset : offset + LINE_WIDTH])
f.write(b"\n")
written += min(LINE_WIDTH, len(seq) - offset) + 1
truth_path.write_text(
json.dumps(
{
"pattern": pattern.decode("ascii"),
"matches": [[rid, list(p)] for rid, p in truth],
},
indent=2,
)
)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--size-mb", type=int, default=512)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
DATA_DIR.mkdir(parents=True, exist_ok=True)
fixture_fasta = DATA_DIR / "fixture_genome.fasta"
fixture_truth = DATA_DIR / "fixture_truth.json"
print(f"writing fixture: {fixture_fasta} (~512 KB)")
_write_fasta(
fixture_fasta,
fixture_truth,
target_size=512 * 1024,
seed=args.seed + 100,
pattern=PATTERN,
inject_every_n=3,
)
full_fasta = DATA_DIR / "genome.fasta"
full_truth = DATA_DIR / "truth.json"
full_size = args.size_mb * 1024 * 1024
print(f"writing genome: {full_fasta} (~{args.size_mb} MB)")
_write_fasta(
full_fasta,
full_truth,
target_size=full_size,
seed=args.seed,
pattern=PATTERN,
inject_every_n=5,
)
total = os.path.getsize(full_fasta) + os.path.getsize(fixture_fasta)
print(f"done. total on disk: {total:,} bytes")
if __name__ == "__main__":
main()