Skip to content

Commit ead3e07

Browse files
Merge pull request #239 from ncsa/markov-integration
Markov integration
2 parents 60ea4fd + 0b69f8d commit ead3e07

13 files changed

Lines changed: 835 additions & 434 deletions

File tree

config_template/template_neat_config.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,7 @@ rng_seed: .
133133
# type: int | required = no
134134
min_mutations: .
135135

136-
# Overwrite the output files, if they are named the same as the current run.
137-
# Default is to quit if files already exist to avoid data destruction
136+
# Overwrite output files if they already exist
138137
# type: bool | required = no | default = false
139138
overwrite_output: .
140139

@@ -153,12 +152,12 @@ parallel_block_size: .
153152
threads: .
154153

155154
# Delete the 'splits' directory after stitching completes
156-
# Note if threads == 1, this option has no effect.
155+
# Note: If threads == 1, this option has no effect.
157156
# type = bool | required: no | default = true
158157
cleanup_splits: .
159158

160159
# Reuse existing files in '<out_dir>/splits' and skip the split step.
161160
# The directory must contain NEAT-generated files and must be in the output directory within "splits"
162-
# Note if threads == 1, this option has no effect.
161+
# Note: If threads == 1, this option has no effect.
163162
# type = bool | required: no | default = False
164-
reuse_splits: .
163+
reuse_splits: .

neat/cli/commands/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
"""Modules related to the subcommands' interfaces"""
1+
"""Modules related to the subcommands"""
2+
23
from .base import *
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import argparse
2+
3+
from ...model_quality_score import model_qual_score_runner
4+
from .base import BaseCommand
5+
from .options import output_group
6+
7+
8+
class Command(BaseCommand):
9+
"""
10+
Generate a quality score model (traditional or Markov) from FASTQ.
11+
"""
12+
13+
name = "model-qual-score"
14+
description = "Generate quality score model from FASTQ (optional Markov chain)."
15+
16+
def add_arguments(self, parser: argparse.ArgumentParser):
17+
18+
parser.add_argument(
19+
"-i",
20+
dest="input_files",
21+
metavar="FILE",
22+
nargs="+",
23+
required=True,
24+
help="Input FASTQ file(s) (gzipped or plain).",
25+
)
26+
27+
parser.add_argument(
28+
"-q",
29+
dest="quality_offset",
30+
type=int,
31+
default=33,
32+
help="Quality score offset [33].",
33+
)
34+
35+
parser.add_argument(
36+
"-Q",
37+
dest="quality_scores",
38+
type=int,
39+
nargs="+",
40+
default=[42],
41+
help="Max quality or explicit list of quality scores [42].",
42+
)
43+
44+
parser.add_argument(
45+
"-m",
46+
dest="max_num",
47+
type=int,
48+
default=-1,
49+
help="Max number of reads to process [-1 = all].",
50+
)
51+
52+
parser.add_argument(
53+
"--markov",
54+
dest="use_markov",
55+
action="store_true",
56+
default=False,
57+
help="Use Markov quality model instead of the traditional model.",
58+
)
59+
60+
parser.add_argument(
61+
"--overwrite",
62+
dest="overwrite",
63+
action="store_true",
64+
default=False,
65+
help="Overwrite existing output file if present.",
66+
)
67+
68+
output_group.add_to_parser(parser)
69+
70+
def execute(self, arguments: argparse.Namespace):
71+
72+
if len(arguments.quality_scores) == 1:
73+
qual_scores: int | list[int] = arguments.quality_scores[0]
74+
75+
else:
76+
qual_scores = arguments.quality_scores
77+
78+
model_qual_score_runner(
79+
files=arguments.input_files,
80+
offset=arguments.quality_offset,
81+
qual_scores=qual_scores,
82+
max_reads=arguments.max_num,
83+
overwrite=arguments.overwrite,
84+
output_dir=arguments.output_dir,
85+
output_prefix=arguments.prefix,
86+
use_markov=arguments.use_markov,
87+
)
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""
2+
Submodule to build quality score models for NEAT that wraps NEAT’s
3+
existing sequencing error model and traditional quality model, with the
4+
option to construct a Markov chain–based quality model instead
5+
"""
6+
7+
__all__ = ["model_qual_score_runner"]
8+
9+
from .runner import model_qual_score_runner

neat/model_quality_score/runner.py

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
"""
2+
Runner for creating quality score models.
3+
4+
This module implements the core logic for generating quality score models
5+
from input FASTQ files.
6+
"""
7+
8+
import gzip
9+
import pickle
10+
import logging
11+
from pathlib import Path
12+
from typing import Iterable, List, Optional, Tuple
13+
14+
import numpy as np
15+
16+
from ..common import validate_input_path, validate_output_path
17+
from ..model_sequencing_error.utils import parse_file
18+
from ..models import SequencingErrorModel, TraditionalQualityModel
19+
from ..models.markov_quality_model import MarkovQualityModel
20+
from ..quality_score_modeling.markov_utils import build_markov_model
21+
22+
__all__ = ["model_qual_score_runner"]
23+
24+
_LOG = logging.getLogger(__name__)
25+
26+
27+
def _prepare_quality_scores_argument(
28+
qual_scores: int | Iterable[int | float],
29+
) -> Tuple[List[int], Optional[List[int]]]:
30+
"""
31+
Returns:
32+
- full_range_scores: required by parse_file indexing
33+
- allowed_bins: None (no binning) or sorted unique user bins (for Markov binning)
34+
"""
35+
36+
if isinstance(qual_scores, int):
37+
max_q = int(qual_scores)
38+
return list(range(0, max_q + 1)), None
39+
40+
bins = sorted({int(x) for x in qual_scores})
41+
42+
if not bins:
43+
raise ValueError("quality_scores list must not be empty.")
44+
45+
max_q = max(bins)
46+
47+
# parse_file requires indices up to max_q (it indexes by raw Q value)
48+
return list(range(0, max_q + 1)), bins
49+
50+
51+
def model_qual_score_runner(
52+
files: List[str],
53+
offset: int,
54+
qual_scores: int | Iterable[int | float],
55+
max_reads: int,
56+
use_markov: bool,
57+
overwrite: bool,
58+
output_dir: str,
59+
output_prefix: str,
60+
) -> None:
61+
"""Create and save a quality score model from FASTQ data."""
62+
63+
if len(files) > 2:
64+
_LOG.info("Only processing the first two input files")
65+
files = files[:2]
66+
67+
# Validate input paths
68+
for file in files:
69+
validate_input_path(file)
70+
71+
_LOG.debug("Input files: %s", ", ".join(str(x) for x in files))
72+
_LOG.debug("Quality offset: %d", offset)
73+
74+
final_quality_scores, allowed_bins = _prepare_quality_scores_argument(qual_scores)
75+
_LOG.debug("Quality scores range: %s", final_quality_scores)
76+
77+
if allowed_bins is not None:
78+
_LOG.debug("Markov binning enabled with bins: %s", allowed_bins)
79+
80+
if max_reads in (-1, None):
81+
num_records_to_process = float("inf")
82+
83+
else:
84+
num_records_to_process = max_reads
85+
86+
# Validate output directory and file
87+
validate_output_path(output_dir, is_file=False)
88+
output_path = Path(output_dir)
89+
output_file = output_path / f"{output_prefix}.p.gz"
90+
validate_output_path(output_file, overwrite=overwrite)
91+
92+
_LOG.info("Writing output to: %s", output_file)
93+
94+
# Containers for per-file quality model parameters
95+
read_parameters: List[np.ndarray] = []
96+
average_errors: List[float] = []
97+
read_length = 0
98+
99+
# Traditional model parameters (existing NEAT utility)
100+
for idx_file, file in enumerate(files, start=1):
101+
102+
_LOG.info("Reading file %d of %d", idx_file, len(files))
103+
104+
params_by_position, file_avg_error, read_length = parse_file(
105+
file,
106+
final_quality_scores,
107+
num_records_to_process,
108+
offset,
109+
read_length,
110+
)
111+
112+
read_parameters.append(params_by_position)
113+
average_errors.append(file_avg_error)
114+
115+
_LOG.info("Finished reading file %d", idx_file)
116+
117+
if not read_parameters:
118+
raise RuntimeError("No quality score parameters were computed. Check input FASTQ files.")
119+
120+
average_error = float(np.average(average_errors)) if average_errors else 0.0
121+
_LOG.info("Average sequencing error across files: %f", average_error)
122+
123+
# Prepare models for each input file
124+
models: List[Tuple[SequencingErrorModel, TraditionalQualityModel, Optional[MarkovQualityModel]]] = []
125+
126+
for idx in range(len(read_parameters)):
127+
128+
# Sequencing error model (always produced)
129+
seq_err_model = SequencingErrorModel(avg_seq_error=average_error, read_length=read_length)
130+
131+
# Traditional quality model (always produced)
132+
trad_model = TraditionalQualityModel(
133+
average_error=average_error,
134+
quality_scores=np.array(final_quality_scores),
135+
qual_score_probs=read_parameters[idx],
136+
)
137+
138+
markov_model: Optional[MarkovQualityModel] = None
139+
140+
# Optionally build Markov quality model
141+
142+
if use_markov:
143+
144+
# Position-specific transition matrices
145+
init_dist, pos_dists, trans_dists, max_quality, train_read_length = build_markov_model(
146+
[files[idx]],
147+
num_records_to_process,
148+
offset,
149+
allowed_quality_scores=allowed_bins,
150+
)
151+
152+
markov_model = MarkovQualityModel(
153+
initial_distribution=init_dist,
154+
position_distributions=pos_dists,
155+
max_quality=max_quality,
156+
read_length=train_read_length,
157+
transition_distributions=trans_dists,
158+
)
159+
160+
models.append((seq_err_model, trad_model, markov_model))
161+
162+
# Write out the models
163+
164+
with gzip.open(output_file, "wb") as out_model:
165+
166+
if len(models) == 1:
167+
168+
seq_err1, trad1, markov1 = models[0]
169+
pickle.dump(
170+
{
171+
"error_model1": seq_err1,
172+
"error_model2": None,
173+
"qual_score_model1": markov1 if use_markov else trad1,
174+
"qual_score_model2": None,
175+
},
176+
out_model,
177+
)
178+
179+
elif len(models) == 2:
180+
181+
(seq_err1, trad1, markov1), (seq_err2, trad2, markov2) = models
182+
pickle.dump(
183+
{
184+
"error_model1": seq_err1,
185+
"error_model2": seq_err2,
186+
"qual_score_model1": markov1 if use_markov else trad1,
187+
"qual_score_model2": markov2 if use_markov else trad2,
188+
},
189+
out_model,
190+
)
191+
192+
else:
193+
194+
# NEAT's read simulator only understands one or two models
195+
raise RuntimeError(f"Expected at most two quality models, but constructed {len(models)}.")
196+
197+
_LOG.info("Quality score model saved to %s", output_file)

neat/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .error_models import *
22
from .mutation_model import *
33
from .fragment_model import *
4+
from .markov_quality_model import *

0 commit comments

Comments
 (0)