ncsa · joshfactorial · May 3, 2026 · Apr 28, 2026 · May 3, 2026 · May 3, 2026
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -1,3 +1,18 @@
+# NEAT v4.4
+- Official release of NEAT 4.0. Represents major contributions from NCSA and beyond.
+- Added parallel processing support, making NEAT production-ready for large genomes.
+- Includes more options for quality score modeling, a bacterial genome wrapper, additional tests, and performance improvements.
+- Fixed inverted indel gate condition in the sequencing error model: indel errors were never generated due to a `>` vs `<=` logic error.
+- Fixed deletion blacklist bug: the anchor position of a deletion was incorrectly blacklisted, causing the deletion to remove itself during error cleanup.
+- Fixed quality array float promotion: applying a deletion error produced an empty quality slice (`np.array([])`), which defaulted to `float64` and broke quality string encoding.
+- Fixed fallback infinite loop in error selection when all quality scores are uniform.
+- Fixed off-by-one in the main error selection loop.
+- Added `errors_per_read` pre-calculation in the runner for more accurate per-contig error budgeting proportional to coverage and contig length.
+- Fixed trinucleotide context slice off-by-one in variant generation.
+- Removed debug `import pdb` statements from production code.
+- Replaced debug print sentinels in the mutation model with proper log warnings.
+- Expanded test coverage for the error model, runner, and read simulator.
+
 # NEAT has a new home
 NEAT is now a part of the NCSA github and active development will continue here. Please direct issues, comments, and requests to the NCSA issue tracker. Submit pull requests here insead of the old repo.
 

diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@ Welcome to the NEAT project, the NExt-generation sequencing Analysis Toolkit, ve
 
 ## NEAT v4.4
 
-NEAT 4.4 fixes a few bugs related to NEAT. This release of NEAT includes several fixes to bugs and a little bit of restructuring, including more options for quality score modeling, a bacterial genome wrapper, additional tests, and more. Our tests show much improved performance. If the logs seem excessive, you might try using the `--log-level ERROR` to reduce the output from the logs. See the [ChangeLog](ChangeLog.md) for notes. 
+NEAT 4.4 is the official release of NEAT 4.0, including parallel processing support and significant bug fixes to the sequencing error model. See the [ChangeLog](ChangeLog.md) for details. 
 
 We have completed major revisions on NEAT since 3.4 and consider NEAT 4.4 to be a stable release, in that we will continue to update and provide bug fixes and support. We will consider new features and pull requests. Please include justification for major changes. See [contribute](CONTRIBUTING.md) for more information. If you'd like to use some of our code in your own, no problem! Just review the [license](LICENSE.md), first.
 
@@ -22,8 +22,8 @@ To cite this work, please use:
 
 ## Table of Contents
 
-* [The NEAT Project v4.4](#the-neat-project-v436)
-* [NEAT v4.4](#neat-v435)
+* [The NEAT Project v4.4](#the-neat-project-v44)
+* [NEAT v4.4](#neat-v44)
 * [Table of Contents](#table-of-contents)
   * [Prerequisites](#prerequisites)
   * [Installation](#installation)

diff --git a/neat/models/error_models.py b/neat/models/error_models.py
@@ -10,6 +10,7 @@
 
 from Bio.Seq import Seq
 from Bio import SeqRecord
+from numpy import median
 
 from neat import variants
 
@@ -165,8 +166,9 @@ def __init__(
     def get_sequencing_errors(
             self,
             padding: int,
-            reference_segment: SeqRecord,
+            reference_segment: Seq,
             quality_scores: np.ndarray,
+            num_errors,
             rng
     ):
         """
@@ -175,8 +177,9 @@ def get_sequencing_errors(
         :param padding: this is the amount of space we have in the read for deletions.
         :param reference_segment: The section of the reference from which the read is drawn
         :param quality_scores: Array of quality scores for the read
-        :return: Modified sequence and associated quality scores
+        :param num_errors: The estimated number of errors to add.
         :param rng: random number generator.
+        :return: Modified sequence and associated quality scores
         """
 
         error_indexes = []
@@ -189,9 +192,20 @@ def get_sequencing_errors(
         if self.average_error == 0:
             return introduced_errors
         else:
-            for i in range(len(quality_scores)):
-                if rng.random() < quality_score_error_rate[quality_scores[i]]:
-                    error_indexes.append(i)
+            i = len(quality_scores)
+            while len(error_indexes) < num_errors and i > 0:
+                index = rng.choice(list(range(len(quality_scores))))
+                if rng.random() < quality_score_error_rate[quality_scores[index]]:
+                    error_indexes.append(index)
+                i -= 1
+            # Fallback: if quality scores are too high to naturally reach num_errors,
+            # force errors at positions with at-or-below-median quality scores.
+            # Using <= so that uniform quality arrays (all scores equal) always make progress.
+            median_score = median(quality_scores)
+            while len(error_indexes) < num_errors:
+                index = rng.integers(len(quality_scores))
+                if quality_scores[index] <= median_score:
+                    error_indexes.append(index)
 
         total_indel_length = 0
         # To prevent deletion collisions
@@ -205,27 +219,27 @@ def get_sequencing_errors(
 
             # This is to prevent deletion error collisions and to keep there from being too many indel errors.
             if 0 < index < self.read_length - max(
-                    self.deletion_len_model) and total_indel_length > self.read_length // 4:
+                    self.deletion_len_model) and total_indel_length <= self.read_length // 4:
                 error_type = rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))
 
             # Deletion error
             if error_type == Deletion:
-                deletion_length = self.get_deletion_length()
+                deletion_length = self.get_deletion_length(rng)
                 if padding - deletion_length < 0:
                     # No space in this read to add this deletion
                     continue
-                deletion_reference = reference_segment.seq[index: index + deletion_length + 1]
+                deletion_reference = reference_segment[index: index + deletion_length + 1]
                 deletion_alternate = deletion_reference[0]
                 introduced_errors.append(
                     ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate)
                 )
                 total_indel_length += deletion_length
 
-                del_blacklist.extend(list(range(index, index + deletion_length)))
+                del_blacklist.extend(list(range(index + 1, index + deletion_length + 1)))
                 padding -= deletion_length
 
             elif error_type == Insertion:
-                insertion_length = self.get_insertion_length()
+                insertion_length = self.get_insertion_length(rng)
                 insertion_reference = reference_segment[index]
                 insert_string = ''.join(rng.choice(ALLOWED_NUCL, size=insertion_length))
                 insertion_alternate = insertion_reference + insert_string

diff --git a/neat/models/mutation_model.py b/neat/models/mutation_model.py
@@ -75,8 +75,6 @@ def __init__(self,
         self.variant_probs = variant_probs
         self.transition_matrix = transition_matrix
         self.is_cancer = is_cancer
-        self.all_dels = []
-        self.all_ins = []
 
     def get_mutation_type(self, rng: Generator) -> VariantTypes:
         """
@@ -125,7 +123,6 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int, rng: Generat
         # Now pick a random alternate, weighted by the probabilities
         alt = rng.choice(ALLOWED_NUCL, p=transition_probs)
         temp_snv = SingleNucleotideVariant(reference_location, alt=alt)
-        self.all_ins.append(temp_snv)
         return temp_snv
 
     def generate_insertion(self, location: int, ref: Seq, rng: Generator) -> Insertion:
@@ -158,5 +155,4 @@ def generate_deletion(self, location: int, rng: Generator) -> Deletion:
         # Plus one so we make sure to grab the first base too.
         # Note: if we happen to go past the end of the sequence, it will just be shorter.
         temp_del = Deletion(location, length)
-        self.all_dels.append(temp_del)
         return temp_del
diff --git a/neat/models/variant_models.py b/neat/models/variant_models.py
@@ -2,7 +2,6 @@
 Classes for the variant models included in NEAT.
 Every Variant type in variants > variant_types must have a corresponding model in order to be fully implemented.
 """
-import pdb
 import re
 import logging
 import abc
@@ -165,10 +164,10 @@ def map_local_trinuc_bias(
                     for match in re.finditer(trinuc, str(sequence)):
                         # match.start() + 1 puts us at the center of the trinuc
                         if match.start() + 1 > len(self.local_trinuc_bias):
-                            print("???")
+                            _LOG.warning(f"Trinuc bias index out of range: {match.start() + 1} > {len(self.local_trinuc_bias)}")
                         self.local_trinuc_bias[match.start() + 1] = self.trinuc_mutation_bias[TRINUC_IND[trinuc]]
                         if len(self.local_trinuc_bias) != len(sequence):
-                            print("???")
+                            _LOG.warning(f"Trinuc bias length mismatch: {len(self.local_trinuc_bias)} != {len(sequence)}")
 
             # Now we normalize the bias
             self.local_trinuc_bias = self.local_trinuc_bias / sum(self.local_trinuc_bias)

diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py
@@ -1,11 +1,15 @@
 """
 Runner for generate_reads task
 """
+import gzip
 import logging
+import os
+import pickle
 import shutil
 import subprocess
 import time
 import multiprocessing as mp
+from math import ceil
 
 from pathlib import Path
 
@@ -16,6 +20,7 @@
 from .utils import Options, OutputFileWriter, parse_beds, parse_input_vcf
 from ..common import validate_input_path, validate_output_path
 from .single_runner import read_simulator_single
+from ..models import SequencingErrorModel
 from ..variants import ContigVariants
 from .utils.split_inputs import main as split_main
 from .utils.stitch_outputs import main as stitch_main
@@ -74,6 +79,31 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
     reference_index = SeqIO.index(str(options.reference), "fasta")
     reference_keys_with_lens = {key: len(value) for key, value in reference_index.items()}
 
+    # We need sequencing errors to get the quality score attributes, even for the vcf
+    if options.avg_seq_error:
+        average_error = options.avg_seq_error
+    elif options.error_model:
+        error_models = pickle.load(gzip.open(options.error_model))
+        average_error = error_models["error_model1"].average_error
+        # We just need the error value
+        del error_models
+    else:
+        # Use the default value
+        average_error = 0.009228843915252066
+
+    # _LOG.debug('Sequencing error and quality score models loaded')
+    # We need to estimate how many total errors to add
+    total_reference_length = sum(reference_keys_with_lens.values())
+    # This is an estimate due to some random effects
+    number_of_bases_in_analysis = total_reference_length * options.coverage
+    # Each base called has an "average_error" chance of being an error
+    total_errors = ceil(average_error * number_of_bases_in_analysis)
+    # Normalization gives the percent value for each item with a sum of all values being 1.0
+    normalized_counts = {k: v / total_reference_length for (k, v) in reference_keys_with_lens.items()}
+    # Multiply the normalized count by the total errors. This gives the number of errors that should be
+    # introduced into the contig
+    errors_per_contig = {k: ceil(v * total_errors) for (k, v) in normalized_counts.items()}
+
     count = 0
     for contig in reference_keys_with_lens:
         count += reference_keys_with_lens[contig]
@@ -132,15 +162,20 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
 
     output_opts: list = []
     output_files: list = []
-    # a dict with contig keys, then the thread index, and finally the applicable contig variants as the value
-    # TODO Remove if not needed
-    # all_variants: dict[str, dict[int, ContigVariants]] = {chrom: {} for chrom in reference_index.keys()}
     thread_idx = 1
     contig_list = list(reference_keys_with_lens.keys())
     contig_dict = {contig: contig_list.index(contig) for contig in reference_keys_with_lens.keys()}
     for contig in splits_files_dict:
         contig_index = contig_dict[contig]
         for ((start, length), splits_file) in splits_files_dict[contig].items():
+            block_percentage = length / reference_keys_with_lens[contig]
+            block_errors = errors_per_contig[contig] * block_percentage
+            estimated_number_of_reads = (length // options.read_len) * options.coverage
+            errors_per_read = round(block_errors / estimated_number_of_reads)
+            if errors_per_read < 1.0 and block_errors > 0:
+                # We know we need a few errors, but it's a small number total
+                if options.rng.random() < average_error:
+                    errors_per_read += 1
             current_output_dir = options.temp_dir_path / splits_file.stem
             current_output_dir.mkdir(parents=True, exist_ok=True)
             # Create local filenames based on fasta indexing scheme.
@@ -175,10 +210,9 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
                     target_regions_dict[contig],
                     discard_regions_dict[contig],
                     mutation_rate_dict[contig],
+                    errors_per_read,
                 )
                 _LOG.info(f"Completed simulating contig {contig}.")
-                # TODO Remove if not needed
-                # all_variants[contig][idx] = local_variants
                 output_files.append((thread_idx, files_written))
             else:
                 thread_input_variants = filter_thread_variants(input_variants_dict[contig], (start, start+length))
@@ -196,7 +230,8 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
                     thread_input_variants,
                     thread_target_regions,
                     thread_discard_regions,
-                    thread_mutation_regions
+                    thread_mutation_regions,
+                    errors_per_read,
                 ))
             thread_idx += 1
 
@@ -210,8 +245,6 @@ def read_simulator_runner(config: str, output_dir: str, file_prefix: str):
 
         # Need to organize the results, as above
         for idx, contig, local_variants, files_written in results.get():
-            # TODO Remove if not needed
-            # all_variants[contig][idx] = local_variants
             output_files.append((idx, files_written))
 
     _LOG.info("Processing complete, writing output")

diff --git a/neat/read_simulator/single_runner.py b/neat/read_simulator/single_runner.py
@@ -4,13 +4,14 @@
 import gzip
 import os
 import pickle
-import pdb
 
 import pysam
 from Bio import SeqIO, bgzf
 import logging
 from pathlib import Path
 
+from math import ceil, floor
+
 from .utils import OutputFileWriter, \
     generate_variants, generate_reads, Options, recalibrate_mutation_regions
 from ..variants import ContigVariants
@@ -25,28 +26,28 @@ def read_simulator_single(
         thread_idx: int,
         block_start: int,
         local_options: Options,
-        bam_header: list | None,
+        bam_header: dict | None,
         contig_name: str,
         contig_index: int,
         input_variants_local: ContigVariants,
         target_regions: list,
         discard_regions: list,
         mutation_regions: list,
+        errors_per_read: int,
 ) -> tuple[int, str, ContigVariants, dict[str, Path], ]:
     """
     inputs:
     :param thread_idx: index of current thread
     :param block_start: Where on the reference does this block start? For a full contig, this will be 0.
     :param local_options: options for current thread and reference chunk
-    :param bam_header: Pass in the outer bam header.
+    :param bam_header: Pass in the outer bam header, which is a dictionary of the contigs plus lengths.
     :param contig_name: The original list of contig names.
     :param contig_index: The index of the contig which this chunk comes from
     :param input_variants_local: The input variants for this block
-    TODO I'm counting on the target and discard regions not being used. They are likely broken with multithreading.
-            Probably they make more sense after the fact now
     :param target_regions: Target regions for the run
     :param discard_regions:  discard regions for the run
     :param mutation_regions: mutation regions (unchecked) for the run
+    :param errors_per_read: How many errors this contig will receive
 
     Ideally this should work for either a file chunk or contig. We'll assume here that we're
     getting either an entire contig or a file chunk, and that no new subdivisions are needed.
@@ -113,6 +114,7 @@ def read_simulator_single(
             thread_idx,
             local_seq_record,
             seq_error_model,
+            errors_per_read,
             qual_score_model,
             fraglen_model,
             local_variants,

diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py
@@ -1,7 +1,6 @@
 import logging
 import pickle
 import time
-import pdb
 from math import ceil
 from pathlib import Path
 
@@ -149,6 +148,7 @@ def generate_reads(
         thread_index: int,
         reference: SeqRecord,
         error_model: SequencingErrorModel,
+        errors_per_read: int,
         qual_model: TraditionalQualityModel,
         fraglen_model: FragmentLengthModel,
         contig_variants: ContigVariants,
@@ -166,6 +166,7 @@ def generate_reads(
     :param thread_index: Index of current thread
     :param reference: The reference segment that reads will be drawn from.
     :param error_model: The error model for this run, the forward strand
+    :param errors_per_read: Total number of errors to add to contig
     :param qual_model: The quality score model for this run, forward strand
     :param fraglen_model: The fragment length model for this run
     :param contig_variants: An object containing all input and randomly generated variants to be included.
@@ -285,9 +286,9 @@ def generate_reads(
             fastq_handle,
             options.quality_offset,
             options.produce_fastq,
+            errors_per_read,
             options.rng
         )
-
         # skip over read 2 for single ended reads.
         if options.paired_ended:
             # Padding, as above
@@ -313,7 +314,6 @@ def generate_reads(
             )
 
             read_2.mutations = find_applicable_mutations(read_2, contig_variants)
-
             if options.produce_fastq:
                 fastq_handle = ofw.files_to_write[ofw.fq2]
             else:
@@ -324,6 +324,7 @@ def generate_reads(
                 fastq_handle,
                 options.quality_offset,
                 options.produce_fastq,
+                errors_per_read,
                 options.rng
             )
             reads_to_write.append((read_1, read_2))