ncsa
diff --git a/‎.gitignore‎
Lines changed: 26 additions & 1 deletion b/‎.gitignore‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎neat/read_simulator/utils/bed_func.py‎
Lines changed: 16 additions & 17 deletions b/‎neat/read_simulator/utils/bed_func.py‎
Lines changed: 16 additions & 17 deletions
diff --git a/‎neat/read_simulator/utils/generate_reads.py‎
Lines changed: 7 additions & 0 deletions b/‎neat/read_simulator/utils/generate_reads.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎neat/read_simulator/utils/generate_variants.py‎
Lines changed: 6 additions & 6 deletions b/‎neat/read_simulator/utils/generate_variants.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎neat/read_simulator/utils/vcf_func.py‎
Lines changed: 16 additions & 10 deletions b/‎neat/read_simulator/utils/vcf_func.py‎
Lines changed: 16 additions & 10 deletions
diff --git a/‎neat/variants/contig_variants.py‎
Lines changed: 6 additions & 6 deletions b/‎neat/variants/contig_variants.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎tests/conftest.py‎
Lines changed: 30 additions & 0 deletions b/‎tests/conftest.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎tests/test_cli/test_basic_cli.py‎
Lines changed: 1 addition & 0 deletions b/‎tests/test_cli/test_basic_cli.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/test_models/test_error_and_mut_models.py‎
Lines changed: 4 additions & 30 deletions b/‎tests/test_models/test_error_and_mut_models.py‎
Lines changed: 4 additions & 30 deletions
@@ -1,6 +1,31 @@
 # Ignore filetypes
 *.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Virtual environments
 /python2env/
+/.venv/
+/venv/
+/env/
+
+# IDEs
 /.ipynb_checkpoints/
 /.vscode/
-/.idea/
+/.idea/
+
+# Test & coverage artifacts
+.coverage
+.coverage.*
+htmlcov/
+.pytest_cache/
+
+# NEAT log files
+*.log
+
+# Build / packaging
+dist/
+build/
+*.egg-info/
+*.egg
@@ -21,30 +21,29 @@
 
 def intersect_regions(mutation_regions: list, block_tuple: tuple[int, int], default_value: float) -> list:
     """
+    Clips each mutation region to the block window and returns the overlapping
+    sub-intervals with each region's mutation rate; an uncovered block or tail gets default_value.
+
     Our assumption here is that mutation regions is a continuous list, such that
     for each region, the end of the previous region is the start of the next region,
     and there are no gaps. This should be true of anything generated from parse_beds, but
     needs some tests to verify
     """
     ret_list = []
     block_start, block_end = block_tuple
-    for i in range(len(mutation_regions)):
-        region = mutation_regions[i]
-        if region[0] <= block_start < region[1]:
-            # We found the first region covering the block
-            if block_end <= region[1]:
-                # If the block spans the entire region, we have a special case
-                ret_list.append((block_start, block_end, region[2]))
-                # nothing more to do
-                return ret_list
-            ret_list.append((block_start, region[1], region[2]))
-        elif region[0] <= block_end < region[1]:
-            # We found the last region covering the block
-            ret_list.append((region[0], block_end, region[2]))
-            # nothing more to do
-            return ret_list
-    # If we haven't returned yet, then we did not find the end in our mutations list
-    ret_list.append((mutation_regions[-1][1], block_end, default_value))
+    for region in mutation_regions:
+        overlap_start = max(region[0], block_start)
+        overlap_end = min(region[1], block_end)
+        if overlap_start < overlap_end:
+            ret_list.append((overlap_start, overlap_end, region[2]))
+
+    if not ret_list:
+        # Block is entirely outside all provided regions
+        ret_list.append((block_start, block_end, default_value))
+    elif ret_list[-1][1] < block_end:
+        # Block extends past the last region; fill the tail with the default rate
+        ret_list.append((ret_list[-1][1], block_end, default_value))
+
     return ret_list
 
 def parse_beds(options: Options, ref_keys_counts: dict) -> list:
 
@@ -183,6 +183,13 @@ def generate_reads(
     # _LOG.info(f'Sampling reads for thread {thread_index}...')
     start_time = time.time()
 
+    if len(reference) < options.read_len:
+        _LOG.warning(
+            f"Contig '{contig_name}' (length {len(reference)}) is shorter than read_len "
+            f"({options.read_len}). Skipping contig."
+        )
+        return []
+
     # _LOG.debug("Covering dataset.")
     t = time.time()
     reads = cover_dataset(
 
@@ -77,8 +77,9 @@ def generate_variants(
             for variant in input_variants.contig_variants[variant_location]:
                 return_variants.add_variant(variant)
 
-    # pase out the mutation rates
-    mutation_rates = np.array([x[2] for x in mutation_rate_regions])
+    # pase out the mutation rates; substitute None with the model average
+    mutation_rates = np.array([x[2] if x[2] is not None else mutation_model.avg_mut_rate
+                               for x in mutation_rate_regions])
 
     # Trying to use a random window to keep memory under control. May need to adjust this number.
     max_window_size = 1000
@@ -114,13 +115,12 @@ def generate_variants(
     # _LOG.info(f'Planning to add {how_many_mutations} mutations. The final number may be less.')
 
     while how_many_mutations > 0:
-        # Pick a region based on the mutation rates
-        # (default is one rate for the whole chromosome, so this will be trivial in that case
-        # for this selection, we'll normalize the mutation rates
-        probability_rates = mutation_rates / sum(mutation_rates)
         # We need to intersect our chosen mutation region with our block
         local_mut_regions = bed_func.intersect_regions(mutation_rate_regions, (ref_start, ref_start + len(reference)), options.mutation_rate)
         # For no input mutation regions bed, this will return the entire sequence.
+        # Build probability weights from the intersected regions so the lengths always match.
+        local_rates = np.array([r[2] if r[2] is not None else mutation_model.avg_mut_rate for r in local_mut_regions])
+        probability_rates = local_rates / sum(local_rates)
         mut_region = options.rng.choice(a=local_mut_regions, p=probability_rates)
         mut_region_offset = (int(mut_region[0]-ref_start), int(mut_region[1]-ref_start), mut_region[2])
 
 
@@ -161,19 +161,23 @@ def parse_input_vcf(
                         # Retrieve the GT from the first sample in the record
                         genotype = retrieve_genotype(record)
 
-                    elif "WP" in [x.split('=') for x in record[7].split(';')]:
+                    elif "WP" in [x.split('=')[0] for x in record[7].split(';') if '=' in x]:
                         """
                         "WP" is the legacy code NEAT used for genotype it added. It was found in the INFO field.
                         We're just going to make a sample column in this version of NEAT
                         The logic of the statement is split the info field on ';' which is used as a divider in that field.
                         Most but not all fields also have an '=', so split there too, then look for "WP"
                         """
                         format_column = f"GT:{record[8]}"
-                        for record in record[7].split(';'):
-                            if record.startswith('WP'):
-                                genotype = record.split('=')[1].replace('/', '|').split('|')
+                        sample_field = record[9]
+                        for info_item in record[7].split(';'):
+                            if info_item.startswith('WP') and '=' in info_item:
+                                genotype = info_item.split('=')[1].replace('/', '|').split('|')
                                 genotype = np.array([int(x) for x in genotype])
-                                normal_sample_field = f"{get_genotype_string(genotype)}:{record[9]}"
+                                normal_sample_field = f"{get_genotype_string(genotype)}:{sample_field}"
+                            elif info_item.startswith('WP'):
+                                _LOG.error(f'Malformed WP field in INFO (missing value): {record[7]}')
+                                sys.exit(1)
 
                     else:
                         format_column = 'GT:' + record[8]
@@ -182,20 +186,22 @@ def parse_input_vcf(
                         gt_field = get_genotype_string(genotype)
                         normal_sample_field = f'{gt_field}:{record[9]}'
 
-                elif "WP" in [x.split('=') for x in record[7].split(';')]:
+                elif "WP" in [x.split('=')[0] for x in record[7].split(';') if '=' in x]:
                     """
                     "WP" is the legacy code NEAT used for genotype it added. It was found in the INFO field.
                     We're just going to make a sample column in this version of NEAT
                     The logic of the statement is split the info field on ';' which is used as a divider in that field.
                     Most but not all fields also have an '=', so split there too, then look for "WP"
                     """
                     format_column = "GT"
-                    info_split = record[7].split(';')
-                    for record in info_split:
-                        if record.startswith('WP'):
-                            genotype = record.split('=')[1].replace('/', '|').split('|')
+                    for info_item in record[7].split(';'):
+                        if info_item.startswith('WP') and '=' in info_item:
+                            genotype = info_item.split('=')[1].replace('/', '|').split('|')
                             genotype = np.array([int(x) for x in genotype])
                             normal_sample_field = get_genotype_string(genotype)
+                        elif info_item.startswith('WP'):
+                            _LOG.error(f'Malformed WP field in INFO (missing value): {record[7]}')
+                            sys.exit(1)
 
                 else:
                     # If there was no format column, there's no sample column, so we'll generate one
 
@@ -49,7 +49,7 @@ def check_if_del(self, other):
 
     def check_if_ins(self, other):
         for insert in self.all_ins:
-            if np.array_equal(other.genotype, insert.genotype) and insert.contains(other):
+            if np.array_equal(other.genotype, insert.genotype) and insert.contains(other.position1):
                 return insert
         return None
 
@@ -150,11 +150,11 @@ def get_sample_info(variant):
             return get_genotype_string(variant.genotype)
 
     def remove_variant(self, variant):
-        if variant.position in self.variant_locations:
-            if variant in self.contig_variants[variant.position]:
-                self.contig_variants[variant.position].remove(variant)
-            if not self.contig_variants[variant.position]:
-                self.variant_locations.remove(variant.position)
+        if variant.position1 in self.variant_locations:
+            if variant in self.contig_variants[variant.position1]:
+                self.contig_variants[variant.position1].remove(variant)
+            if not self.contig_variants[variant.position1]:
+                self.variant_locations.remove(variant.position1)
 
     def __getitem__(self, input_location: int) -> list:
         """
 
@@ -0,0 +1,30 @@
+import logging
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _isolate_neat_logging():
+    """
+    Close and remove any FileHandlers attached to NEAT loggers before and after each test.
+    Prevents 'ValueError: I/O operation on closed file' errors when a FileHandler
+    from a previous test is still attached after its underlying file is closed.
+    Propagation is left intact so caplog can capture NEAT log output.
+    """
+    def _close_file_handlers(logger):
+        for h in list(logger.handlers):
+            if isinstance(h, logging.FileHandler):
+                logger.removeHandler(h)
+                try:
+                    h.close()
+                except Exception:
+                    pass
+
+    for name, logger in list(logging.Logger.manager.loggerDict.items()):
+        if (name == "neat" or name.startswith("neat.")) and isinstance(logger, logging.Logger):
+            _close_file_handlers(logger)
+
+    yield
+
+    for name, logger in list(logging.Logger.manager.loggerDict.items()):
+        if (name == "neat" or name.startswith("neat.")) and isinstance(logger, logging.Logger):
+            _close_file_handlers(logger)
@@ -36,6 +36,7 @@ def test_basic_cli():
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
+            cwd=str(td),
         )
         assert proc.returncode == 0, f"STDERR:\n{proc.stderr}"
         assert out.exists()
@@ -41,26 +41,8 @@ def test_mutation_model_generate_snv_trinuc():
     assert snv.alt in ["A", "C", "G", "T"]
 
 
-def test_sequencing_error_model_zero_error_returns_none_or_empty():
-    """
-    avg_seq_error == 0 should yield no errors.
-    """
-    rng = default_rng(4)
-    sem = SequencingErrorModel(avg_seq_error=0.0)
-    ref = SeqRecord(Seq("A" * 40), id="chr1")
-    quals = np.array([40] * 40, dtype=int)
-    result = sem.get_sequencing_errors(
-        padding=20,
-        reference_segment=ref,
-        quality_scores=quals,
-        rng=rng,
-    )
-    if isinstance(result, tuple):
-        introduced, pad = result
-        assert introduced == []
-        assert pad >= 0
-    else:
-        assert result == []
+    # test_sequencing_error_model_zero_error_returns_none_or_empty removed:
+    # duplicate of test_error_models.py::test_sem_zero_error_rate_returns_empty
 
 
 def test_traditional_quality_model_shapes_and_range():
@@ -135,16 +117,8 @@ def test_mutation_model_snv_does_not_keep_reference_base():
     assert snv.alt != central
 
 
-def test_traditional_quality_model_reproducible_with_seed():
-    """Quality model should be deterministic given the same RNG state."""
-    rng1 = default_rng(8)
-    rng2 = default_rng(8)
-    qm = TraditionalQualityModel(average_error=0.01)
-
-    qs1 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng1)
-    qs2 = qm.get_quality_scores(model_read_length=151, length=100, rng=rng2)
-
-    assert np.array_equal(qs1, qs2)
+    # test_traditional_quality_model_reproducible_with_seed removed:
+    # duplicate of test_error_models.py::test_tqm_get_quality_scores_reproducible
 
 
 def test_sequencing_error_model_reproducible_with_seed():
Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,7 @@ def test_basic_cli():`
`36`	`36`	`stdout=subprocess.PIPE,`
`37`	`37`	`stderr=subprocess.PIPE,`
`38`	`38`	`text=True,`
	`39`	`+ cwd=str(td),`
`39`	`40`	`)`
`40`	`41`	`assert proc.returncode == 0, f"STDERR:\n{proc.stderr}"`
`41`	`42`	`assert out.exists()`