Merge pull request #300 from ncsa/audit-splits-options

joshfactorial · web-flow · commit 765b5ece3e48 · 2026-05-19T23:18:25.000-05:00
Drop cleanup_splits and reuse_splits; bump to v4.5.1
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -1,3 +1,18 @@
+# NEAT v4.5.1
+
+Removed the `cleanup_splits` and `reuse_splits` config options.
+
+`reuse_splits` was fully broken (it raised `FileNotFoundError` unconditionally
+regardless of whether the splits directory existed). `cleanup_splits` existed
+solely to support `reuse_splits`; without it the option has no value — the
+splits directory always lives in a `TemporaryDirectory` that is cleaned up
+automatically when the run exits.
+
+Both keys are now in `DEPRECATED_KEYS`: configs that still include them
+receive a one-line deprecation warning and continue parsing cleanly, so no
+user config breaks. The README examples and all internal test fixtures have
+been updated to drop these keys.
+
 # NEAT v4.5.0
 
 New `neat compare-vcfs` subcommand: compares a downstream variant caller's VCF
diff --git a/README.md b/README.md
@@ -205,8 +205,6 @@ More parameters are below:
 | `min_mutations`     | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig.   |
 | `threads`           | Number of threads to use. More than 1 will use multi-threading to speed up processing. With `threads > 1`, NEAT splits each contig into chunks; with `threads == 1`, one chunk per contig is used.            |
 | `parallel_block_size` | Per-chunk size in bases when `threads > 1`. Default `0` (auto-tune from total genome length and thread count, targeting ~8 chunks per thread). Set to a positive integer to override. Ignored when `threads == 1`. |
-| `cleanup_splits`    | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run.                     |
-| `reuse_splits`      | If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`.                                                                                             |
 
 The command line options for NEAT are as follows:
 
@@ -280,7 +278,6 @@ The inputs in single-threaded, contig-based mode most closely replicate the beha
 The configuration used:
 
 - `threads: 1`  (NEAT processes one contig per chunk in single-thread mode)
-- `cleanup_splits: True`
 
 | Organism        | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) |
 |-----------------|-------------------|-------------------|--------------------|
@@ -302,7 +299,6 @@ Here we enabled NEAT’s multi-threaded mode, which splits contigs into size-bas
 - `parallel_block_size: 500000`
 - `produce_bam: false`
 - `threads: 7`
-- `cleanup_splits: True`
 
 | Organism        | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) |
 |-----------------|-------------------|-------------------|--------------------|
diff --git a/config_template/simple_template.yml b/config_template/simple_template.yml
@@ -29,5 +29,3 @@ overwrite_output: .
 
 parallel_block_size: .
 threads: .
-cleanup_splits: .
-reuse_splits: .
diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml
@@ -1,4 +1,4 @@
-## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly)
+## Template for NEAT's read-simulator (as of version 4.5.1, parallelization-friendly)
 ## Any parameter that is not required but has a default value will use the
 ## default value even if the variable is not included in the config. For
 ## required items, they must be included in the config and they must be given a value.
@@ -152,14 +152,3 @@ parallel_block_size: .
 # Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run
 # type = int | required: no | default = all available
 threads: .
-
-# Delete the 'splits' directory after stitching completes
-# Note: If threads == 1, this option has no effect.
-# type = bool | required: no | default = true
-cleanup_splits: .
-
-# Reuse existing files in '<out_dir>/splits' and skip the split step.
-# The directory must contain NEAT-generated files and must be in the output directory within "splits"
-# Note: If threads == 1, this option has no effect.
-# type = bool | required: no | default = False
-reuse_splits: .
diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py
@@ -82,9 +82,7 @@ def __init__(self,
                  produce_fastq: bool = True,
                  min_mutations: int = 0,
                  parallel_block_size: int = 0,
-                 cleanup_splits: bool = True,
                  splits_dir: Path | None = None,
-                 reuse_splits: bool = False,
                  gc_model: Path | None = None,
                  **kwargs: Any
                  ):
@@ -136,9 +134,7 @@ def __init__(self,
             chunks of this size). Default 0 auto-tunes from total genome length and thread count, targeting
             ~8 chunks per thread. Specify a positive integer to override. Ignored when threads == 1, where
             one chunk per contig is used.
-        :param cleanup_splits: Set to False in order to preserve splits after run
-        :param reuse_splits: Attempts to reuse existing splits file
-        """
+"""
         super().__init__(**kwargs)
         self.reference: Path = reference
         self.output_dir: Path = output_dir
@@ -173,9 +169,7 @@ def __init__(self,
         # threads > 1, contigs are split into chunks of `parallel_block_size`; with
         # threads == 1, each contig is processed as a single chunk.
         self.parallel_block_size: int = parallel_block_size
-        self.cleanup_splits: bool = cleanup_splits
         self.splits_dir: Path | None = splits_dir
-        self.reuse_splits: bool = reuse_splits
         self.gc_model: Path | None = Path(gc_model) if gc_model else None
         # Genome-wide mean GC bias weight, computed once at the runner level when
         # gc_model is loaded. cover_dataset divides per-chunk reads by this rather
@@ -251,8 +245,6 @@ def from_cli(output_dir: Path,
             'overwrite_output': (bool, False, None, None),
             'parallel_block_size': (int, 0, None, None),
             'threads': (int, 1, 1, 1000),
-            'cleanup_splits': (bool, True, None, None),
-            'reuse_splits': (bool, False, None, None),
             'gc_model': (Path, None, 'exists', None)
         }
 
@@ -312,6 +304,8 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2):
     # Map: deprecated key -> short reason shown to the user.
     DEPRECATED_KEYS = {
         "parallel_mode": "splitting strategy is now derived from `threads`",
+        "cleanup_splits": "splits are always written to a temporary directory and cleaned up automatically",
+        "reuse_splits": "removed; splits are regenerated on each run",
     }
 
     def read_yaml(self, config_yaml: Path, args: dict):
@@ -467,23 +461,7 @@ def log_configuration(self):
         else:
             _LOG.info('Single threading - 1 thread.')
             _LOG.info('Splitting input by contig.')
-        if self.reuse_splits:
-            splits_dir = Path(f'{self.output_dir}/splits/')
-            _LOG.info(f'Reusing existing splits {splits_dir}.')
-            if not splits_dir.is_dir():
-                raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}")
-            else:
-                if self.reuse_splits:
-                    raise FileNotFoundError(f'reuse_splits=True')
-                else:
-                    _LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
-            _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
-        elif not self.cleanup_splits:
-            splits_dir = Path(f'{self.output_dir}/splits/')
-            _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
-        else:
-            splits_dir = self.temp_dir_path / "splits"
-
+        splits_dir = self.temp_dir_path / "splits"
         validate_output_path(splits_dir, False)
         self.splits_dir = splits_dir
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "neat-genreads"
-version = "4.5.0"
+version = "4.5.1"
 description = "NGS Simulation toolkit"
 readme = "README.md"
 authors = ["Joshua Allen <jallen17@illinois.edu>", "Keshav Gandhi <krg3@illinois.edu>"]
diff --git a/tests/test_compare_vcfs/test_integration.py b/tests/test_compare_vcfs/test_integration.py
@@ -71,7 +71,6 @@ def _write_config(path: Path, ref_path: Path) -> Path:
         "rng_seed: 42\n"
         "mutation_rate: 0.01\n"
         "overwrite_output: true\n"
-        "cleanup_splits: true\n"
     )
     path.write_text(cfg, encoding="utf-8")
     return path
@@ -250,8 +249,7 @@ def test_compare_vcfs_real_happy_with_chrom_mismatched_bed(tmp_path, happy_bin,
         "rng_seed: 42\n"
         "mutation_rate: 0.01\n"
         f"mutation_bed: {bed}\n"
-        "overwrite_output: true\n"
-        "cleanup_splits: true\n",
+        "overwrite_output: true\n",
         encoding="utf-8",
     )
     read_simulator_runner(str(cfg_path), str(sim_out), "run")
diff --git a/tests/test_read_simulator/test_gc_bias_pipeline.py b/tests/test_read_simulator/test_gc_bias_pipeline.py
@@ -27,7 +27,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
         "coverage": 2,
         "rng_seed": 42,
         "overwrite_output": "true",
-        "cleanup_splits": "true",
     }
     defaults.update(overrides)
     lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())
diff --git a/tests/test_read_simulator/test_models_pipeline.py b/tests/test_read_simulator/test_models_pipeline.py
@@ -26,7 +26,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
         "coverage": 2,
         "rng_seed": 42,
         "overwrite_output": "true",
-        "cleanup_splits": "true",
     }
     defaults.update(overrides)
     lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())
diff --git a/tests/test_read_simulator/test_options.py b/tests/test_read_simulator/test_options.py
@@ -75,8 +75,6 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias):
 
         parallel_block_size: 500000
         threads: 2
-        cleanup_splits: false
-        reuse_splits: false
         """
     ).strip() + "\n"
 
@@ -102,8 +100,8 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias):
     assert opts.vcf is None
 
     assert opts.threads == 2
-    assert opts.splits_dir == outdir / "splits"
     assert opts.splits_dir.is_dir()
+    assert opts.splits_dir.name == "splits"
 
 
 def test_from_cli_paired_end_fragments(tmp_path: _PathAlias):
@@ -125,8 +123,6 @@ def test_from_cli_paired_end_fragments(tmp_path: _PathAlias):
         overwrite_output: true
 
         threads: 1
-        cleanup_splits: true
-        reuse_splits: false
         """
     ).strip() + "\n"
 
@@ -160,8 +156,6 @@ def test_default_values():
     # auto-tune logic. The splitting strategy itself is no longer a user-facing option —
     # it's derived from `threads` at runtime.
     assert opts.parallel_block_size == 0
-    assert opts.cleanup_splits is True
-    assert opts.reuse_splits is False
     assert opts.overwrite_output is False
     assert opts.rescale_qualities is False
     assert opts.min_mutations == 0
@@ -286,6 +280,27 @@ def test_read_yaml_deprecated_parallel_mode_warns(tmp_path: _PathAlias, caplog):
     )
 
 
+def test_deprecated_cleanup_splits_warns(tmp_path: _PathAlias, caplog):
+    """cleanup_splits and reuse_splits must fire a deprecation warning and not crash."""
+    ref = _project_root() / "data" / "H1N1.fa"
+    cfg = _textwrap.dedent(
+        f"""
+        reference: {ref}
+        cleanup_splits: true
+        reuse_splits: false
+        threads: 1
+        """
+    ).strip() + "\n"
+    yml_path = tmp_path / "old_config.yml"
+    yml_path.write_text(cfg, encoding="utf-8")
+    import logging as _logging
+    with caplog.at_level(_logging.WARNING):
+        Options.from_cli(tmp_path, "out", yml_path)
+    keys_warned = {rec.message.split("`")[1] for rec in caplog.records if "deprecated" in rec.message}
+    assert "cleanup_splits" in keys_warned
+    assert "reuse_splits" in keys_warned
+
+
 def test_log_configuration_fragment_mean_less_than_read_len_exits(tmp_path: _PathAlias):
     ref = _project_root() / "data" / "H1N1.fa"
     opts = Options(reference=ref, output_dir=tmp_path, output_prefix="out",
@@ -313,27 +328,3 @@ def test_log_configuration_paired_without_model_or_mean_exits(tmp_path: _PathAli
         opts.log_configuration()
 
 
-def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias):
-    cfg = _textwrap.dedent(
-        f"""
-        reference: {(_project_root() / 'data' / 'H1N1.fa').as_posix()}
-        paired_ended: false
-        produce_fastq: true
-        produce_bam: false
-        produce_vcf: false
-        threads: 4
-        parallel_block_size: 500000
-        cleanup_splits: true
-        reuse_splits: true
-        overwrite_output: true
-        """
-    ).strip() + "\n"
-
-    yml_path = tmp_path / "neat_from_cli_reuse.yml"
-    yml_path.write_text(cfg, encoding="utf-8")
-    
-    outdir = tmp_path / "out"
-    outdir.mkdir(parents=True, exist_ok=True)
-
-    with _pytest.raises(FileNotFoundError, match=r"reuse_splits=True"):
-        Options.from_cli(outdir, "reuse", yml_path)
diff --git a/tests/test_read_simulator/test_runner.py b/tests/test_read_simulator/test_runner.py
@@ -263,7 +263,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
         "coverage": 2,
         "rng_seed": 42,
         "overwrite_output": "true",
-        "cleanup_splits": "true",
     }
     defaults.update(overrides)
     lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:`
`27`	`27`	`"coverage": 2,`
`28`	`28`	`"rng_seed": 42,`
`29`	`29`	`"overwrite_output": "true",`
`30`		`- "cleanup_splits": "true",`
`31`	`30`	`}`
`32`	`31`	`defaults.update(overrides)`
`33`	`32`	`lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:`
`26`	`26`	`"coverage": 2,`
`27`	`27`	`"rng_seed": 42,`
`28`	`28`	`"overwrite_output": "true",`
`29`		`- "cleanup_splits": "true",`
`30`	`29`	`}`
`31`	`30`	`defaults.update(overrides)`
`32`	`31`	`lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())`