diff --git a/ChangeLog.md b/ChangeLog.md index 5cc4f683..bf730060 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,3 +1,18 @@ +# NEAT v4.5.1 + +Removed the `cleanup_splits` and `reuse_splits` config options. + +`reuse_splits` was fully broken (it raised `FileNotFoundError` unconditionally +regardless of whether the splits directory existed). `cleanup_splits` existed +solely to support `reuse_splits`; without it the option has no value — the +splits directory always lives in a `TemporaryDirectory` that is cleaned up +automatically when the run exits. + +Both keys are now in `DEPRECATED_KEYS`: configs that still include them +receive a one-line deprecation warning and continue parsing cleanly, so no +user config breaks. The README examples and all internal test fixtures have +been updated to drop these keys. + # NEAT v4.5.0 New `neat compare-vcfs` subcommand: compares a downstream variant caller's VCF diff --git a/README.md b/README.md index f67d670d..73cb783f 100755 --- a/README.md +++ b/README.md @@ -205,8 +205,6 @@ More parameters are below: | `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. | | `threads` | Number of threads to use. More than 1 will use multi-threading to speed up processing. With `threads > 1`, NEAT splits each contig into chunks; with `threads == 1`, one chunk per contig is used. | | `parallel_block_size` | Per-chunk size in bases when `threads > 1`. Default `0` (auto-tune from total genome length and thread count, targeting ~8 chunks per thread). Set to a positive integer to override. Ignored when `threads == 1`. | -| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. | -| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. | The command line options for NEAT are as follows: @@ -280,7 +278,6 @@ The inputs in single-threaded, contig-based mode most closely replicate the beha The configuration used: - `threads: 1` (NEAT processes one contig per chunk in single-thread mode) -- `cleanup_splits: True` | Organism | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) | |-----------------|-------------------|-------------------|--------------------| @@ -302,7 +299,6 @@ Here we enabled NEAT’s multi-threaded mode, which splits contigs into size-bas - `parallel_block_size: 500000` - `produce_bam: false` - `threads: 7` -- `cleanup_splits: True` | Organism | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) | |-----------------|-------------------|-------------------|--------------------| diff --git a/config_template/simple_template.yml b/config_template/simple_template.yml index 9eab62b4..e7a6ac27 100644 --- a/config_template/simple_template.yml +++ b/config_template/simple_template.yml @@ -29,5 +29,3 @@ overwrite_output: . parallel_block_size: . threads: . -cleanup_splits: . -reuse_splits: . diff --git a/config_template/template_neat_config.yml b/config_template/template_neat_config.yml index 342ccbb2..ac622704 100644 --- a/config_template/template_neat_config.yml +++ b/config_template/template_neat_config.yml @@ -1,4 +1,4 @@ -## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly) +## Template for NEAT's read-simulator (as of version 4.5.1, parallelization-friendly) ## Any parameter that is not required but has a default value will use the ## default value even if the variable is not included in the config. For ## required items, they must be included in the config and they must be given a value. @@ -152,14 +152,3 @@ parallel_block_size: . # Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run # type = int | required: no | default = all available threads: . - -# Delete the 'splits' directory after stitching completes -# Note: If threads == 1, this option has no effect. -# type = bool | required: no | default = true -cleanup_splits: . - -# Reuse existing files in '/splits' and skip the split step. -# The directory must contain NEAT-generated files and must be in the output directory within "splits" -# Note: If threads == 1, this option has no effect. -# type = bool | required: no | default = False -reuse_splits: . diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index 991857e7..193c3c4b 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -82,9 +82,7 @@ def __init__(self, produce_fastq: bool = True, min_mutations: int = 0, parallel_block_size: int = 0, - cleanup_splits: bool = True, splits_dir: Path | None = None, - reuse_splits: bool = False, gc_model: Path | None = None, **kwargs: Any ): @@ -136,9 +134,7 @@ def __init__(self, chunks of this size). Default 0 auto-tunes from total genome length and thread count, targeting ~8 chunks per thread. Specify a positive integer to override. Ignored when threads == 1, where one chunk per contig is used. - :param cleanup_splits: Set to False in order to preserve splits after run - :param reuse_splits: Attempts to reuse existing splits file - """ +""" super().__init__(**kwargs) self.reference: Path = reference self.output_dir: Path = output_dir @@ -173,9 +169,7 @@ def __init__(self, # threads > 1, contigs are split into chunks of `parallel_block_size`; with # threads == 1, each contig is processed as a single chunk. self.parallel_block_size: int = parallel_block_size - self.cleanup_splits: bool = cleanup_splits self.splits_dir: Path | None = splits_dir - self.reuse_splits: bool = reuse_splits self.gc_model: Path | None = Path(gc_model) if gc_model else None # Genome-wide mean GC bias weight, computed once at the runner level when # gc_model is loaded. cover_dataset divides per-chunk reads by this rather @@ -251,8 +245,6 @@ def from_cli(output_dir: Path, 'overwrite_output': (bool, False, None, None), 'parallel_block_size': (int, 0, None, None), 'threads': (int, 1, 1, 1000), - 'cleanup_splits': (bool, True, None, None), - 'reuse_splits': (bool, False, None, None), 'gc_model': (Path, None, 'exists', None) } @@ -312,6 +304,8 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2): # Map: deprecated key -> short reason shown to the user. DEPRECATED_KEYS = { "parallel_mode": "splitting strategy is now derived from `threads`", + "cleanup_splits": "splits are always written to a temporary directory and cleaned up automatically", + "reuse_splits": "removed; splits are regenerated on each run", } def read_yaml(self, config_yaml: Path, args: dict): @@ -467,23 +461,7 @@ def log_configuration(self): else: _LOG.info('Single threading - 1 thread.') _LOG.info('Splitting input by contig.') - if self.reuse_splits: - splits_dir = Path(f'{self.output_dir}/splits/') - _LOG.info(f'Reusing existing splits {splits_dir}.') - if not splits_dir.is_dir(): - raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}") - else: - if self.reuse_splits: - raise FileNotFoundError(f'reuse_splits=True') - else: - _LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits') - _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.') - elif not self.cleanup_splits: - splits_dir = Path(f'{self.output_dir}/splits/') - _LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.') - else: - splits_dir = self.temp_dir_path / "splits" - + splits_dir = self.temp_dir_path / "splits" validate_output_path(splits_dir, False) self.splits_dir = splits_dir diff --git a/pyproject.toml b/pyproject.toml index 6b5c73d7..f4581275 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "neat-genreads" -version = "4.5.0" +version = "4.5.1" description = "NGS Simulation toolkit" readme = "README.md" authors = ["Joshua Allen ", "Keshav Gandhi "] diff --git a/tests/test_compare_vcfs/test_integration.py b/tests/test_compare_vcfs/test_integration.py index cc903a16..8bb0b7b8 100644 --- a/tests/test_compare_vcfs/test_integration.py +++ b/tests/test_compare_vcfs/test_integration.py @@ -71,7 +71,6 @@ def _write_config(path: Path, ref_path: Path) -> Path: "rng_seed: 42\n" "mutation_rate: 0.01\n" "overwrite_output: true\n" - "cleanup_splits: true\n" ) path.write_text(cfg, encoding="utf-8") return path @@ -250,8 +249,7 @@ def test_compare_vcfs_real_happy_with_chrom_mismatched_bed(tmp_path, happy_bin, "rng_seed: 42\n" "mutation_rate: 0.01\n" f"mutation_bed: {bed}\n" - "overwrite_output: true\n" - "cleanup_splits: true\n", + "overwrite_output: true\n", encoding="utf-8", ) read_simulator_runner(str(cfg_path), str(sim_out), "run") diff --git a/tests/test_read_simulator/test_gc_bias_pipeline.py b/tests/test_read_simulator/test_gc_bias_pipeline.py index d0c14ffb..7cd244d3 100644 --- a/tests/test_read_simulator/test_gc_bias_pipeline.py +++ b/tests/test_read_simulator/test_gc_bias_pipeline.py @@ -27,7 +27,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path: "coverage": 2, "rng_seed": 42, "overwrite_output": "true", - "cleanup_splits": "true", } defaults.update(overrides) lines = "\n".join(f"{k}: {v}" for k, v in defaults.items()) diff --git a/tests/test_read_simulator/test_models_pipeline.py b/tests/test_read_simulator/test_models_pipeline.py index 6706b9f9..82b95f97 100644 --- a/tests/test_read_simulator/test_models_pipeline.py +++ b/tests/test_read_simulator/test_models_pipeline.py @@ -26,7 +26,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path: "coverage": 2, "rng_seed": 42, "overwrite_output": "true", - "cleanup_splits": "true", } defaults.update(overrides) lines = "\n".join(f"{k}: {v}" for k, v in defaults.items()) diff --git a/tests/test_read_simulator/test_options.py b/tests/test_read_simulator/test_options.py index 6a82d5d6..86ebf78d 100644 --- a/tests/test_read_simulator/test_options.py +++ b/tests/test_read_simulator/test_options.py @@ -75,8 +75,6 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): parallel_block_size: 500000 threads: 2 - cleanup_splits: false - reuse_splits: false """ ).strip() + "\n" @@ -102,8 +100,8 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias): assert opts.vcf is None assert opts.threads == 2 - assert opts.splits_dir == outdir / "splits" assert opts.splits_dir.is_dir() + assert opts.splits_dir.name == "splits" def test_from_cli_paired_end_fragments(tmp_path: _PathAlias): @@ -125,8 +123,6 @@ def test_from_cli_paired_end_fragments(tmp_path: _PathAlias): overwrite_output: true threads: 1 - cleanup_splits: true - reuse_splits: false """ ).strip() + "\n" @@ -160,8 +156,6 @@ def test_default_values(): # auto-tune logic. The splitting strategy itself is no longer a user-facing option — # it's derived from `threads` at runtime. assert opts.parallel_block_size == 0 - assert opts.cleanup_splits is True - assert opts.reuse_splits is False assert opts.overwrite_output is False assert opts.rescale_qualities is False assert opts.min_mutations == 0 @@ -286,6 +280,27 @@ def test_read_yaml_deprecated_parallel_mode_warns(tmp_path: _PathAlias, caplog): ) +def test_deprecated_cleanup_splits_warns(tmp_path: _PathAlias, caplog): + """cleanup_splits and reuse_splits must fire a deprecation warning and not crash.""" + ref = _project_root() / "data" / "H1N1.fa" + cfg = _textwrap.dedent( + f""" + reference: {ref} + cleanup_splits: true + reuse_splits: false + threads: 1 + """ + ).strip() + "\n" + yml_path = tmp_path / "old_config.yml" + yml_path.write_text(cfg, encoding="utf-8") + import logging as _logging + with caplog.at_level(_logging.WARNING): + Options.from_cli(tmp_path, "out", yml_path) + keys_warned = {rec.message.split("`")[1] for rec in caplog.records if "deprecated" in rec.message} + assert "cleanup_splits" in keys_warned + assert "reuse_splits" in keys_warned + + def test_log_configuration_fragment_mean_less_than_read_len_exits(tmp_path: _PathAlias): ref = _project_root() / "data" / "H1N1.fa" opts = Options(reference=ref, output_dir=tmp_path, output_prefix="out", @@ -313,27 +328,3 @@ def test_log_configuration_paired_without_model_or_mean_exits(tmp_path: _PathAli opts.log_configuration() -def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias): - cfg = _textwrap.dedent( - f""" - reference: {(_project_root() / 'data' / 'H1N1.fa').as_posix()} - paired_ended: false - produce_fastq: true - produce_bam: false - produce_vcf: false - threads: 4 - parallel_block_size: 500000 - cleanup_splits: true - reuse_splits: true - overwrite_output: true - """ - ).strip() + "\n" - - yml_path = tmp_path / "neat_from_cli_reuse.yml" - yml_path.write_text(cfg, encoding="utf-8") - - outdir = tmp_path / "out" - outdir.mkdir(parents=True, exist_ok=True) - - with _pytest.raises(FileNotFoundError, match=r"reuse_splits=True"): - Options.from_cli(outdir, "reuse", yml_path) diff --git a/tests/test_read_simulator/test_runner.py b/tests/test_read_simulator/test_runner.py index d01c71d4..13d51086 100644 --- a/tests/test_read_simulator/test_runner.py +++ b/tests/test_read_simulator/test_runner.py @@ -263,7 +263,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path: "coverage": 2, "rng_seed": 42, "overwrite_output": "true", - "cleanup_splits": "true", } defaults.update(overrides) lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())