Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# NEAT v4.5.1

Removed the `cleanup_splits` and `reuse_splits` config options.

`reuse_splits` was fully broken: it raised `FileNotFoundError` unconditionally,
whether or not the splits directory existed. `cleanup_splits` existed solely
to support `reuse_splits`; with `reuse_splits` gone, `cleanup_splits` has no
value, because the splits directory always lives in a `TemporaryDirectory`
that is cleaned up automatically when the run exits.

Both keys are now in `DEPRECATED_KEYS`: configs that still include them
receive a one-line deprecation warning and continue parsing cleanly, so no
user config breaks. The README examples and all internal test fixtures have
been updated to drop these keys.

# NEAT v4.5.0

New `neat compare-vcfs` subcommand: compares a downstream variant caller's VCF
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,8 +205,6 @@ More parameters are below:
| `min_mutations` | Set the minimum number of mutations that NEAT should add, per contig. Default is 0. We recommend setting this to at least one for small chromosomes, so NEAT will produce at least one mutation per contig. |
| `threads` | Number of threads to use. More than 1 will use multi-threading to speed up processing. With `threads > 1`, NEAT splits each contig into chunks; with `threads == 1`, one chunk per contig is used. |
| `parallel_block_size` | Per-chunk size in bases when `threads > 1`. Default `0` (auto-tune from total genome length and thread count, targeting ~8 chunks per thread). Set to a positive integer to override. Ignored when `threads == 1`. |
| `cleanup_splits` | If running more than one simulation on the same input fasta, you can reuse splits files. By default, this will be set to `False`, and splits files will be deleted at the end of the run. |
| `reuse_splits` | If an existing splits file exists in the output folder, it will use those splits, if this value is set to `True`. |

The command line options for NEAT are as follows:

Expand Down Expand Up @@ -280,7 +278,6 @@ The inputs in single-threaded, contig-based mode most closely replicate the beha
The configuration used:

- `threads: 1` (NEAT processes one contig per chunk in single-thread mode)
- `cleanup_splits: True`

| Organism | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) |
|-----------------|-------------------|-------------------|--------------------|
Expand All @@ -302,7 +299,6 @@ Here we enabled NEAT’s multi-threaded mode, which splits contigs into size-bas
- `parallel_block_size: 500000`
- `produce_bam: false`
- `threads: 7`
- `cleanup_splits: True`

| Organism | File size (bytes) | Avg. runtime (ms) | Avg. runtime (min) |
|-----------------|-------------------|-------------------|--------------------|
Expand Down
2 changes: 0 additions & 2 deletions config_template/simple_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,3 @@ overwrite_output: .

parallel_block_size: .
threads: .
cleanup_splits: .
reuse_splits: .
13 changes: 1 addition & 12 deletions config_template/template_neat_config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## Template for NEAT's read-simulator (as of version 4.3.5, parallelization-friendly)
## Template for NEAT's read-simulator (as of version 4.5.1, parallelization-friendly)
## Any parameter that is not required but has a default value will use the
## default value even if the variable is not included in the config. For
## required items, they must be included in the config and they must be given a value.
Expand Down Expand Up @@ -152,14 +152,3 @@ parallel_block_size: .
# Maximum number of concurrent NEAT jobs (threads or hyperthreads) to run
# type = int | required: no | default = all available
threads: .

# Delete the 'splits' directory after stitching completes
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = true
cleanup_splits: .

# Reuse existing files in '<out_dir>/splits' and skip the split step.
# The directory must contain NEAT-generated files and must be in the output directory within "splits"
# Note: If threads == 1, this option has no effect.
# type = bool | required: no | default = False
reuse_splits: .
30 changes: 4 additions & 26 deletions neat/read_simulator/utils/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ def __init__(self,
produce_fastq: bool = True,
min_mutations: int = 0,
parallel_block_size: int = 0,
cleanup_splits: bool = True,
splits_dir: Path | None = None,
reuse_splits: bool = False,
gc_model: Path | None = None,
**kwargs: Any
):
Expand Down Expand Up @@ -136,9 +134,7 @@ def __init__(self,
chunks of this size). Default 0 auto-tunes from total genome length and thread count, targeting
~8 chunks per thread. Specify a positive integer to override. Ignored when threads == 1, where
one chunk per contig is used.
:param cleanup_splits: Set to False in order to preserve splits after run
:param reuse_splits: Attempts to reuse existing splits file
"""
"""
super().__init__(**kwargs)
self.reference: Path = reference
self.output_dir: Path = output_dir
Expand Down Expand Up @@ -173,9 +169,7 @@ def __init__(self,
# threads > 1, contigs are split into chunks of `parallel_block_size`; with
# threads == 1, each contig is processed as a single chunk.
self.parallel_block_size: int = parallel_block_size
self.cleanup_splits: bool = cleanup_splits
self.splits_dir: Path | None = splits_dir
self.reuse_splits: bool = reuse_splits
self.gc_model: Path | None = Path(gc_model) if gc_model else None
# Genome-wide mean GC bias weight, computed once at the runner level when
# gc_model is loaded. cover_dataset divides per-chunk reads by this rather
Expand Down Expand Up @@ -251,8 +245,6 @@ def from_cli(output_dir: Path,
'overwrite_output': (bool, False, None, None),
'parallel_block_size': (int, 0, None, None),
'threads': (int, 1, 1, 1000),
'cleanup_splits': (bool, True, None, None),
'reuse_splits': (bool, False, None, None),
'gc_model': (Path, None, 'exists', None)
}

Expand Down Expand Up @@ -312,6 +304,8 @@ def check_and_log_error(keyname, value_to_check, crit1, crit2):
# Map: deprecated key -> short reason shown to the user.
DEPRECATED_KEYS = {
"parallel_mode": "splitting strategy is now derived from `threads`",
"cleanup_splits": "splits are always written to a temporary directory and cleaned up automatically",
"reuse_splits": "removed; splits are regenerated on each run",
}

def read_yaml(self, config_yaml: Path, args: dict):
Expand Down Expand Up @@ -467,23 +461,7 @@ def log_configuration(self):
else:
_LOG.info('Single threading - 1 thread.')
_LOG.info('Splitting input by contig.')
if self.reuse_splits:
splits_dir = Path(f'{self.output_dir}/splits/')
_LOG.info(f'Reusing existing splits {splits_dir}.')
if not splits_dir.is_dir():
raise FileNotFoundError(f"reuse_splits=True but splits dir not found: {splits_dir}")
else:
if self.reuse_splits:
raise FileNotFoundError(f'reuse_splits=True')
else:
_LOG.warning(f'Reused splits set to True, but splits dir not found: {splits_dir}. Creating new splits')
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
elif not self.cleanup_splits:
splits_dir = Path(f'{self.output_dir}/splits/')
_LOG.info(f'Preserving splits for next run in directory {self.splits_dir}.')
else:
splits_dir = self.temp_dir_path / "splits"

splits_dir = self.temp_dir_path / "splits"
validate_output_path(splits_dir, False)
self.splits_dir = splits_dir

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "neat-genreads"
version = "4.5.0"
version = "4.5.1"
description = "NGS Simulation toolkit"
readme = "README.md"
authors = ["Joshua Allen <jallen17@illinois.edu>", "Keshav Gandhi <krg3@illinois.edu>"]
Expand Down
4 changes: 1 addition & 3 deletions tests/test_compare_vcfs/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ def _write_config(path: Path, ref_path: Path) -> Path:
"rng_seed: 42\n"
"mutation_rate: 0.01\n"
"overwrite_output: true\n"
"cleanup_splits: true\n"
)
path.write_text(cfg, encoding="utf-8")
return path
Expand Down Expand Up @@ -250,8 +249,7 @@ def test_compare_vcfs_real_happy_with_chrom_mismatched_bed(tmp_path, happy_bin,
"rng_seed: 42\n"
"mutation_rate: 0.01\n"
f"mutation_bed: {bed}\n"
"overwrite_output: true\n"
"cleanup_splits: true\n",
"overwrite_output: true\n",
encoding="utf-8",
)
read_simulator_runner(str(cfg_path), str(sim_out), "run")
Expand Down
1 change: 0 additions & 1 deletion tests/test_read_simulator/test_gc_bias_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
"coverage": 2,
"rng_seed": 42,
"overwrite_output": "true",
"cleanup_splits": "true",
}
defaults.update(overrides)
lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())
Expand Down
1 change: 0 additions & 1 deletion tests/test_read_simulator/test_models_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
"coverage": 2,
"rng_seed": 42,
"overwrite_output": "true",
"cleanup_splits": "true",
}
defaults.update(overrides)
lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())
Expand Down
53 changes: 22 additions & 31 deletions tests/test_read_simulator/test_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias):

parallel_block_size: 500000
threads: 2
cleanup_splits: false
reuse_splits: false
"""
).strip() + "\n"

Expand All @@ -102,8 +100,8 @@ def test_from_cli_single_end_with_threads_and_splits(tmp_path: _PathAlias):
assert opts.vcf is None

assert opts.threads == 2
assert opts.splits_dir == outdir / "splits"
assert opts.splits_dir.is_dir()
assert opts.splits_dir.name == "splits"


def test_from_cli_paired_end_fragments(tmp_path: _PathAlias):
Expand All @@ -125,8 +123,6 @@ def test_from_cli_paired_end_fragments(tmp_path: _PathAlias):
overwrite_output: true

threads: 1
cleanup_splits: true
reuse_splits: false
"""
).strip() + "\n"

Expand Down Expand Up @@ -160,8 +156,6 @@ def test_default_values():
# auto-tune logic. The splitting strategy itself is no longer a user-facing option —
# it's derived from `threads` at runtime.
assert opts.parallel_block_size == 0
assert opts.cleanup_splits is True
assert opts.reuse_splits is False
assert opts.overwrite_output is False
assert opts.rescale_qualities is False
assert opts.min_mutations == 0
Expand Down Expand Up @@ -286,6 +280,27 @@ def test_read_yaml_deprecated_parallel_mode_warns(tmp_path: _PathAlias, caplog):
)


def test_deprecated_cleanup_splits_warns(tmp_path: _PathAlias, caplog):
    """cleanup_splits and reuse_splits must fire a deprecation warning and not crash.

    Writes a config that still carries both keys, parses it through
    ``Options.from_cli`` and checks that each key is named in a captured
    warning containing the word "deprecated".
    """
    import logging as _logging

    ref = _project_root() / "data" / "H1N1.fa"
    cfg = _textwrap.dedent(
        f"""
        reference: {ref}
        cleanup_splits: true
        reuse_splits: false
        threads: 1
        """
    ).strip() + "\n"
    yml_path = tmp_path / "old_config.yml"
    yml_path.write_text(cfg, encoding="utf-8")

    with caplog.at_level(_logging.WARNING):
        Options.from_cli(tmp_path, "out", yml_path)

    # The offending key is expected to be the first backtick-quoted token of
    # each deprecation warning. Records that mention "deprecated" without any
    # backticks are skipped, so an unrelated warning cannot abort the test
    # with an IndexError before the assertions below get a chance to run.
    keys_warned = set()
    for rec in caplog.records:
        message = rec.getMessage()
        if "deprecated" not in message:
            continue
        pieces = message.split("`")
        if len(pieces) > 1:
            keys_warned.add(pieces[1])

    assert "cleanup_splits" in keys_warned
    assert "reuse_splits" in keys_warned


def test_log_configuration_fragment_mean_less_than_read_len_exits(tmp_path: _PathAlias):
ref = _project_root() / "data" / "H1N1.fa"
opts = Options(reference=ref, output_dir=tmp_path, output_prefix="out",
Expand Down Expand Up @@ -313,27 +328,3 @@ def test_log_configuration_paired_without_model_or_mean_exits(tmp_path: _PathAli
opts.log_configuration()


def test_from_cli_reuse_splits_missing_dir_raises(tmp_path: _PathAlias):
    """``Options.from_cli`` raises when the config sets ``reuse_splits: true``.

    The output directory is created empty (no ``splits`` subdirectory is
    made), and the call is expected to fail with a ``FileNotFoundError``
    whose message matches ``reuse_splits=True``.
    """
    # Fresh, empty output directory for the run.
    out_dir = tmp_path / "out"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Multi-threaded config that asks to reuse splits.
    raw_config = f"""
        reference: {(_project_root() / 'data' / 'H1N1.fa').as_posix()}
        paired_ended: false
        produce_fastq: true
        produce_bam: false
        produce_vcf: false
        threads: 4
        parallel_block_size: 500000
        cleanup_splits: true
        reuse_splits: true
        overwrite_output: true
        """
    config_text = _textwrap.dedent(raw_config).strip() + "\n"

    config_path = tmp_path / "neat_from_cli_reuse.yml"
    config_path.write_text(config_text, encoding="utf-8")

    with _pytest.raises(FileNotFoundError, match=r"reuse_splits=True"):
        Options.from_cli(out_dir, "reuse", config_path)
1 change: 0 additions & 1 deletion tests/test_read_simulator/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,6 @@ def _write_config(path: Path, ref_path: Path, **overrides) -> Path:
"coverage": 2,
"rng_seed": 42,
"overwrite_output": "true",
"cleanup_splits": "true",
}
defaults.update(overrides)
lines = "\n".join(f"{k}: {v}" for k, v in defaults.items())
Expand Down
Loading