Skip to content

Commit 754a765

Browse files
committed
Use individuals_nodes in vcf code
1 parent f3444d9 commit 754a765

4 files changed

Lines changed: 46 additions & 62 deletions

File tree

python/CHANGELOG.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,15 @@
22
[0.6.4] - 2025-XX-XX
33
--------------------
44

5+
**Changes**
6+
7+
- Allow mixed sample and non-sample nodes to be output by ``write_vcf``.
8+
and for individuals without nodes to be specified for output, although
9+
they will not appear in the VCF. Individuals without nodes no longer
10+
raise an error and are instead ignored, except in the case where they
11+
are explicitly specified in the ``individuals`` argument.
12+
(:user:`benjeffery`, :pr:`3157`)
13+
514
**Features**
615

716
- Add ``TreeSequence.sample_nodes_by_ploidy`` method to return the sample nodes

python/tests/test_vcf.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,8 @@ def test_bad_ploidy(self):
340340
# Non divisible
341341
for bad_ploidy in [3, 7]:
342342
with pytest.raises(
343-
ValueError, match="Sample size must be divisible by ploidy"
343+
ValueError,
344+
match="Number of sample nodes 10 is not a multiple of ploidy",
344345
):
345346
ts.write_vcf(io.StringIO, bad_ploidy)
346347

@@ -358,8 +359,10 @@ def test_individuals_no_nodes_as_argument(self):
358359
tables = ts1.dump_tables()
359360
tables.individuals.add_row()
360361
ts2 = tables.tree_sequence()
361-
with pytest.raises(ValueError, match="0 not associated with a node"):
362-
ts2.as_vcf(individuals=[0])
362+
with pytest.raises(
363+
ValueError, match="No samples were associated with the individuals"
364+
):
365+
ts2.as_vcf(individuals=[0], allow_position_zero=True)
363366

364367
def test_ploidy_with_sample_individuals(self):
365368
ts = msprime.sim_ancestry(3, random_seed=2)
@@ -397,10 +400,13 @@ def test_mixed_sample_non_sample_individuals(self):
397400
tables.nodes.individual = individual
398401
ts = tables.tree_sequence()
399402
ts = tsutil.insert_branch_sites(ts)
403+
ts.as_vcf(allow_position_zero=True, isolated_as_missing=False)
400404
with pytest.raises(
401-
ValueError, match="0 has nodes that are sample and non-sample"
405+
tskit.LibraryError,
406+
match="Cannot generate genotypes for non-samples when isolated "
407+
"nodes are considered as missing",
402408
):
403-
ts.as_vcf()
409+
ts.as_vcf(allow_position_zero=True)
404410
# but it's OK if we run without the affected individual
405411
assert len(ts.as_vcf(individuals=[1, 2], allow_position_zero=True)) > 0
406412

@@ -414,12 +420,7 @@ def test_samples_with_and_without_individuals(self):
414420
tables.nodes.individual = individual
415421
ts = tables.tree_sequence()
416422
ts = tsutil.insert_branch_sites(ts)
417-
with pytest.raises(
418-
ValueError, match="Sample nodes must either all be associated"
419-
):
420-
ts.as_vcf()
421-
# But it's OK if explicitly specify that sample
422-
assert len(ts.as_vcf(individuals=[0], allow_position_zero=True)) > 0
423+
ts.as_vcf(allow_position_zero=True)
423424

424425
def test_bad_individuals(self):
425426
ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)

python/tskit/trees.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6432,14 +6432,9 @@ def write_vcf(
64326432
with sample nodes are included in increasing order of individual ID.
64336433
64346434
Subsets or permutations of the sample individuals may be specified
6435-
using the ``individuals`` argument. It is an error to specify any
6436-
individuals that are not associated with any nodes, or whose
6437-
nodes are not all samples.
6438-
6439-
Mixed-sample individuals (e.g., those associated with one node
6440-
that is a sample and another that is not) in the data model will
6441-
result in an error by default. However, such individuals can be
6442-
excluded using the ``individuals`` argument.
6435+
using the ``individuals`` argument. An individual specified that
6436+
is not associated with any nodes will be ignored and not included in the
6437+
output.
64436438
64446439
If there are no individuals in the tree sequence,
64456440
synthetic individuals are created by combining adjacent samples, and
@@ -10503,6 +10498,8 @@ def sample_nodes_by_ploidy(self, ploidy):
1050310498
:return: A 2D array of node IDs, where each row has length `ploidy`.
1050410499
:rtype: numpy.ndarray
1050510500
"""
10501+
if ploidy < 1:
10502+
raise ValueError("Ploidy must be >= 1")
1050610503
sample_node_ids = np.flatnonzero(self.nodes_flags & tskit.NODE_IS_SAMPLE)
1050710504
num_samples = len(sample_node_ids)
1050810505
if num_samples == 0:

python/tskit/vcf.py

Lines changed: 20 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
"""
2626
import numpy as np
2727

28-
import tskit
2928
from . import provenance
3029

3130

@@ -140,54 +139,32 @@ def __make_sample_mapping(self, ploidy, individuals):
140139
raise ValueError(
141140
"Cannot specify ploidy when individuals are present in tables "
142141
)
143-
144-
if individuals is None:
145-
# Find all sample nodes that reference individuals
146-
individuals = np.unique(ts.nodes_individual[ts.samples()])
147-
if len(individuals) == 1 and individuals[0] == tskit.NULL:
148-
# No samples refer to individuals
149-
individuals = None
150-
else:
151-
# np.unique sorts the argument, so if NULL (-1) is present it
152-
# will be the first value.
153-
if individuals[0] == tskit.NULL:
154-
raise ValueError(
155-
"Sample nodes must either all be associated with individuals "
156-
"or not associated with any individuals"
157-
)
142+
# If there are no individuals, or all the individuals are not associated with
143+
# nodes, then we split by ploidy.
144+
if ts.num_individuals > 0 and np.any(ts.individuals_nodes != -1):
145+
individuals_nodes = ts.individuals_nodes
158146
else:
147+
if ploidy is None:
148+
ploidy = 1
149+
individuals_nodes = ts.sample_nodes_by_ploidy(ploidy)
150+
151+
if individuals is not None:
159152
individuals = np.array(individuals, dtype=np.int32)
160153
if len(individuals) == 0:
161154
raise ValueError("List of sample individuals empty")
155+
if any(individuals < 0) or any(individuals >= ts.num_individuals):
156+
raise ValueError("Invalid individual IDs provided.")
157+
individuals_nodes = ts.individuals_nodes[individuals]
162158

163-
if individuals is not None:
164-
self.samples = []
165-
# FIXME this could probably be done more efficiently.
166-
for i in individuals:
167-
if i < 0 or i >= self.tree_sequence.num_individuals:
168-
raise ValueError("Invalid individual IDs provided.")
169-
ind = self.tree_sequence.individual(i)
170-
if len(ind.nodes) == 0:
171-
raise ValueError(f"Individual {i} not associated with a node")
172-
is_sample = {ts.node(u).is_sample() for u in ind.nodes}
173-
if len(is_sample) != 1:
174-
raise ValueError(
175-
f"Individual {ind.id} has nodes that are sample and "
176-
"non-samples"
177-
)
178-
self.samples.extend(ind.nodes)
179-
self.individual_ploidies.append(len(ind.nodes))
180-
else:
181-
if ploidy is None:
182-
ploidy = 1
183-
if ploidy < 1:
184-
raise ValueError("Ploidy must be >= 1")
185-
if ts.num_samples % ploidy != 0:
186-
raise ValueError("Sample size must be divisible by ploidy")
187-
self.individual_ploidies = np.full(
188-
ts.sample_size // ploidy, ploidy, dtype=np.int32
189-
)
159+
self.samples = []
160+
for ind_nodes in individuals_nodes:
161+
wanted_nodes = ind_nodes[ind_nodes != -1]
162+
if len(wanted_nodes) > 0:
163+
self.samples.extend(wanted_nodes)
164+
self.individual_ploidies.append(len(wanted_nodes))
190165
self.num_individuals = len(self.individual_ploidies)
166+
if self.num_individuals == 0:
167+
raise ValueError("No samples were associated with the individuals")
191168

192169
def __write_header(self, output):
193170
print("##fileformat=VCFv4.2", file=output)

0 commit comments

Comments
 (0)