Use individuals_nodes in vcf code

benjeffery · benjeffery · commit 6b8848fa3ae9 · 2025-05-07T15:10:00.000+01:00
diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py
@@ -340,7 +340,8 @@ def test_bad_ploidy(self):
         # Non divisible
         for bad_ploidy in [3, 7]:
             with pytest.raises(
-                ValueError, match="Sample size must be divisible by ploidy"
+                ValueError,
+                match="Number of sample nodes 10 is not a multiple of ploidy",
             ):
                 ts.write_vcf(io.StringIO, bad_ploidy)
 
@@ -390,17 +391,19 @@ def test_duplicate_individuals(self):
     def test_mixed_sample_non_sample_individuals(self):
         ts = msprime.sim_ancestry(3, random_seed=2)
         tables = ts.dump_tables()
-        tables.individuals.add_row()
         # Add a reference to an individual from a non-sample
         individual = tables.nodes.individual
         individual[-1] = 0
         tables.nodes.individual = individual
         ts = tables.tree_sequence()
         ts = tsutil.insert_branch_sites(ts)
+        ts.as_vcf(allow_position_zero=True, isolated_as_missing=False)
         with pytest.raises(
-            ValueError, match="0 has nodes that are sample and non-sample"
+            tskit.LibraryError,
+            match="Cannot generate genotypes for non-samples when isolated "
+            "nodes are considered as missing",
         ):
-            ts.as_vcf()
+            ts.as_vcf(allow_position_zero=True)
         # but it's OK if we run without the affected individual
         assert len(ts.as_vcf(individuals=[1, 2], allow_position_zero=True)) > 0
 
@@ -414,12 +417,7 @@ def test_samples_with_and_without_individuals(self):
         tables.nodes.individual = individual
         ts = tables.tree_sequence()
         ts = tsutil.insert_branch_sites(ts)
-        with pytest.raises(
-            ValueError, match="Sample nodes must either all be associated"
-        ):
-            ts.as_vcf()
-        # But it's OK if explicitly specify that sample
-        assert len(ts.as_vcf(individuals=[0], allow_position_zero=True)) > 0
+        ts.as_vcf(allow_position_zero=True)
 
     def test_bad_individuals(self):
         ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -10503,6 +10503,8 @@ def sample_nodes_by_ploidy(self, ploidy):
         :return: A 2D array of node IDs, where each row has length `ploidy`.
         :rtype: numpy.ndarray
         """
+        if ploidy < 1:
+            raise ValueError("Ploidy must be >= 1")
         sample_node_ids = np.flatnonzero(self.nodes_flags & tskit.NODE_IS_SAMPLE)
         num_samples = len(sample_node_ids)
         if num_samples == 0:
diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py
@@ -25,7 +25,6 @@
 """
 import numpy as np
 
-import tskit
 from . import provenance
 
 
@@ -140,53 +139,32 @@ def __make_sample_mapping(self, ploidy, individuals):
             raise ValueError(
                 "Cannot specify ploidy when individuals are present in tables "
             )
-
-        if individuals is None:
-            # Find all sample nodes that reference individuals
-            individuals = np.unique(ts.nodes_individual[ts.samples()])
-            if len(individuals) == 1 and individuals[0] == tskit.NULL:
-                # No samples refer to individuals
-                individuals = None
-            else:
-                # np.unique sorts the argument, so if NULL (-1) is present it
-                # will be the first value.
-                if individuals[0] == tskit.NULL:
-                    raise ValueError(
-                        "Sample nodes must either all be associated with individuals "
-                        "or not associated with any individuals"
-                    )
+        # If there are no individuals, or none of the individuals are associated with
+        # any nodes, then we split by ploidy.
+        if ts.num_individuals > 0 and np.any(ts.individuals_nodes != -1):
+            individuals_nodes = ts.individuals_nodes
         else:
+            if ploidy is None:
+                ploidy = 1
+            individuals_nodes = ts.sample_nodes_by_ploidy(ploidy)
+
+        if individuals is not None:
             individuals = np.array(individuals, dtype=np.int32)
             if len(individuals) == 0:
                 raise ValueError("List of sample individuals empty")
+            if any(individuals < 0) or any(individuals >= ts.num_individuals):
+                raise ValueError("Invalid individual IDs provided.")
+            individuals_nodes = ts.individuals_nodes[individuals]
 
-        if individuals is not None:
-            self.samples = []
-            # FIXME this could probably be done more efficiently.
-            for i in individuals:
-                if i < 0 or i >= self.tree_sequence.num_individuals:
-                    raise ValueError("Invalid individual IDs provided.")
-                ind = self.tree_sequence.individual(i)
-                if len(ind.nodes) == 0:
-                    raise ValueError(f"Individual {i} not associated with a node")
-                is_sample = {ts.node(u).is_sample() for u in ind.nodes}
-                if len(is_sample) != 1:
-                    raise ValueError(
-                        f"Individual {ind.id} has nodes that are sample and "
-                        "non-samples"
-                    )
-                self.samples.extend(ind.nodes)
-                self.individual_ploidies.append(len(ind.nodes))
-        else:
-            if ploidy is None:
-                ploidy = 1
-            if ploidy < 1:
-                raise ValueError("Ploidy must be >= 1")
-            if ts.num_samples % ploidy != 0:
-                raise ValueError("Sample size must be divisible by ploidy")
-            self.individual_ploidies = np.full(
-                ts.sample_size // ploidy, ploidy, dtype=np.int32
-            )
+        self.samples = []
+        for i, row in enumerate(individuals_nodes):
+            wanted_nodes = row[row != -1]
+            # This error only fires if individuals were explicitly requested.
+            if len(wanted_nodes) == 0 and individuals is not None:
+                raise ValueError(f"Individual {i} not associated with a node")
+            if len(wanted_nodes) > 0:
+                self.samples.extend(wanted_nodes)
+                self.individual_ploidies.append(len(wanted_nodes))
         self.num_individuals = len(self.individual_ploidies)
 
     def __write_header(self, output):