diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index a869bcd09d..aa105a2dab 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -17,12 +17,27 @@ so a set of tree sequence can be added to the right of an existing one. (:user:`hyanwong`, :pr:`3165`, :issue:`3164`) +- Add ``TreeSequence.map_to_vcf_model`` method to return a mapping of + the tree sequence to the VCF model. + (:user:`benjeffery`, :pr:`3163`) **Fixes** - Correct assertion message when tables are compared with metadata ignored. (:user:`benjeffery`, :pr:`3162`, :issue:`3161`) +**Breaking changes** + +- ``TreeSequence.write_vcf`` now filters non-sample nodes from individuals + by default, instead of raising an error. These nodes can be included using the + new ``include_non_sample_nodes`` argument. + By default individual names (sample IDs) in VCF output are now of the form + ``tsk_{individual.id}`` Previously these were always + ``"tsk_{j}" for j in range(num_individuals)``. This may break some downstream + code if individuals are specified. To fix, manually specify ``individual_names`` + to the required pattern. + (:user:`benjeffery`, :pr:`3163`) + -------------------- [0.6.3] - 2025-04-28 -------------------- diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 856987b383..5c8c4256c8 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -5668,3 +5668,245 @@ def test_different_node_flags(self): result = ts.sample_nodes_by_ploidy(2) assert result.shape == (1, 2) assert_array_equal(result, np.array([[0, 2]])) + + +class TestMapToVcfModel: + def test_no_individuals_default_ploidy(self): + ts = tskit.Tree.generate_balanced(4).tree_sequence + assert ts.num_individuals == 0 + + # Default ploidy should be 1 + result = ts.map_to_vcf_model() + assert isinstance(result, tskit.VcfModelMapping) + assert result.individuals_nodes.shape == (4, 1) + for i in range(4): + assert result.individuals_nodes[i, 0] == i + assert result.individuals_name.shape == (4,) + for i in range(4): + assert result.individuals_name[i] == f"tsk_{i}" + + with pytest.raises( + ValueError, + match="Cannot include non-sample nodes when individuals are not present", + ): + ts.map_to_vcf_model(include_non_sample_nodes=True) + + def test_no_individuals_custom_ploidy(self): + ts = tskit.Tree.generate_balanced(6).tree_sequence + assert ts.num_individuals == 0 + + # Use ploidy = 2 + result = ts.map_to_vcf_model(ploidy=2) + assert isinstance(result, tskit.VcfModelMapping) + assert result.individuals_nodes.shape == (3, 2) + for i in range(3): + assert result.individuals_nodes[i, 0] == i * 2 + assert result.individuals_nodes[i, 1] == i * 2 + 1 + assert result.individuals_name.shape == (3,) + for i in range(3): + assert result.individuals_name[i] == f"tsk_{i}" + + def test_no_individuals_uneven_ploidy(self): + ts = tskit.Tree.generate_balanced(5).tree_sequence + # This tree sequence has no individuals + assert ts.num_individuals == 0 + + # 5 samples cannot be evenly divided into ploidy=2 + with pytest.raises(ValueError, match="not a multiple"): + ts.map_to_vcf_model(ploidy=2) + + def test_with_individuals(self): + ts = msprime.sim_ancestry( + 5, + random_seed=42, + ) + result = ts.map_to_vcf_model() + assert isinstance(result, tskit.VcfModelMapping) + assert result.individuals_nodes.shape == (5, 2) + assert np.array_equal( + result.individuals_nodes, + np.array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]), + ) + assert result.individuals_name.shape == (5,) + for i in range(5): + assert result.individuals_name[i] == f"tsk_{i}" + + def test_with_individuals_and_ploidy_error(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + ts = tables.tree_sequence() + + with pytest.raises(ValueError, match="Cannot specify ploidy when individuals"): + ts.map_to_vcf_model(ploidy=2) + + def test_specific_individuals(self): + tables = tskit.TableCollection(1.0) + # Create 5 individuals with varying ploidy + for i in range(5): + tables.individuals.add_row() + # Individuals have ploidy i+1 + for _ in range(i + 1): + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=i) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model(individuals=[1, 3]) + assert isinstance(result, tskit.VcfModelMapping) + # Individual 1 has ploidy 2, individual 3 has ploidy 4 + assert result.individuals_nodes.shape == (2, 5) + assert np.array_equal(result.individuals_nodes[0], [1, 2, -1, -1, -1]) + assert np.array_equal(result.individuals_nodes[1], [6, 7, 8, 9, -1]) + + assert result.individuals_name.shape == (2,) + assert result.individuals_name[0] == "tsk_1" + assert result.individuals_name[1] == "tsk_3" + + def test_individual_with_no_nodes(self): + tables = tskit.TableCollection(1.0) + # Individual with no nodes + tables.individuals.add_row() + # Individual with nodes + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model() + assert result.individuals_nodes.shape == (2, 1) + assert np.array_equal(result.individuals_nodes, [[-1], [0]]) + + def test_individual_with_no_nodes_only(self): + tables = tskit.TableCollection(1.0) + # Individual with no nodes + tables.individuals.add_row() + # Individual with nodes + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model(individuals=[0]) + assert result.individuals_nodes.shape == (1, 1) + assert np.array_equal(result.individuals_nodes, [[-1]]) + + def test_invalid_individual_id(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + ts = tables.tree_sequence() + + with pytest.raises(ValueError, match="Invalid individual ID"): + ts.map_to_vcf_model(individuals=[-1]) + + with pytest.raises(ValueError, match="Invalid individual ID"): + ts.map_to_vcf_model(individuals=[1]) + + def test_mixed_sample_non_sample_ordering(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=0, time=0, individual=0) # Non-sample node + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=0, time=0, individual=0) # Non-sample node + tables.individuals.add_row() + tables.nodes.add_row(flags=0, time=0, individual=1) # Non-sample node + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model() + assert result.individuals_nodes.shape == (2, 4) + assert np.array_equal( + result.individuals_nodes, + np.array([[0, 2, -1, -1], [5, -1, -1, -1]]), + ) + + result = ts.map_to_vcf_model(include_non_sample_nodes=True) + assert result.individuals_nodes.shape == (2, 4) + assert np.array_equal( + result.individuals_nodes, + np.array([[0, 1, 2, 3], [4, 5, -1, -1]]), + ) + + def test_samples_without_individuals_warning(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + # Node with individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + # Node without individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=tskit.NULL) + ts = tables.tree_sequence() + + with warnings.catch_warnings(record=True) as w: + ts.map_to_vcf_model() + assert len(w) == 1 + assert "At least one sample node does not have an individual ID" in str( + w[0].message + ) + + def test_metadata_key_for_names(self): + tables = tskit.TableCollection(1.0) + + # Add individuals with metadata + tables.individuals.metadata_schema = tskit.MetadataSchema( + { + "codec": "json", + "type": "object", + "properties": {"name": {"type": "string"}}, + } + ) + tables.individuals.add_row(metadata={"name": "ind1"}) + tables.individuals.add_row(metadata={"name": "ind2"}) + + # Add nodes + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + result = ts.map_to_vcf_model(name_metadata_key="name") + assert result.individuals_name.shape == (2,) + assert result.individuals_name[0] == "ind1" + assert result.individuals_name[1] == "ind2" + + def test_custom_individual_names(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + custom_names = ["individual_A", "individual_B"] + result = ts.map_to_vcf_model(individual_names=custom_names) + assert result.individuals_name.shape == (2,) + assert result.individuals_name[0] == "individual_A" + assert result.individuals_name[1] == "individual_B" + + def test_name_conflict_error(self): + tables = tskit.TableCollection(1.0) + ts = tables.tree_sequence() + with pytest.raises( + ValueError, + match="Cannot specify both name_metadata_key and individual_names", + ): + ts.map_to_vcf_model( + name_metadata_key="name", individual_names=["custom_name"] + ) + + def test_name_count_mismatch_error(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.individuals.add_row() + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + with pytest.raises( + ValueError, match="number of individuals does not match the number of names" + ): + ts.map_to_vcf_model(individual_names=["only_one_name"]) + + def test_all_individuals_no_nodes(self): + tables = tskit.TableCollection(1.0) + tables.individuals.add_row() + tables.individuals.add_row() + ts = tables.tree_sequence() + result = ts.map_to_vcf_model() + assert result.individuals_nodes.shape == (2, 0) diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index a1ce300702..06ce4d771f 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -29,6 +29,7 @@ import os import tempfile import textwrap +import warnings import msprime import numpy as np @@ -335,12 +336,13 @@ class TestInterface: def test_bad_ploidy(self): ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) for bad_ploidy in [-1, 0]: - with pytest.raises(ValueError, match="Ploidy must be >= 1"): + with pytest.raises(ValueError, match="Ploidy must be a positive integer"): ts.write_vcf(io.StringIO, bad_ploidy) # Non divisible for bad_ploidy in [3, 7]: with pytest.raises( - ValueError, match="Sample size must be divisible by ploidy" + ValueError, + match="Number of sample nodes 10 is not a multiple of ploidy", ): ts.write_vcf(io.StringIO, bad_ploidy) @@ -349,17 +351,29 @@ def test_individuals_no_nodes_default_args(self): tables = ts1.dump_tables() tables.individuals.add_row() ts2 = tables.tree_sequence() - assert ts1.as_vcf(allow_position_zero=True) == ts2.as_vcf( - allow_position_zero=True - ) + # ts1 should work as it has no individuals + ts1.as_vcf(allow_position_zero=True) + # ts2 should fail as it has individuals but no nodes + with warnings.catch_warnings(record=True) as w: + with pytest.raises(ValueError, match="No samples in resulting VCF model"): + ts2.as_vcf(allow_position_zero=True) + assert len(w) == 1 + assert "At least one sample node does not have an individual ID" in str( + w[0].message + ) def test_individuals_no_nodes_as_argument(self): ts1 = msprime.simulate(10, mutation_rate=0.1, random_seed=2) tables = ts1.dump_tables() tables.individuals.add_row() ts2 = tables.tree_sequence() - with pytest.raises(ValueError, match="0 not associated with a node"): - ts2.as_vcf(individuals=[0]) + with warnings.catch_warnings(record=True) as w: + with pytest.raises(ValueError, match="No samples in resulting VCF model"): + ts2.as_vcf(individuals=[0]) + assert len(w) == 1 + assert "At least one sample node does not have an individual ID" in str( + w[0].message + ) def test_ploidy_with_sample_individuals(self): ts = msprime.sim_ancestry(3, random_seed=2) @@ -378,7 +392,7 @@ def test_ploidy_with_no_node_individuals(self): def test_empty_individuals(self): ts = msprime.sim_ancestry(3, random_seed=2) ts = tsutil.insert_branch_sites(ts) - with pytest.raises(ValueError, match="List of sample individuals empty"): + with pytest.raises(ValueError, match="No individuals specified"): ts.as_vcf(individuals=[]) def test_duplicate_individuals(self): @@ -387,23 +401,6 @@ def test_duplicate_individuals(self): with pytest.raises(tskit.LibraryError, match="TSK_ERR_DUPLICATE_SAMPLE"): ts.as_vcf(individuals=[0, 0], allow_position_zero=True) - def test_mixed_sample_non_sample_individuals(self): - ts = msprime.sim_ancestry(3, random_seed=2) - tables = ts.dump_tables() - tables.individuals.add_row() - # Add a reference to an individual from a non-sample - individual = tables.nodes.individual - individual[-1] = 0 - tables.nodes.individual = individual - ts = tables.tree_sequence() - ts = tsutil.insert_branch_sites(ts) - with pytest.raises( - ValueError, match="0 has nodes that are sample and non-sample" - ): - ts.as_vcf() - # but it's OK if we run without the affected individual - assert len(ts.as_vcf(individuals=[1, 2], allow_position_zero=True)) > 0 - def test_samples_with_and_without_individuals(self): ts = tskit.Tree.generate_balanced(3).tree_sequence tables = ts.dump_tables() @@ -414,19 +411,19 @@ def test_samples_with_and_without_individuals(self): tables.nodes.individual = individual ts = tables.tree_sequence() ts = tsutil.insert_branch_sites(ts) - with pytest.raises( - ValueError, match="Sample nodes must either all be associated" - ): - ts.as_vcf() - # But it's OK if explicitly specify that sample - assert len(ts.as_vcf(individuals=[0], allow_position_zero=True)) > 0 + with warnings.catch_warnings(record=True) as w: + ts.as_vcf(allow_position_zero=True) + assert len(w) == 1 + assert "At least one sample node does not have an individual ID" in str( + w[0].message + ) def test_bad_individuals(self): ts = msprime.simulate(10, mutation_rate=0.1, random_seed=2) ts = tsutil.insert_individuals(ts, ploidy=2) - with pytest.raises(ValueError, match="Invalid individual IDs provided."): + with pytest.raises(ValueError, match="Invalid individual ID"): ts.write_vcf(io.StringIO(), individuals=[0, -1]) - with pytest.raises(ValueError, match="Invalid individual IDs provided."): + with pytest.raises(ValueError, match="Invalid individual ID"): ts.write_vcf(io.StringIO(), individuals=[1, 2, ts.num_individuals]) def test_ploidy_positional(self): @@ -527,20 +524,17 @@ def test_bad_length_individuals(self): ts = tsutil.insert_individuals(ts, ploidy=2) with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf(io.StringIO(), individual_names=[]) with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf(io.StringIO(), individual_names=["x" for _ in range(4)]) with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf( io.StringIO(), @@ -549,8 +543,7 @@ def test_bad_length_individuals(self): ) with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf( io.StringIO(), @@ -563,14 +556,12 @@ def test_bad_length_ploidy(self): assert ts.num_sites > 0 with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf(io.StringIO(), ploidy=2, individual_names=[]) with pytest.raises( ValueError, - match="individual_names must have length equal to" - " the number of individuals", + match="The number of individuals does not match the number of names", ): ts.write_vcf( io.StringIO(), ploidy=2, individual_names=["x" for _ in range(4)] @@ -907,7 +898,7 @@ def test_no_individuals_ploidy_3_names(self): expected = textwrap.dedent(s) assert ( drop_header( - ts.as_vcf(ploidy=3, individual_names="A", allow_position_zero=True) + ts.as_vcf(ploidy=3, individual_names=["A"], allow_position_zero=True) ) == expected ) @@ -940,7 +931,7 @@ def test_individual_0(self): def test_individual_1(self): ts = self.ts() s = """\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0 + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_1 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1 @@ -954,7 +945,7 @@ def test_individual_1(self): def test_reversed(self): ts = self.ts() s = """\ - #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_0\ttsk_1 + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\ttsk_1\ttsk_0 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|0 1\t2\t1\t0\t1\t.\tPASS\t.\tGT\t1\t0|1 1\t4\t2\t0\t1\t.\tPASS\t.\tGT\t1\t0|0 @@ -984,3 +975,78 @@ def test_reversed_names(self): ) == expected ) + + +class TestVcfMapping: + def test_mix_sample_non_sample(self): + ts = tskit.Tree.generate_balanced(5, span=10).tree_sequence + ts = tsutil.insert_branch_sites(ts) + assert ts.num_nodes >= 8 + tables = ts.dump_tables() + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + individual = tables.nodes.individual + assert np.all(individual == -1) + # First has only non-sample nodes + individual[7] = 0 + # Second has 2 sample nodes + individual[0] = 1 + individual[1] = 1 + # Third has 1 non-sample and 1 sample + individual[5] = 2 + individual[2] = 2 + # Fourth has sandwiched non-sample + individual[3] = 3 + individual[6] = 3 + individual[4] = 3 + tables.nodes.individual = individual + ts = tables.tree_sequence() + + # Individual "A" is redacted as has no nodes + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tB\tC\tD + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|1\t0\t0|0 + 1\t1\t1\t0\t1\t.\tPASS\t.\tGT\t1|0\t0\t0|0 + 1\t2\t2\t0\t1\t.\tPASS\t.\tGT\t0|1\t0\t0|0 + 1\t3\t3\t0\t1\t.\tPASS\t.\tGT\t0|0\t1\t1|1 + 1\t4\t4\t0\t1\t.\tPASS\t.\tGT\t0|0\t1\t0|0 + 1\t6\t5\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t1|1 + 1\t7\t6\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t1|0 + 1\t8\t7\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t0|1""" + expected = textwrap.dedent(s) + assert ( + drop_header( + ts.as_vcf( + individual_names=["A", "B", "C", "D"], + allow_position_zero=True, + ), + ) + == expected + ) + + # Now with non-sample nodes, so A is included, C becomes diploid + # and D is triploid + s = """\ + #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tA\tB\tC\tD + 1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|1\t0|1\t0|0|0 + 1\t1\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1|0\t0|0\t0|0|0 + 1\t2\t2\t0\t1\t.\tPASS\t.\tGT\t0\t0|1\t0|0\t0|0|0 + 1\t3\t3\t0\t1\t.\tPASS\t.\tGT\t1\t0|0\t1|0\t1|1|1 + 1\t4\t4\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t1|0\t0|0|0 + 1\t6\t5\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t1|1|1 + 1\t7\t6\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t1|0|0 + 1\t8\t7\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t0|1|0""" + expected = textwrap.dedent(s) + assert ( + drop_header( + ts.as_vcf( + individual_names=["A", "B", "C", "D"], + allow_position_zero=True, + include_non_sample_nodes=True, + isolated_as_missing=False, + ), + ) + == expected + ) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 2c8b2eb262..37c41a767e 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -59,6 +59,12 @@ LEGACY_MS_LABELS = "legacy_ms" +@dataclass +class VcfModelMapping: + individuals_nodes: np.ndarray + individuals_name: np.ndarray + + class CoalescenceRecord(NamedTuple): left: float right: float @@ -6412,6 +6418,7 @@ def write_vcf( sample_mask=None, isolated_as_missing=None, allow_position_zero=None, + include_non_sample_nodes=None, ): """ Convert the genetic variation data in this tree sequence to Variant @@ -6427,21 +6434,21 @@ def write_vcf( :ref:`sec_export_vcf_constructing_gt` section for more details and examples. - If individuals that are associated with sample nodes are defined in the + If individuals are defined in the data model (see :ref:`sec_individual_table_definition`), the genotypes - for each of the individual's samples are combined into a phased - multiploid values at each site. By default, all individuals associated - with sample nodes are included in increasing order of individual ID. + for each of the individual's nodes are combined into a phased + multiploid values at each site. By default, all individuals are + included with their sample nodes, individuals with no nodes are + omitted. The ``include_non_sample_nodes`` argument can be used to + included non-sample nodes in the output VCF. Subsets or permutations of the sample individuals may be specified - using the ``individuals`` argument. It is an error to specify any - individuals that are not associated with any nodes, or whose - nodes are not all samples. + using the ``individuals`` argument. Mixed-sample individuals (e.g., those associated with one node that is a sample and another that is not) in the data model will - result in an error by default. However, such individuals can be - excluded using the ``individuals`` argument. + only have the sample nodes output by default. However, non-sample + nodes can be included using the ``include_non_sample_nodes`` argument. If there are no individuals in the tree sequence, synthetic individuals are created by combining adjacent samples, and @@ -6542,6 +6549,8 @@ def write_vcf( output to the VCF, otherwise if one is present an error will be raised. The VCF spec does not allow for sites at position 0. However, in practise many tools will be fine with this. Default: False. + :param bool include_non_sample_nodes: If True, include non-sample nodes + in the output VCF. By default, only sample nodes are included. """ if allow_position_zero is None: allow_position_zero = False @@ -6556,6 +6565,7 @@ def write_vcf( sample_mask=sample_mask, isolated_as_missing=isolated_as_missing, allow_position_zero=allow_position_zero, + include_non_sample_nodes=include_non_sample_nodes, ) writer.write(output) @@ -10607,6 +10617,8 @@ def sample_nodes_by_ploidy(self, ploidy): :return: A 2D array of node IDs, where each row has length `ploidy`. :rtype: numpy.ndarray """ + if ploidy <= 0 or ploidy != int(ploidy): + raise ValueError("Ploidy must be a positive integer") sample_node_ids = np.flatnonzero(self.nodes_flags & tskit.NODE_IS_SAMPLE) num_samples = len(sample_node_ids) if num_samples == 0: @@ -10620,6 +10632,131 @@ def sample_nodes_by_ploidy(self, ploidy): sample_node_ids = sample_node_ids.reshape((num_samples_per_individual, ploidy)) return sample_node_ids + def map_to_vcf_model( + self, + individuals=None, + ploidy=None, + name_metadata_key=None, + individual_names=None, + include_non_sample_nodes=None, + ): + """ + Maps the sample nodes in this tree sequence to a representation suitable for + VCF output, using the individuals if present. + + Creates a VcfModelMapping object that contains both the nodes-to-individual + mapping as a 2D array of (individuals, nodes) and the individual names. The + mapping is created by first checking if the tree sequence contains individuals. + If it does, the mapping is created using the individuals in the tree sequence. + By default only the sample nodes of the individuals are included in the mapping, + unless `include_non_sample_nodes` is set to True, in which case all nodes + belonging to the individuals are included. Any individuals without any nodes + will have no nodes in their row of the mapping, being essentially of zero ploidy. + If no individuals are present, the mapping is created using only the sample nodes + and the specified ploidy. + + If neither `name_metadata_key` nor `individual_names` is not specified, the + individual names are set to "tsk_{individual_id}" for each individual. If + no individuals are present, the individual names are set to "tsk_{i}" with + `0 <= i < num_sample_nodes/ploidy`. + + A Warning are emmitted if any sample nodes do not have an individual ID. + + :param list individuals: Specific individual IDs to include in the VCF. If not + specified and the tree sequence contains individuals, all individuals are + included at least one node. + :param int ploidy: The ploidy, or number of nodes per individual. Only used when + the tree sequence does not contain individuals. Cannot be used if the tree + sequence contains individuals. Defaults to 1 if not specified. + :param str name_metadata_key: The key in the individual metadata to use + for individual names. Cannot be specified simultaneously with + individual_names. + :param list individual_names: The names to use for each individual. Cannot + be specified simultaneously with name_metadata_key. + :param bool include_non_sample_nodes: If True, include all nodes belonging to + the individuals in the mapping. If False, only include sample nodes. + Deafults to False. + :return: A VcfModelMapping containing the node-to-individual mapping and + individual names. + :raises ValueError: If both name_metadata_key and individual_names are specified, + if ploidy is specified when individuals are present, if an invalid individual + ID is specified, if a specified individual has no nodes, or if the number of + individuals doesn't match the number of names. + """ + if include_non_sample_nodes is None: + include_non_sample_nodes = False + + if name_metadata_key is not None and individual_names is not None: + raise ValueError( + "Cannot specify both name_metadata_key and individual_names" + ) + + if self.num_individuals > 0 and ploidy is not None: + raise ValueError( + "Cannot specify ploidy when individuals are present in the tree sequence" + ) + + if self.num_individuals == 0 and include_non_sample_nodes: + raise ValueError( + "Cannot include non-sample nodes when individuals are not present in " + "the tree sequence" + ) + + if self.num_individuals > 0 and np.any( + np.logical_and( + self.nodes_individual == tskit.NULL, + self.nodes_flags & tskit.NODE_IS_SAMPLE, + ) + ): + warnings.warn( + "At least one sample node does not have an individual ID.", stacklevel=1 + ) + + if self.num_individuals == 0 and individuals is None: + if ploidy is None: + ploidy = 1 + individuals_nodes = self.sample_nodes_by_ploidy(ploidy) + if individual_names is None: + individual_names = [f"tsk_{i}" for i in range(len(individuals_nodes))] + else: + if individuals is None: + individuals = np.arange(self.num_individuals, dtype=np.int32) + if len(individuals) == 0: + raise ValueError("No individuals specified") + if min(individuals) < 0 or max(individuals) >= self.num_individuals: + raise ValueError("Invalid individual ID") + + individuals_nodes = self.individuals_nodes[individuals] + non_sample_nodes = np.logical_not( + self.nodes_flags[individuals_nodes] & tskit.NODE_IS_SAMPLE + ) + if np.any(non_sample_nodes) and not include_non_sample_nodes: + individuals_nodes[non_sample_nodes] = -1 + rows_to_reorder = np.any(non_sample_nodes, axis=1) + for i in np.where(rows_to_reorder)[0]: + row = individuals_nodes[i] + individuals_nodes[i] = np.concatenate( + [row[row != -1], row[row == -1]] + ) + + if individual_names is None: + if name_metadata_key is not None: + individual_names = [ + self.individual(i).metadata[name_metadata_key] + for i in individuals + ] + else: + individual_names = [f"tsk_{i}" for i in individuals] + + individual_names = np.array(individual_names, dtype=object) + + if len(individuals_nodes) != len(individual_names): + raise ValueError( + "The number of individuals does not match the number of names" + ) + + return VcfModelMapping(individuals_nodes, individual_names) + ############################################ # # Deprecated APIs. These are either already unsupported, or will be unsupported in a diff --git a/python/tskit/vcf.py b/python/tskit/vcf.py index dcbea01cb6..502d397be0 100644 --- a/python/tskit/vcf.py +++ b/python/tskit/vcf.py @@ -25,7 +25,6 @@ """ import numpy as np -import tskit from . import provenance @@ -64,19 +63,38 @@ def __init__( sample_mask, isolated_as_missing, allow_position_zero, + include_non_sample_nodes, ): self.tree_sequence = tree_sequence self.contig_id = contig_id self.isolated_as_missing = isolated_as_missing - self.__make_sample_mapping(ploidy, individuals) - if individual_names is None: - individual_names = [f"tsk_{j}" for j in range(self.num_individuals)] - self.individual_names = individual_names - if len(self.individual_names) != self.num_individuals: - raise ValueError( - "individual_names must have length equal to the number of individuals" - ) + vcf_model = tree_sequence.map_to_vcf_model( + individuals=individuals, + ploidy=ploidy, + individual_names=individual_names, + include_non_sample_nodes=include_non_sample_nodes, + ) + # Remove individuals with zero ploidy as these cannot be + # represented in VCF. + individuals_nodes = vcf_model.individuals_nodes + to_keep = (individuals_nodes != -1).any(axis=1) + individuals_nodes = individuals_nodes[to_keep] + self.individual_names = vcf_model.individuals_name[to_keep] + self.individual_ploidies = [ + len(nodes[nodes >= 0]) for nodes in individuals_nodes + ] + self.num_individuals = len(self.individual_names) + + if len(individuals_nodes) == 0: + raise ValueError("No samples in resulting VCF model") + + # Flatten the array of node IDs, filtering out the -1 padding values + self.samples = [] + for row in individuals_nodes: + for node_id in row: + if node_id != -1: + self.samples.append(node_id) # Transform coordinates for VCF if position_transform is None: @@ -126,69 +144,6 @@ def __init__( '"position_transform = lambda x: np.fmax(1, x)"' ) - def __make_sample_mapping(self, ploidy, individuals): - """ - Compute the sample IDs for each VCF individual and the template for - writing out genotypes. - """ - ts = self.tree_sequence - self.samples = None - self.individual_ploidies = [] - - # Cannot use "ploidy" when *any* individuals are present. - if ts.num_individuals > 0 and ploidy is not None: - raise ValueError( - "Cannot specify ploidy when individuals are present in tables " - ) - - if individuals is None: - # Find all sample nodes that reference individuals - individuals = np.unique(ts.nodes_individual[ts.samples()]) - if len(individuals) == 1 and individuals[0] == tskit.NULL: - # No samples refer to individuals - individuals = None - else: - # np.unique sorts the argument, so if NULL (-1) is present it - # will be the first value. - if individuals[0] == tskit.NULL: - raise ValueError( - "Sample nodes must either all be associated with individuals " - "or not associated with any individuals" - ) - else: - individuals = np.array(individuals, dtype=np.int32) - if len(individuals) == 0: - raise ValueError("List of sample individuals empty") - - if individuals is not None: - self.samples = [] - # FIXME this could probably be done more efficiently. - for i in individuals: - if i < 0 or i >= self.tree_sequence.num_individuals: - raise ValueError("Invalid individual IDs provided.") - ind = self.tree_sequence.individual(i) - if len(ind.nodes) == 0: - raise ValueError(f"Individual {i} not associated with a node") - is_sample = {ts.node(u).is_sample() for u in ind.nodes} - if len(is_sample) != 1: - raise ValueError( - f"Individual {ind.id} has nodes that are sample and " - "non-samples" - ) - self.samples.extend(ind.nodes) - self.individual_ploidies.append(len(ind.nodes)) - else: - if ploidy is None: - ploidy = 1 - if ploidy < 1: - raise ValueError("Ploidy must be >= 1") - if ts.num_samples % ploidy != 0: - raise ValueError("Sample size must be divisible by ploidy") - self.individual_ploidies = np.full( - ts.sample_size // ploidy, ploidy, dtype=np.int32 - ) - self.num_individuals = len(self.individual_ploidies) - def __write_header(self, output): print("##fileformat=VCFv4.2", file=output) print(f"##source=tskit {provenance.__version__}", file=output)