Skip to content

Commit a60764c

Browse files
committed
Add additional tests for vcf mapping
1 parent 00a9d08 commit a60764c

3 files changed

Lines changed: 83 additions & 3 deletions

File tree

python/tests/test_vcf.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -975,3 +975,78 @@ def test_reversed_names(self):
975975
)
976976
== expected
977977
)
978+
979+
980+
class TestVcfMapping:
981+
def test_mix_sample_non_sample(self):
982+
ts = tskit.Tree.generate_balanced(5, span=10).tree_sequence
983+
ts = tsutil.insert_branch_sites(ts)
984+
assert ts.num_nodes >= 8
985+
tables = ts.dump_tables()
986+
tables.individuals.add_row()
987+
tables.individuals.add_row()
988+
tables.individuals.add_row()
989+
tables.individuals.add_row()
990+
individual = tables.nodes.individual
991+
assert np.all(individual == -1)
992+
# First has only non-sample nodes
993+
individual[7] = 0
994+
# Second has 2 sample nodes
995+
individual[0] = 1
996+
individual[1] = 1
997+
# Third has 1 non-sample and 1 sample
998+
individual[5] = 2
999+
individual[2] = 2
1000+
# Fourth has sandwiched non-sample
1001+
individual[3] = 3
1002+
individual[6] = 3
1003+
individual[4] = 3
1004+
tables.nodes.individual = individual
1005+
ts = tables.tree_sequence()
1006+
1007+
# Individual "A" is redacted as has no nodes
1008+
s = """\
1009+
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tB\tC\tD
1010+
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t1|1\t0\t0|0
1011+
1\t1\t1\t0\t1\t.\tPASS\t.\tGT\t1|0\t0\t0|0
1012+
1\t2\t2\t0\t1\t.\tPASS\t.\tGT\t0|1\t0\t0|0
1013+
1\t3\t3\t0\t1\t.\tPASS\t.\tGT\t0|0\t1\t1|1
1014+
1\t4\t4\t0\t1\t.\tPASS\t.\tGT\t0|0\t1\t0|0
1015+
1\t6\t5\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t1|1
1016+
1\t7\t6\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t1|0
1017+
1\t8\t7\t0\t1\t.\tPASS\t.\tGT\t0|0\t0\t0|1"""
1018+
expected = textwrap.dedent(s)
1019+
assert (
1020+
drop_header(
1021+
ts.as_vcf(
1022+
individual_names=["A", "B", "C", "D"],
1023+
allow_position_zero=True,
1024+
),
1025+
)
1026+
== expected
1027+
)
1028+
1029+
# Now with non-sample nodes, so A is included, C becomes diploid
1030+
# and D is triploid
1031+
s = """\
1032+
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tA\tB\tC\tD
1033+
1\t0\t0\t0\t1\t.\tPASS\t.\tGT\t0\t1|1\t0|1\t0|0|0
1034+
1\t1\t1\t0\t1\t.\tPASS\t.\tGT\t0\t1|0\t0|0\t0|0|0
1035+
1\t2\t2\t0\t1\t.\tPASS\t.\tGT\t0\t0|1\t0|0\t0|0|0
1036+
1\t3\t3\t0\t1\t.\tPASS\t.\tGT\t1\t0|0\t1|0\t1|1|1
1037+
1\t4\t4\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t1|0\t0|0|0
1038+
1\t6\t5\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t1|1|1
1039+
1\t7\t6\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t1|0|0
1040+
1\t8\t7\t0\t1\t.\tPASS\t.\tGT\t0\t0|0\t0|0\t0|1|0"""
1041+
expected = textwrap.dedent(s)
1042+
assert (
1043+
drop_header(
1044+
ts.as_vcf(
1045+
individual_names=["A", "B", "C", "D"],
1046+
allow_position_zero=True,
1047+
include_non_sample_nodes=True,
1048+
isolated_as_missing=False,
1049+
),
1050+
)
1051+
== expected
1052+
)

python/tskit/trees.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6416,6 +6416,7 @@ def write_vcf(
64166416
sample_mask=None,
64176417
isolated_as_missing=None,
64186418
allow_position_zero=None,
6419+
include_non_sample_nodes=None,
64196420
):
64206421
"""
64216422
Convert the genetic variation data in this tree sequence to Variant
@@ -6559,6 +6560,7 @@ def write_vcf(
65596560
sample_mask=sample_mask,
65606561
isolated_as_missing=isolated_as_missing,
65616562
allow_position_zero=allow_position_zero,
6563+
include_non_sample_nodes=include_non_sample_nodes,
65626564
)
65636565
writer.write(output)
65646566

python/tskit/vcf.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,23 +63,26 @@ def __init__(
6363
sample_mask,
6464
isolated_as_missing,
6565
allow_position_zero,
66+
include_non_sample_nodes,
6667
):
6768
self.tree_sequence = tree_sequence
6869
self.contig_id = contig_id
6970
self.isolated_as_missing = isolated_as_missing
7071

7172
vcf_model = tree_sequence.map_to_vcf_model(
72-
individuals=individuals, ploidy=ploidy, individual_names=individual_names
73+
individuals=individuals,
74+
ploidy=ploidy,
75+
individual_names=individual_names,
76+
include_non_sample_nodes=include_non_sample_nodes,
7377
)
7478
# Remove individuals with zero ploidy as these cannot be
7579
# represented in VCF.
7680
individuals_nodes = vcf_model.individuals_nodes
7781
to_keep = (individuals_nodes != -1).any(axis=1)
7882
individuals_nodes = individuals_nodes[to_keep]
7983
self.individual_names = vcf_model.individuals_name[to_keep]
80-
8184
self.individual_ploidies = [
82-
len(nodes[nodes >= 0]) for nodes in vcf_model.individuals_nodes
85+
len(nodes[nodes >= 0]) for nodes in individuals_nodes
8386
]
8487
self.num_individuals = len(self.individual_names)
8588

0 commit comments

Comments
 (0)