Skip to content

Commit e9a6df7

Browse files
committed
Fill in some of the phylogenetic todos
1 parent 969bf16 commit e9a6df7

1 file changed

Lines changed: 119 additions & 18 deletions

File tree

phylogen.md

Lines changed: 119 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,105 @@ tips, as in the example below:
2525
```{code-cell}
2626
:"tags": ["hide-input"]
2727
import tskit
28+
import numpy as np
29+
30+
def collapse_tree_for_plot(tree, max_tips=20, style=None):
31+
"""
32+
Return a condensed version of a single tree by collapsing large clades until
33+
at most `max_tips` labelled tips remain.
34+
35+
The returned tuple is:
36+
(plot_tree, node_labels, style)
37+
38+
where `plot_tree` is a tree suitable for plotting with `draw_svg`, `node_labels`
39+
labels the retained sample tips plus the collapsed clades, and `style` highlights
40+
the collapsed clades.
41+
42+
This helper is intended for plotting a single large tree. It requires the tree to
43+
come from a tree sequence containing exactly one tree.
44+
"""
45+
ts = tree.tree_sequence
46+
if ts.num_trees != 1:
47+
raise ValueError("collapse_tree_for_plot currently expects a single-tree tree sequence")
48+
if tree.num_roots != 1:
49+
raise ValueError("collapse_tree_for_plot currently expects a single-root tree")
50+
if max_tips < 2:
51+
raise ValueError("max_tips must be at least 2")
52+
53+
postorder = list(tree.nodes(order="postorder"))
54+
counts = np.zeros(ts.num_nodes, dtype=int)
55+
for u in postorder:
56+
if tree.is_sample(u):
57+
counts[u] = 1
58+
else:
59+
counts[u] = sum(counts[v] for v in tree.children(u))
60+
61+
displayed_tips = tree.num_samples()
62+
candidate_nodes = [
63+
u for u in postorder
64+
if counts[u] > 1 and tree.parent(u) != tskit.NULL
65+
]
66+
collapsed_nodes = set()
67+
forbidden = set()
68+
69+
while displayed_tips > max_tips:
70+
needed_reduction = displayed_tips - max_tips
71+
candidates = [u for u in candidate_nodes if u not in collapsed_nodes and u not in forbidden]
72+
if len(candidates) == 0:
73+
break
74+
75+
# Prefer a clade that gets us close to the target without overshooting.
76+
acceptable = [u for u in candidates if counts[u] - 1 <= needed_reduction]
77+
if len(acceptable) > 0:
78+
u = max(acceptable, key=lambda x: counts[x])
79+
else:
80+
u = min(candidates, key=lambda x: counts[x])
81+
82+
collapsed_nodes.add(u)
83+
displayed_tips -= counts[u] - 1
84+
85+
# Do not collapse descendants or ancestors of an already-collapsed node.
86+
stack = list(tree.children(u))
87+
while len(stack) > 0:
88+
v = stack.pop()
89+
forbidden.add(v)
90+
stack.extend(tree.children(v))
91+
v = tree.parent(u)
92+
while v != tskit.NULL:
93+
forbidden.add(v)
94+
v = tree.parent(v)
95+
96+
keep_nodes = []
97+
stack = list(tree.roots)
98+
while len(stack) > 0:
99+
u = stack.pop()
100+
if u in collapsed_nodes:
101+
keep_nodes.append(u)
102+
elif tree.is_sample(u):
103+
keep_nodes.append(u)
104+
else:
105+
stack.extend(tree.children(u))
106+
107+
plot_ts = ts.simplify(sorted(keep_nodes), filter_nodes=False)
108+
plot_tree = plot_ts.first()
109+
110+
node_labels = {}
111+
style = [style or ""]
112+
for u in plot_tree.nodes():
113+
if u in collapsed_nodes:
114+
node_labels[u] = f"{counts[u]:,} tips"
115+
scale = min(8.0, 1.0 + counts[u] / max_tips)
116+
style.append(
117+
f".n{u} > .sym {{clip-path: polygon(50% 0%, 100% 100%, 0% 100%); "
118+
f"transform: scale({scale}, 3.5)}}"
119+
)
120+
style.append(
121+
f".n{u} > .lab {{transform: translateY(20px); font-size: 12px}}"
122+
)
123+
elif plot_tree.is_sample(u):
124+
node_labels[u] = str(u)
125+
126+
return plot_tree, node_labels, "".join(style)
28127
```
29128

30129
```{code-cell}
@@ -36,11 +135,19 @@ print("Tree sequence takes up", big_tree.tree_sequence.nbytes / 1024**2, "Mb")
36135
print(f"Generating a 'comb' (pectinate) tree of {num_tips} tips took:")
37136
```
38137

39-
:::{todo}
40-
Display the million tip tree in condensed form, when
41-
https://github.com/tskit-dev/tskit/issues/2372#issuecomment-1298518380 and
42-
https://github.com/tskit-dev/tskit/issues/2628 are solved
43-
:::
138+
Clearly, plotting a tree with a million tips is problematic, but we can draw a summary
139+
(see the {ref}`visualization tutorial<sec_tskit_viz_SVG_examples_larger_plots>` for details):
140+
141+
142+
```{code-cell}
143+
plot_tree, labels, css = collapse_tree_for_plot( # function from the top of this notebook
144+
big_tree, max_tips=20, style=".y-axis .tick .lab {font-size: 8pt}"
145+
)
146+
plot_tree.draw_svg(
147+
size=(1000, 400), node_labels=labels, style=css, time_scale="rank", y_axis=True
148+
)
149+
```
150+
44151

45152
Calculations on these huge trees can be very efficient:
46153

@@ -322,11 +429,11 @@ print(
322429
tree.branch_length(node_id),
323430
)
324431
```
325-
:::{todo}
326-
The branch distance between two samples is also easy to calculate
327432

328-
NB: Turn the following in to a code cell
329-
```
433+
The branch distance between two samples is also easy to calculate using
434+
{meth}`~Tree.distance_between`:
435+
436+
```{code-cell}
330437
target_node_1 = 5
331438
target_node_2 = 7
332439
print(
@@ -335,9 +442,7 @@ print(
335442
"and",
336443
target_node_2,
337444
"is",
338-
# See https://github.com/tskit-dev/tskit/issues/2627 - what should be call this
339-
# so as not to get mixed up with tree.path_length which counts the number of edges
340-
# tree.branch_distance(target_node_1, target_node_2),
445+
tree.distance_between(target_node_1, target_node_2), # beware: tree.path_length counts number of edges
341446
)
342447
```
343448

@@ -349,18 +454,15 @@ calculation is to use {meth}`TreeSequence.divergence`, part of the the standard
349454
multiple sets of samples in {ref}`sec_phylogen_multiple_trees` to be calculated
350455
efficiently:
351456

352-
NB: Turn the following in to a code cell
353-
```
457+
```{code-cell}
354458
target_node_1 = 5
355459
target_node_2 = 7
356460
print(
357-
"Branch distance using built-in stats framework:"
461+
"Branch distance using built-in stats framework:",
358462
tree.tree_sequence.divergence(([5], [7]), mode="branch", windows="trees")
359463
)
360464
```
361465

362-
:::
363-
364466

365467
(sec_phylogen_multiroot)=
366468
### Roots and multiroot trees
@@ -503,4 +605,3 @@ The rest of the {program}`tskit` tutorials will lead you through the concepts in
503605
storing and analysing sequences of many correlated trees. For a simple introduction, you
504606
might want to start with {ref}`sec_what_is`.
505607

506-

0 commit comments

Comments
 (0)