Skip to content

Random split polytomy #815

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@

**Features**

- Add ``split_polytomies`` method to the Tree class
(:user:`hyanwong`, :user:`jeromekelleher`, :issue:`809`, :pr:`815`)

- Tree accessor functions (e.g. ``ts.first()``, ``ts.at()`` pass extra parameters such as
``sample_indexes`` to the underlying ``Tree`` constructor; also ``root_threshold`` can
be specified when calling ``ts.trees()`` (:user:`hyanwong`, :issue:`847`, :pr:`848`)
Expand Down
30 changes: 30 additions & 0 deletions python/tests/test_combinatorics.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import collections
import io
import itertools
import random

import msprime
import numpy as np
Expand Down Expand Up @@ -976,3 +977,32 @@ def test_forward_time_wright_fisher_simplified(self):
ts = tables.tree_sequence()
samples = ts.samples()
self.verify_topologies(ts, sample_sets=[samples[:10], samples[10:]])


class TestTreeNode:
"""
Tests for the TreeNode class used to build simple trees in memory.
"""

@pytest.mark.parametrize("n", [2, 3, 5, 10])
def test_random_binary_tree(self, n):
# Note this doesn't check any statistical properties of the returned
# trees, just that a single instance returned in a valid binary tree.
rng = random.Random(32)
labels = range(n)
root = comb.TreeNode.random_binary_tree(labels, rng)

stack = [root]
num_nodes = 0
recovered_labels = []
while len(stack) > 0:
node = stack.pop()
num_nodes += 1
if node.label is not None:
assert len(node.children) == 0
recovered_labels.append(node.label)
for child in node.children:
assert child.parent == node
stack.append(child)
assert len(recovered_labels) == n
assert sorted(recovered_labels) == list(labels)
219 changes: 219 additions & 0 deletions python/tests/test_topology.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"""
Test cases for the supported topological variations and operations.
"""
import collections
import functools
import io
import itertools
Expand Down Expand Up @@ -7923,3 +7924,221 @@ def test_star_branch_length(self):

assert ts.node(ts.first().root).time == branch_length
assert ts.kc_distance(topological_equiv_ts) == 0


def num_leaf_labelled_binary_trees(n):
"""
Returns the number of leaf labelled binary trees with n leaves.

TODO: this would probably be helpful to have in the combinatorics
module.

https://oeis.org/A005373/
"""
return int(np.math.factorial(2 * n - 3) / (2 ** (n - 2) * np.math.factorial(n - 2)))


class TestPolytomySplitting:
"""
Test the ability to randomly split polytomies
"""

# A complex ts with polytomies
#
# 1.00┊ 6 ┊ 6 ┊ 6 ┊ ┊ 6 ┊
# ┊ ┏━┳┻┳━┓ ┊ ┏━┳┻┳━┓ ┊ ┏━━╋━┓ ┊ ┊ ┏━┳┻┳━┓ ┊
# 0.50┊ 5 ┃ ┃ ┃ ┊ 5 ┃ ┃ ┃ ┊ 5 ┃ ┃ ┊ 5 ┊ ┃ ┃ ┃ ┃ ┊
# ┊ ┃ ┃ ┃ ┃ . ┊ ┃ ┃ ┃ ┃ ┊ . ┏┻┓ ┃ ┃ ┊ . ┏━┳┻┳━┓ ┊ . ┃ ┃ ┃ ┃ ┊
# 0.00┊ 0 2 3 4 1 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊
# 0.00 0.20 0.40 0.60 0.80 1.00
nodes_polytomy_44344 = """\
id is_sample population time
0 1 0 0.0
1 1 0 0.0
2 1 0 0.0
3 1 0 0.0
4 1 0 0.0
5 0 0 0.5
6 0 0 1.0
"""
edges_polytomy_44344 = """\
id left right parent child
0 0.0 0.2 5 0
1 0.0 0.8 5 1
2 0.0 0.4 6 2
3 0.4 0.8 5 2
4 0.0 0.6 6 3,4
5 0.0 0.6 6 5
6 0.6 0.8 5 3,4
7 0.8 1.0 6 1,2,3,4
"""

def tree_polytomy_4(self):
return tskit.Tree.generate_star(4)

def ts_polytomy_44344(self):
return tskit.load_text(
nodes=io.StringIO(self.nodes_polytomy_44344),
edges=io.StringIO(self.edges_polytomy_44344),
strict=False,
)

@pytest.mark.slow
@pytest.mark.parametrize("n", [2, 3, 4, 5])
def test_all_topologies(self, n):
N = num_leaf_labelled_binary_trees(n)
ranks = collections.Counter()
tree = tskit.Tree.generate_star(n)
for seed in range(20 * N):
split_tree = tree.split_polytomies(random_seed=seed)
ranks[split_tree.rank()] += 1
# There are N possible binary trees here, we should have seen them
# all with high probability after 20 N attempts.
assert len(ranks) == N

def verify_trees(self, source_tree, split_tree, epsilon=None):
if epsilon is None:
epsilon = 1e-10
N = 0
for u in split_tree.nodes():
assert split_tree.num_children(u) < 3
N += 1
if u >= source_tree.tree_sequence.num_nodes:
# This is a new node
assert epsilon == pytest.approx(split_tree.branch_length(u))
assert N == len(list(split_tree.leaves())) * 2 - 1
for u in source_tree.nodes():
if source_tree.num_children(u) <= 2:
assert source_tree.children(u) == split_tree.children(u)
else:
assert len(split_tree.children(u)) == 2

@pytest.mark.parametrize("n", [2, 3, 4, 5, 6])
def test_resolve_star(self, n):
tree = tskit.Tree.generate_star(n)
self.verify_trees(tree, tree.split_polytomies(random_seed=12))

def test_large_epsilon(self):
tree = tskit.Tree.generate_star(10, branch_length=100)
eps = 10
split = tree.split_polytomies(random_seed=12234, epsilon=eps)
self.verify_trees(tree, split, epsilon=eps)

def verify_tree_sequence_splits(self, ts):
n_poly = 0
for e in ts.edgesets():
if len(e.children) > 2:
n_poly += 1
assert n_poly > 3
assert ts.num_trees > 3
for tree in ts.trees():
binary_tree = tree.split_polytomies(random_seed=11)
assert binary_tree.interval == tree.interval
for u in binary_tree.nodes():
assert binary_tree.num_children(u) < 3
for u in tree.nodes():
assert binary_tree.time(u) == tree.time(u)
resolved_ts = binary_tree.tree_sequence
assert resolved_ts.sequence_length == ts.sequence_length
assert resolved_ts.num_trees <= 3
if tree.interval[0] == 0:
assert resolved_ts.num_trees == 2
null_tree = resolved_ts.last()
assert null_tree.num_roots == ts.num_samples
elif tree.interval[1] == ts.sequence_length:
assert resolved_ts.num_trees == 2
null_tree = resolved_ts.first()
assert null_tree.num_roots == ts.num_samples
else:
null_tree = resolved_ts.first()
assert null_tree.num_roots == ts.num_samples
null_tree.next()
assert null_tree.num_roots == tree.num_roots
null_tree.next()
assert null_tree.num_roots == ts.num_samples

def test_complex_examples(self):
self.verify_tree_sequence_splits(self.ts_polytomy_44344())

def test_nonbinary_simulation(self):
demographic_events = [
msprime.SimpleBottleneck(time=1.0, population=0, proportion=0.95)
]
ts = msprime.simulate(
20,
recombination_rate=10,
mutation_rate=5,
demographic_events=demographic_events,
random_seed=7,
)
self.verify_tree_sequence_splits(ts)

def test_seeds(self):
base = tskit.Tree.generate_star(5)
t1 = base.split_polytomies(random_seed=1234)
t2 = base.split_polytomies(random_seed=1234)
assert t1.tree_sequence.tables.equals(
t2.tree_sequence.tables, ignore_timestamps=True
)
t2 = base.split_polytomies(random_seed=1)
assert not t1.tree_sequence.tables.equals(
t2.tree_sequence.tables, ignore_provenance=True
)

def test_internal_polytomy(self):
# 9
# ┏━┳━━━┻┳━━━━┓
# ┃ ┃ 8 ┃
# ┃ ┃ ┏━━╋━━┓ ┃
# ┃ ┃ ┃ 7 ┃ ┃
# ┃ ┃ ┃ ┏┻┓ ┃ ┃
# 0 1 2 3 5 4 6
t1 = tskit.Tree.unrank(7, (6, 25))
t2 = t1.split_polytomies(random_seed=1234)
assert t2.parent(3) == 7
assert t2.parent(5) == 7
assert t2.root == 9
for u in t2.nodes():
assert t2.num_children(u) in [0, 2]

def test_binary_tree(self):
t1 = msprime.simulate(10, random_seed=1234).first()
t2 = t1.split_polytomies(random_seed=1234)
tables = t1.tree_sequence.dump_tables()
assert tables.equals(t2.tree_sequence.tables, ignore_provenance=True)

def test_bad_method(self):
with pytest.raises(ValueError, match="Method"):
self.tree_polytomy_4().split_polytomies(method="something_else")

def test_epsilon_for_nodes(self):
with pytest.raises(
_tskit.LibraryError,
match="not small enough to create new nodes below a polytomy",
):
self.tree_polytomy_4().split_polytomies(epsilon=1, random_seed=12)

def test_epsilon_for_mutations(self):
tables = tskit.Tree.generate_star(3).tree_sequence.dump_tables()
root_time = tables.nodes.time[-1]
assert root_time == 1
site = tables.sites.add_row(position=0.5, ancestral_state="0")
tables.mutations.add_row(site=site, time=0.9, node=0, derived_state="1")
tables.mutations.add_row(site=site, time=0.9, node=1, derived_state="1")
tree = tables.tree_sequence().first()
with pytest.raises(
_tskit.LibraryError,
match="not small enough to create new nodes below a polytomy",
):
tree.split_polytomies(epsilon=0.5, random_seed=123)

def test_provenance(self):
tree = self.tree_polytomy_4()
ts_split = tree.split_polytomies(random_seed=14).tree_sequence
record = json.loads(ts_split.provenance(ts_split.num_provenances - 1).record)
assert record["parameters"]["command"] == "split_polytomies"
ts_split = tree.split_polytomies(
random_seed=12, record_provenance=False
).tree_sequence
record = json.loads(ts_split.provenance(ts_split.num_provenances - 1).record)
assert record["parameters"]["command"] != "split_polytomies"
Loading