tskit-dev · mergify · Nov 21, 2020 · Aug 28, 2020 · Nov 18, 2020
diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst
@@ -77,6 +77,9 @@
 
 **Features**
 
+- Add ``split_polytomies`` method to the Tree class
+  (:user:`hyanwong`, :user:`jeromekelleher`, :issue:`809`, :pr:`815`)
+
 - Tree accessor functions (e.g. ``ts.first()``, ``ts.at()`` pass extra parameters such as
   ``sample_indexes`` to the underlying ``Tree`` constructor; also ``root_threshold`` can
   be specified when calling ``ts.trees()`` (:user:`hyanwong`, :issue:`847`, :pr:`848`)

diff --git a/python/tests/test_combinatorics.py b/python/tests/test_combinatorics.py
@@ -26,6 +26,7 @@
 import collections
 import io
 import itertools
+import random
 
 import msprime
 import numpy as np
@@ -976,3 +977,32 @@ def test_forward_time_wright_fisher_simplified(self):
         ts = tables.tree_sequence()
         samples = ts.samples()
         self.verify_topologies(ts, sample_sets=[samples[:10], samples[10:]])
+
+
+class TestTreeNode:
+    """
+    Tests for the TreeNode class used to build simple trees in memory.
+    """
+
+    @pytest.mark.parametrize("n", [2, 3, 5, 10])
+    def test_random_binary_tree(self, n):
+        # Note this doesn't check any statistical properties of the returned
+        # trees, just that a single instance returned in a valid binary tree.
+        rng = random.Random(32)
+        labels = range(n)
+        root = comb.TreeNode.random_binary_tree(labels, rng)
+
+        stack = [root]
+        num_nodes = 0
+        recovered_labels = []
+        while len(stack) > 0:
+            node = stack.pop()
+            num_nodes += 1
+            if node.label is not None:
+                assert len(node.children) == 0
+                recovered_labels.append(node.label)
+            for child in node.children:
+                assert child.parent == node
+                stack.append(child)
+        assert len(recovered_labels) == n
+        assert sorted(recovered_labels) == list(labels)
diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py
@@ -23,6 +23,7 @@
 """
 Test cases for the supported topological variations and operations.
 """
+import collections
 import functools
 import io
 import itertools
@@ -7923,3 +7924,221 @@ def test_star_branch_length(self):
 
         assert ts.node(ts.first().root).time == branch_length
         assert ts.kc_distance(topological_equiv_ts) == 0
+
+
+def num_leaf_labelled_binary_trees(n):
+    """
+    Returns the number of leaf labelled binary trees with n leaves.
+
+    TODO: this would probably be helpful to have in the combinatorics
+    module.
+
+    https://oeis.org/A005373/
+    """
+    return int(np.math.factorial(2 * n - 3) / (2 ** (n - 2) * np.math.factorial(n - 2)))
+
+
+class TestPolytomySplitting:
+    """
+    Test the ability to randomly split polytomies
+    """
+
+    # A complex ts with polytomies
+    #
+    # 1.00┊    6      ┊      6    ┊       6   ┊           ┊      6    ┊
+    #     ┊ ┏━┳┻┳━┓   ┊   ┏━┳┻┳━┓ ┊    ┏━━╋━┓ ┊           ┊   ┏━┳┻┳━┓ ┊
+    # 0.50┊ 5 ┃ ┃ ┃   ┊   5 ┃ ┃ ┃ ┊    5  ┃ ┃ ┊      5    ┊   ┃ ┃ ┃ ┃ ┊
+    #     ┊ ┃ ┃ ┃ ┃ . ┊   ┃ ┃ ┃ ┃ ┊ . ┏┻┓ ┃ ┃ ┊ . ┏━┳┻┳━┓ ┊ . ┃ ┃ ┃ ┃ ┊
+    # 0.00┊ 0 2 3 4 1 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊ 0 1 2 3 4 ┊
+    #   0.00        0.20        0.40        0.60        0.80        1.00
+    nodes_polytomy_44344 = """\
+    id      is_sample   population      time
+    0       1           0               0.0
+    1       1           0               0.0
+    2       1           0               0.0
+    3       1           0               0.0
+    4       1           0               0.0
+    5       0           0               0.5
+    6       0           0               1.0
+    """
+    edges_polytomy_44344 = """\
+    id      left     right    parent  child
+    0       0.0      0.2      5       0
+    1       0.0      0.8      5       1
+    2       0.0      0.4      6       2
+    3       0.4      0.8      5       2
+    4       0.0      0.6      6       3,4
+    5       0.0      0.6      6       5
+    6       0.6      0.8      5       3,4
+    7       0.8      1.0      6       1,2,3,4
+    """
+
+    def tree_polytomy_4(self):
+        return tskit.Tree.generate_star(4)
+
+    def ts_polytomy_44344(self):
+        return tskit.load_text(
+            nodes=io.StringIO(self.nodes_polytomy_44344),
+            edges=io.StringIO(self.edges_polytomy_44344),
+            strict=False,
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("n", [2, 3, 4, 5])
+    def test_all_topologies(self, n):
+        N = num_leaf_labelled_binary_trees(n)
+        ranks = collections.Counter()
+        tree = tskit.Tree.generate_star(n)
+        for seed in range(20 * N):
+            split_tree = tree.split_polytomies(random_seed=seed)
+            ranks[split_tree.rank()] += 1
+        # There are N possible binary trees here, we should have seen them
+        # all with high probability after 20 N attempts.
+        assert len(ranks) == N
+
+    def verify_trees(self, source_tree, split_tree, epsilon=None):
+        if epsilon is None:
+            epsilon = 1e-10
+        N = 0
+        for u in split_tree.nodes():
+            assert split_tree.num_children(u) < 3
+            N += 1
+            if u >= source_tree.tree_sequence.num_nodes:
+                # This is a new node
+                assert epsilon == pytest.approx(split_tree.branch_length(u))
+        assert N == len(list(split_tree.leaves())) * 2 - 1
+        for u in source_tree.nodes():
+            if source_tree.num_children(u) <= 2:
+                assert source_tree.children(u) == split_tree.children(u)
+            else:
+                assert len(split_tree.children(u)) == 2
+
+    @pytest.mark.parametrize("n", [2, 3, 4, 5, 6])
+    def test_resolve_star(self, n):
+        tree = tskit.Tree.generate_star(n)
+        self.verify_trees(tree, tree.split_polytomies(random_seed=12))
+
+    def test_large_epsilon(self):
+        tree = tskit.Tree.generate_star(10, branch_length=100)
+        eps = 10
+        split = tree.split_polytomies(random_seed=12234, epsilon=eps)
+        self.verify_trees(tree, split, epsilon=eps)
+
+    def verify_tree_sequence_splits(self, ts):
+        n_poly = 0
+        for e in ts.edgesets():
+            if len(e.children) > 2:
+                n_poly += 1
+        assert n_poly > 3
+        assert ts.num_trees > 3
+        for tree in ts.trees():
+            binary_tree = tree.split_polytomies(random_seed=11)
+            assert binary_tree.interval == tree.interval
+            for u in binary_tree.nodes():
+                assert binary_tree.num_children(u) < 3
+            for u in tree.nodes():
+                assert binary_tree.time(u) == tree.time(u)
+            resolved_ts = binary_tree.tree_sequence
+            assert resolved_ts.sequence_length == ts.sequence_length
+            assert resolved_ts.num_trees <= 3
+            if tree.interval[0] == 0:
+                assert resolved_ts.num_trees == 2
+                null_tree = resolved_ts.last()
+                assert null_tree.num_roots == ts.num_samples
+            elif tree.interval[1] == ts.sequence_length:
+                assert resolved_ts.num_trees == 2
+                null_tree = resolved_ts.first()
+                assert null_tree.num_roots == ts.num_samples
+            else:
+                null_tree = resolved_ts.first()
+                assert null_tree.num_roots == ts.num_samples
+                null_tree.next()
+                assert null_tree.num_roots == tree.num_roots
+                null_tree.next()
+                assert null_tree.num_roots == ts.num_samples
+
+    def test_complex_examples(self):
+        self.verify_tree_sequence_splits(self.ts_polytomy_44344())
+
+    def test_nonbinary_simulation(self):
+        demographic_events = [
+            msprime.SimpleBottleneck(time=1.0, population=0, proportion=0.95)
+        ]
+        ts = msprime.simulate(
+            20,
+            recombination_rate=10,
+            mutation_rate=5,
+            demographic_events=demographic_events,
+            random_seed=7,
+        )
+        self.verify_tree_sequence_splits(ts)
+
+    def test_seeds(self):
+        base = tskit.Tree.generate_star(5)
+        t1 = base.split_polytomies(random_seed=1234)
+        t2 = base.split_polytomies(random_seed=1234)
+        assert t1.tree_sequence.tables.equals(
+            t2.tree_sequence.tables, ignore_timestamps=True
+        )
+        t2 = base.split_polytomies(random_seed=1)
+        assert not t1.tree_sequence.tables.equals(
+            t2.tree_sequence.tables, ignore_provenance=True
+        )
+
+    def test_internal_polytomy(self):
+        #       9
+        # ┏━┳━━━┻┳━━━━┓
+        # ┃ ┃    8    ┃
+        # ┃ ┃ ┏━━╋━━┓ ┃
+        # ┃ ┃ ┃  7  ┃ ┃
+        # ┃ ┃ ┃ ┏┻┓ ┃ ┃
+        # 0 1 2 3 5 4 6
+        t1 = tskit.Tree.unrank(7, (6, 25))
+        t2 = t1.split_polytomies(random_seed=1234)
+        assert t2.parent(3) == 7
+        assert t2.parent(5) == 7
+        assert t2.root == 9
+        for u in t2.nodes():
+            assert t2.num_children(u) in [0, 2]
+
+    def test_binary_tree(self):
+        t1 = msprime.simulate(10, random_seed=1234).first()
+        t2 = t1.split_polytomies(random_seed=1234)
+        tables = t1.tree_sequence.dump_tables()
+        assert tables.equals(t2.tree_sequence.tables, ignore_provenance=True)
+
+    def test_bad_method(self):
+        with pytest.raises(ValueError, match="Method"):
+            self.tree_polytomy_4().split_polytomies(method="something_else")
+
+    def test_epsilon_for_nodes(self):
+        with pytest.raises(
+            _tskit.LibraryError,
+            match="not small enough to create new nodes below a polytomy",
+        ):
+            self.tree_polytomy_4().split_polytomies(epsilon=1, random_seed=12)
+
+    def test_epsilon_for_mutations(self):
+        tables = tskit.Tree.generate_star(3).tree_sequence.dump_tables()
+        root_time = tables.nodes.time[-1]
+        assert root_time == 1
+        site = tables.sites.add_row(position=0.5, ancestral_state="0")
+        tables.mutations.add_row(site=site, time=0.9, node=0, derived_state="1")
+        tables.mutations.add_row(site=site, time=0.9, node=1, derived_state="1")
+        tree = tables.tree_sequence().first()
+        with pytest.raises(
+            _tskit.LibraryError,
+            match="not small enough to create new nodes below a polytomy",
+        ):
+            tree.split_polytomies(epsilon=0.5, random_seed=123)
+
+    def test_provenance(self):
+        tree = self.tree_polytomy_4()
+        ts_split = tree.split_polytomies(random_seed=14).tree_sequence
+        record = json.loads(ts_split.provenance(ts_split.num_provenances - 1).record)
+        assert record["parameters"]["command"] == "split_polytomies"
+        ts_split = tree.split_polytomies(
+            random_seed=12, record_provenance=False
+        ).tree_sequence
+        record = json.loads(ts_split.provenance(ts_split.num_provenances - 1).record)
+        assert record["parameters"]["command"] != "split_polytomies"