Add/refine tests, draft docstring

lkirk · lkirk · commit 3f4c0ed1e493 · 2026-03-17T17:42:26.000-05:00
Clean up dimension handling around summary functions and normalisation.
There is a slight speed advantage (according to a microbenchmark) and a
huge readability advantage to simply returning [value]. I keep all
computations specifying `keepdims`, but remove list indexing (i.e.
`AB[[0]]`) in favor of returning a list with a single scalar. It turns
out that vectorised numpy functions are actually slower in some cases
because the data we're operating on is so small. Finally, fix the
default normalisation function so that it works both on one-way and
two-way statistics. Users will still need to specify `hap_norm` when
appropriate (and a special case of `hap_norm` for two-way stats).

Per Peter's comment, I investigated dimension dropping and indeed,
general stats don't drop dimensions so I removed the dimension dropping
code. However, we return a matrix of `(m, m, k)` and we want
`(k, m, m)`, so `np.moveaxis` is still needed.

Added tests:
* Multiallelic multi sample-set. This tests operations on two sample
  sets for multiallelic data (which exercises the norm function with
  multiple sample sets). This test highlighted the slight changes needed
  to the default normalisation function.
* Multi outputs. This test mimics a two-way stat called
  on multiple indexes. It shows and tests the ability to compute
  multiple statistics from the same haplotype counts matrix (which is
  especially useful with the explosion of possible summary functions in
  three-way, four-way stats).

In our biallelic test case, I also assert that the normalisation
function is never called and add a note about polarisation.

Finally, I add a draft docstring, but to complete this I think that the
two-locus docs are required. Also, I'd like to add some general
documentation.
diff --git a/python/tests/test_ld_matrix.py b/python/tests/test_ld_matrix.py
@@ -2400,16 +2400,19 @@ def test_multipopulation_r2_varying_unequal_set_sizes(genotypes, sample_sets, ex
 
 class GeneralStatFuncs:
     """
-    functions take X, n as parameters where
+    Summary functions take X, n as parameters where X is a matrix of haplotype
+    counts per sample set and n is a vector of sample set sizes. X has shape (3, k)
+    and n has shape (k, ), where k is the number of sample sets. The rows of X
+    contain haplotype counts for AB, Ab, aB (capitalized == derived).
 
-    X: shape=(3, #ss)
+    X: shape=(3, k)
                 sample sets
-    count AB [[             ]
-    count Ab  [             ]
-    count aB  [             ]]
+    count AB [[ #ss1, #ss2, ... ]
+    count Ab  [ #ss1, #ss2, ... ]
+    count aB  [ #ss1, #ss2, ... ]]
 
-    n: shape=(#ss, )
-              [             ]
+    n: shape=(k, )
+              [ #ss1, #ss2, ... ]
     """
 
     @staticmethod
@@ -2480,37 +2483,39 @@ def pi2(X, n):
     def D2_unbiased(X, n):
         AB, Ab, aB = X
         ab = n - X.sum(0)
-        return (1 / (n * (n - 1) * (n - 2) * (n - 3))) * (
+        return (
             ((aB**2) * (Ab - 1) * Ab)
             + ((ab - 1) * ab * (AB - 1) * AB)
             - (aB * Ab * (Ab + (2 * ab * AB) - 1))
-        )
+        ) / (n * (n - 1) * (n - 2) * (n - 3))
 
     @staticmethod
     def Dz_unbiased(X, n):
         AB, Ab, aB = X
         ab = n - X.sum(0)
-        return (1 / (n * (n - 1) * (n - 2) * (n - 3))) * (
+        return (
             (((AB * ab) - (Ab * aB)) * (aB + ab - AB - Ab) * (Ab + ab - AB - aB))
             - ((AB * ab) * (AB + ab - Ab - aB - 2))
             - ((Ab * aB) * (Ab + aB - AB - ab - 2))
-        )
+        ) / (n * (n - 1) * (n - 2) * (n - 3))
 
     @staticmethod
     def pi2_unbiased(X, n):
         AB, Ab, aB = X
         ab = n - X.sum(0)
-        return (1 / (n * (n - 1) * (n - 2) * (n - 3))) * (
+        return (
             ((AB + Ab) * (aB + ab) * (AB + aB) * (Ab + ab))
             - ((AB * ab) * (AB + ab + (3 * Ab) + (3 * aB) - 1))
             - ((Ab * aB) * (Ab + aB + (3 * AB) + (3 * ab) - 1))
-        )
+        ) / (n * (n - 1) * (n - 2) * (n - 3))
 
+    # Two-way statistics have the _ij suffix.
     @staticmethod
     def r2_ij(X, n):
         pAB, pAb, paB = X / n
         pA = pAb + pAB
         pB = paB + pAB
+        # keepdims preserves the output shape of (1, )
         D2_ij = np.prod(pAB - (pA * pB), keepdims=True)
         denom = np.prod(np.sqrt(pA * pB * (1 - pA) * (1 - pB)), keepdims=True)
         with suppress_overflow_div0_warning():
@@ -2525,17 +2530,37 @@ def D2_ij(X, n):
 
     @staticmethod
     def D2_ij_unbiased(X, n):
-        """NB: We use double brackets here to preserve the output shape of (1,)"""
+        """The identity of the sample sets is up to the user."""
         AB, Ab, aB = X
         ab = n - X.sum(0)
-        return (
-            (Ab[[0]] * aB[[0]] - AB[[0]] * ab[[0]])
-            * (Ab[[1]] * aB[[1]] - AB[[1]] * ab[[1]])
-            / n[[0]]
-            / (n[[0]] - 1)
-            / n[[1]]
-            / (n[[1]] - 1)
+        return [
+            (Ab[0] * aB[0] - AB[0] * ab[0])
+            * (Ab[1] * aB[1] - AB[1] * ab[1])
+            / (n[0] * (n[0] - 1) * n[1] * (n[1] - 1))
+        ]
+
+    @staticmethod
+    def D2_ii_ij_jj_unbiased(X, n):
+        """
+        Multiple stats can be computed from the same data. The identity of the
+        sample sets is up to the user. This function assumes two sample sets.
+        """
+        AB, Ab, aB = X
+        ab = n - X.sum(0)
+
+        # unbiased estimator for equal sample sets
+        ii, jj = (
+            AB * (AB - 1) * ab * (ab - 1)
+            + Ab * (Ab - 1) * aB * (aB - 1)
+            - 2 * AB * Ab * aB * ab
+        ) / (n * (n - 1) * (n - 2) * (n - 3))
+        # unbiased estimator for disjoint sample sets
+        ij = (
+            (Ab[0] * aB[0] - AB[0] * ab[0])
+            * (Ab[1] * aB[1] - AB[1] * ab[1])
+            / (n[0] * (n[0] - 1) * n[1] * (n[1] - 1))
         )
+        return [ii, ij, jj]
 
 
 @pytest.fixture(scope="module")
@@ -2573,7 +2598,17 @@ def ts_multiallelic_fixture():
 def test_general_two_locus_site_stat(stat, ts_100_samp_with_sites_fixture):
     ts = ts_100_samp_with_sites_fixture
     sample_sets = [ts.samples()[0:50], ts.samples()[50:100]]
-    ldg = ts.two_locus_count_stat(sample_sets, getattr(GeneralStatFuncs, stat), 2)
+
+    # In addition to not needing a normalisation function, normalisation is also
+    # not required because these sites are biallelic.
+    def assert_no_norm_func(*_):
+        raise Exception(
+            "Normalisation function should not be called for biallelic sites"
+        )
+
+    ldg = ts.two_locus_count_stat(
+        sample_sets, getattr(GeneralStatFuncs, stat), 2, norm_f=assert_no_norm_func
+    )
     ld = ts.ld_matrix(sample_sets=sample_sets, stat=stat)
     np.testing.assert_array_almost_equal(ldg, ld)
 
@@ -2584,7 +2619,7 @@ def test_general_two_locus_two_way_site_stat(stat, ts_100_samp_with_sites_fixtur
     sample_sets = [ts.samples()[0:50], ts.samples()[50:100]]
     ldg = ts.two_locus_count_stat(sample_sets, getattr(GeneralStatFuncs, stat), 1)
     ld = ts.ld_matrix(
-        sample_sets=sample_sets, stat=stat.replace("_ij", ""), indexes=(0, 1)
+        sample_sets=sample_sets, stat=stat.replace("_ij", ""), indexes=[(0, 1)]
     )
     np.testing.assert_array_almost_equal(ldg, ld)
 
@@ -2599,7 +2634,24 @@ def test_general_one_way_two_locus_stat_multiallelic(stat, ts_multiallelic_fixtu
         [ts.samples()], general_func, 1, norm_f=norm_func, polarised=polarised
     )
     ld = ts.ld_matrix(stat=stat)
-    np.testing.assert_array_almost_equal(ld, ldg)
+    # ld_matrix drops dims, expand for comparison
+    np.testing.assert_array_almost_equal(ldg, np.expand_dims(ld, 0))
+
+
+@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys())
+def test_general_one_way_two_locus_stat_multiallelic_multi_sample_set(
+    stat, ts_multiallelic_fixture
+):
+    ts = ts_multiallelic_fixture
+    general_func = getattr(GeneralStatFuncs, stat)
+    norm_func = (lambda X, n, nA, nB: X[0] / n) if stat == "r2" else None
+    polarised = POLARIZATION[SUMMARY_FUNCS[stat]]
+    sample_sets = [ts.samples(), ts.samples()]
+    ldg = ts.two_locus_count_stat(
+        sample_sets, general_func, 2, norm_f=norm_func, polarised=polarised
+    )
+    ld = ts.ld_matrix(stat=stat, sample_sets=sample_sets)
+    np.testing.assert_array_almost_equal(ldg, ld)
 
 
 @pytest.mark.parametrize("stat", ["r2_ij", "D2_ij", "D2_ij_unbiased"])
@@ -2616,4 +2668,25 @@ def test_general_two_way_two_locus_stat_multiallelic(stat, ts_multiallelic_fixtu
     ld = ts.ld_matrix(
         stat=stat.replace("_ij", ""), indexes=(0, 1), sample_sets=sample_sets
     )
-    np.testing.assert_array_almost_equal(ld, ldg)
+    # ld_matrix drops dims, expand for comparison
+    np.testing.assert_array_almost_equal(ldg, np.expand_dims(ld, 0))
+
+
+def test_general_two_locus_multi_outputs():
+    ts = msprime.sim_mutations(
+        msprime.sim_ancestry(
+            4, recombination_rate=0.1, sequence_length=100, random_seed=123
+        ),
+        rate=0.1,
+        random_seed=123,
+    )
+    assert ts.num_samples == 8, "8 samples are required"
+    assert max({len(s.mutations) for s in ts.sites()}) > 2, (
+        "At least one multiallelic site required"
+    )
+    A = ts.samples()[0:4]
+    B = ts.samples()[4:]
+
+    ldg = ts.two_locus_count_stat([A, B], GeneralStatFuncs.D2_ii_ij_jj_unbiased, 3)
+    ld = ts.ld_matrix([A, B], stat="D2_unbiased", indexes=[(0, 0), (0, 1), (1, 1)])
+    np.testing.assert_array_almost_equal(ldg, ld)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -10942,14 +10942,84 @@ def two_locus_count_stat(
         positions=None,
         mode="site",
     ):
+        """
+        Compute two-locus statistics with a user-defined python function that
+        operates on haplotype counts. TODO: reference modes in two-locus docs.
+        On each pair of sites or trees, the summary function is provided with
+        ``X``, a matrix with shape (3, k) and ``n``, a vector with shape (k,),
+        where k is the number of sample sets provided. ``X`` is a read-only
+        matrix whose rows contain haplotype counts per sample set (counts of AB,
+        Ab, aB) and ``n`` is a vector of sample set sizes.
+
+        .. note::
+            Because we are operating on very small matrices/vectors, vectorised
+            operations are often slower than operations on scalars. Simply
+            returning ``[value]`` can be faster than returning
+            ``value[np.newaxis,]`` or ``np.expand_dims(value, 0)``.
+
+        What follows is an example of computing ``D`` from a tree sequence. Many
+        more examples can be found in the test suite
+        ``test_ld_matrix.py::GeneralStatFuncs``. Let's begin with our summary
+        function, ``D``. We convert counts to proportions, then compute ``D``,
+        returning a numpy array with length equal to the number of sample sets.
+
+        .. code-block:: python
+            def D(X, n):
+                pAB, pAb, paB = X / n
+                pA = pAb + pAB
+                pB = paB + pAB
+                return pAB - (pA * pB)
+
+        ``norm_f`` is a normalisation function used to combine all computed
+        statistics for multiallelic allele pairs (TODO: see two-locus
+        docs). Biallelic sites do not require any normalisation (in fact, the
+        normalisation function is never called for biallelic sites). If
+        either site A or site B is multiallelic, then the normalisation function
+        will be called. The default normalisation function is identical to
+        ``total_norm`` shown in the example below. ``hap_norm`` is required for
+        normalising :math:`r^2`. Both of these examples return a numpy array
+        with length equal to the number of sample sets (for one-way stats).
+
+        .. code-block:: python
+            def total_norm(X, n, nA, nB):
+                return [1 / (nA * nB)] * result_dim
+
+            def hap_norm(X, n, nA, nB):
+                return X[0] / n
+
+        A simple call (without specifying normalisation) would look like this
+
+        .. code-block:: python
+            ts.two_locus_count_stat([ts.samples()], D, 1, polarised=True)
+
+        :param list sample_sets: A list of lists of Node IDs, specifying the
+            groups of nodes to compute the statistic with.
+        :param f: A function that takes two arguments - a two-dimensional array
+            with shape (3, k) and a one-dimensional array with shape (k, ) where
+            k is the number of sample sets.
+        :param int result_dim: The length of ``f`` and ``norm_f``'s return value.
+        :param norm_f: A function that takes four arguments - the first two are
+            the same as ``f``, the last two are scalars representing the
+            number of A and B alleles, respectively.
+        :param bool polarised: Whether to leave the ancestral state out of
+            computations: see :ref:`sec_stats` for more details.
+        :param list sites: TODO: two-locus docs
+        :param list positions: TODO: two-locus docs
+        :param str mode: A string giving the "type" of the statistic to be
+            computed (defaults to "site").
+        :return: An ndarray with shape equal to (TODO: reference two-locus docs,
+            no dimension dropping shape=(result_dim, m, m) where
+            m=num_sites or num_trees).
+        """
         row_sites, col_sites = self.parse_sites(sites)
         row_positions, col_positions = self.parse_positions(positions)
         _, sample_sets, sample_set_sizes = self.__convert_sample_sets(sample_sets)
         result = self._ll_tree_sequence.two_locus_count_stat(
             sample_set_sizes,
             sample_sets,
             f,
-            norm_f or (lambda X, n, nA, nB: 1 / (nA * nB)[np.newaxis,]),
+            # produce the same number of dims as output dimensions
+            norm_f or (lambda X, n, nA, nB: [1 / (nA * nB)] * result_dim),
             result_dim,
             polarised,
             row_sites,
@@ -10958,11 +11028,9 @@ def two_locus_count_stat(
             col_positions,
             mode,
         )
-        if result_dim == 1:  # drop dimension
-            return result.reshape(result.shape[:2])
         # Orient the data so that the first dimension is the sample set so that
         # we get one LD matrix per sample set.
-        return result.swapaxes(0, 2).swapaxes(1, 2)
+        return np.moveaxis(result, -1, 0)
 
     def ld_matrix(
         self,