Update tests according to Peter's feedback

lkirk · lkirk · commit 593161a79387 · 2026-03-16T17:49:12.000-05:00
*Python tests*
Overhaul Python testing of the general stat functions. Remove the
dependence on the example tree sequences, opting instead to simulate a
couple of examples directly. Use these simulated trees in test fixtures,
scoped at the module level. This streamlines the test parameterization a
lot.

Parameterize the tests over the stat names taken from the keys of
`SUMMARY_FUNCS` instead of hard-coded lists.

*CPython tests*
Add a multiallelic tree sequence to test normalisation function
validation and errors. Remove one more occurrence of `np.expand_dims`.

*trees.c*
Remove the unnecessary branch in
tsk_treeseq_two_locus_count_general_stat, improving the code coverage.

*trees.py*
The `norm_f` argument of `two_locus_count_stat` now defaults to None; the
default normalisation function is substituted at call time. This
simplifies calling code and is more in line with the rest of the API.
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -3439,21 +3439,22 @@ tsk_treeseq_two_locus_count_general_stat(const tsk_treeseq_t *self,
         ret = tsk_treeseq_two_site_count_stat(self, state_dim, num_sample_sets,
             sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows,
             row_sites, out_cols, col_sites, options, result);
-    } else if (stat_branch) {
-        ret = check_positions(
-            row_positions, out_rows, tsk_treeseq_get_sequence_length(self));
-        if (ret != 0) {
-            goto out;
-        }
-        ret = check_positions(
-            col_positions, out_cols, tsk_treeseq_get_sequence_length(self));
-        if (ret != 0) {
-            goto out;
-        }
-        ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets,
-            sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows,
-            row_positions, out_cols, col_positions, options, result);
+        goto out;
+    }
+    tsk_bug_assert(stat_branch);
+    ret = check_positions(
+        row_positions, out_rows, tsk_treeseq_get_sequence_length(self));
+    if (ret != 0) {
+        goto out;
+    }
+    ret = check_positions(
+        col_positions, out_cols, tsk_treeseq_get_sequence_length(self));
+    if (ret != 0) {
+        goto out;
     }
+    ret = tsk_treeseq_two_branch_count_stat(self, state_dim, num_sample_sets,
+        sample_set_sizes, sample_sets, result_dim, f, f_params, norm_f, out_rows,
+        row_positions, out_cols, col_positions, options, result);
 out:
     return ret;
 }
diff --git a/python/tests/test_ld_matrix.py b/python/tests/test_ld_matrix.py
@@ -2538,55 +2538,49 @@ def D2_ij_unbiased(X, n):
         )
 
 
-@pytest.mark.parametrize(
-    "ts,stat",
-    [
-        (
-            ts := tsutil.get_sim_example(
-                sample_size=100,
-                sequence_length=32,
-                recombination_rate=0.5,
-                mutation_rate=0.1,
-                seed=123,
-            ),
-            "D",
+@pytest.fixture(scope="module")
+def ts_100_samp_with_sites_fixture():
+    ts = tsutil.get_sim_example(
+        sample_size=100,
+        sequence_length=32,
+        recombination_rate=0.5,
+        mutation_rate=0.1,
+        seed=123,
+    )
+    assert ts.num_sites > 0, "sites are required"
+    assert ts.num_samples == 100, "100 samples are required"
+    return ts
+
+
+@pytest.fixture(scope="module")
+def ts_multiallelic_fixture():
+    ts = msprime.sim_mutations(
+        msprime.sim_ancestry(
+            2, recombination_rate=0.1, sequence_length=100, random_seed=123
         ),
-        (ts, "D2"),
-        (ts, "r2"),
-        (ts, "r"),
-        (ts, "D_prime"),
-        (ts, "Dz"),
-        (ts, "pi2"),
-        (ts, "D2_unbiased"),
-        (ts, "Dz_unbiased"),
-        (ts, "pi2_unbiased"),
-    ],
-)
-def test_general_two_locus_site_stat(ts, stat):
+        rate=0.1,
+        random_seed=123,
+    )
+    # Need at least 4 samples to test unbiased statistics
+    assert ts.num_samples >= 4, "At least 4 samples required"
+    assert max({len(s.mutations) for s in ts.sites()}) > 2, (
+        "At least one multiallelic site required"
+    )
+    return ts
+
+
+@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys())
+def test_general_two_locus_site_stat(stat, ts_100_samp_with_sites_fixture):
+    ts = ts_100_samp_with_sites_fixture
     sample_sets = [ts.samples()[0:50], ts.samples()[50:100]]
     ldg = ts.two_locus_count_stat(sample_sets, getattr(GeneralStatFuncs, stat), 2)
     ld = ts.ld_matrix(sample_sets=sample_sets, stat=stat)
     np.testing.assert_array_almost_equal(ldg, ld)
 
 
-@pytest.mark.parametrize(
-    "ts,stat",
-    [
-        (
-            ts := tsutil.get_sim_example(
-                sample_size=100,
-                sequence_length=32,
-                recombination_rate=0.5,
-                mutation_rate=0.1,
-                seed=123,
-            ),
-            "r2_ij",
-        ),
-        (ts, "D2_ij"),
-        (ts, "D2_ij_unbiased"),
-    ],
-)
-def test_general_two_locus_two_way_site_stat(ts, stat):
+@pytest.mark.parametrize("stat", ["r2_ij", "D2_ij", "D2_ij_unbiased"])
+def test_general_two_locus_two_way_site_stat(stat, ts_100_samp_with_sites_fixture):
+    ts = ts_100_samp_with_sites_fixture
     sample_sets = [ts.samples()[0:50], ts.samples()[50:100]]
     ldg = ts.two_locus_count_stat(sample_sets, getattr(GeneralStatFuncs, stat), 1)
     ld = ts.ld_matrix(
@@ -2595,62 +2589,31 @@ def test_general_two_locus_two_way_site_stat(ts, stat):
     np.testing.assert_array_almost_equal(ldg, ld)
 
 
-@pytest.mark.parametrize(
-    "stat",
-    [
-        "D",
-        "D2",
-        "r2",
-        "r",
-        "D_prime",
-        "Dz",
-        "pi2",
-        "D2_unbiased",
-        "Dz_unbiased",
-        "pi2_unbiased",
-    ],
-)
-def test_general_one_way_two_locus_stat_multiallelic(stat):
-    ts = tsutil.all_fields_ts()
-    func = getattr(GeneralStatFuncs, stat)
-    if stat == "r2":
-        result = ts.two_locus_count_stat(
-            [ts.samples()], func, 1, lambda X, n, nA, nB: X[0] / n
-        )
-    elif stat in {"D", "r", "D_prime"}:
-        result = ts.two_locus_count_stat([ts.samples()], func, 1, polarised=True)
-    else:
-        # default norm func is `lambda X, n, nA, nB: 1 / (nA * nB)[np.newaxis,]`
-        result = ts.two_locus_count_stat([ts.samples()], func, 1)
-    np.testing.assert_array_almost_equal(ts.ld_matrix(stat=stat), result)
-
-
-@pytest.mark.parametrize(
-    "stat",
-    [
-        "r2_ij",
-        "D2_ij",
-        "D2_ij_unbiased",
-    ],
-)
-def test_general_two_way_two_locus_stat_multiallelic(stat):
-    ts = tsutil.all_fields_ts()
-    func = getattr(GeneralStatFuncs, stat)
-    if stat == "r2_ij":
-        result = ts.two_locus_count_stat(
-            [ts.samples(), ts.samples()],
-            func,
-            1,
-            lambda X, n, nA, nB: X[0].sum(keepdims=True) / n.sum(),
-        )
-    else:
-        # default norm func is `lambda X, n, nA, nB: 1 / (nA * nB)[np.newaxis,]`
-        result = ts.two_locus_count_stat([ts.samples(), ts.samples()], func, 1)
-    np.testing.assert_array_almost_equal(
-        ts.ld_matrix(
-            stat=stat.replace("_ij", ""),
-            indexes=(0, 1),
-            sample_sets=[ts.samples(), ts.samples()],
-        ),
-        result,
+@pytest.mark.parametrize("stat", SUMMARY_FUNCS.keys())
+def test_general_one_way_two_locus_stat_multiallelic(stat, ts_multiallelic_fixture):
+    ts = ts_multiallelic_fixture
+    general_func = getattr(GeneralStatFuncs, stat)
+    norm_func = (lambda X, n, nA, nB: X[0] / n) if stat == "r2" else None
+    polarised = POLARIZATION[SUMMARY_FUNCS[stat]]
+    ldg = ts.two_locus_count_stat(
+        [ts.samples()], general_func, 1, norm_f=norm_func, polarised=polarised
+    )
+    ld = ts.ld_matrix(stat=stat)
+    np.testing.assert_array_almost_equal(ld, ldg)
+
+
+@pytest.mark.parametrize("stat", ["r2_ij", "D2_ij", "D2_ij_unbiased"])
+def test_general_two_way_two_locus_stat_multiallelic(stat, ts_multiallelic_fixture):
+    ts = ts_multiallelic_fixture
+    general_func = getattr(GeneralStatFuncs, stat)
+    norm_func = (
+        (lambda X, n, nA, nB: X[0].sum(keepdims=True) / n.sum())
+        if stat == "r2_ij"
+        else None
+    )
+    sample_sets = [ts.samples(), ts.samples()]
+    ldg = ts.two_locus_count_stat(sample_sets, general_func, 1, norm_f=norm_func)
+    ld = ts.ld_matrix(
+        stat=stat.replace("_ij", ""), indexes=(0, 1), sample_sets=sample_sets
     )
+    np.testing.assert_array_almost_equal(ld, ldg)
diff --git a/python/tests/test_python_c.py b/python/tests/test_python_c.py
@@ -138,6 +138,23 @@ def get_example_migration_tree_sequence(self):
         )
         return ts.ll_tree_sequence
 
+    def get_example_tree_sequence_multiallelic(self, sample_size=10):
+        ts = msprime.sim_mutations(
+            msprime.sim_ancestry(
+                sample_size,
+                recombination_rate=0.1,
+                sequence_length=100,
+                ploidy=1,
+                random_seed=123,
+            ),
+            rate=0.1,
+            random_seed=123,
+        )
+        assert max({len(s.mutations) for s in ts.sites()}) > 2, (
+            "At least one multiallelic site required"
+        )
+        return ts.ll_tree_sequence
+
     def verify_iterator(self, iterator):
         """
         Checks that the specified non-empty iterator implements the
@@ -1989,6 +2006,12 @@ def test_ld_matrix_multipop(self, stat_method_name):
 
     def test_two_locus_count_stat(self):
         ts = self.get_example_tree_sequence(10)
+        # Multiallelic test case to test norm function
+        ts_multi = self.get_example_tree_sequence_multiallelic()
+        assert (ts.get_samples() == ts_multi.get_samples()).all(), (
+            "biallelic and multiallelic test case are expected "
+            "to have the same sample nodes"
+        )
         ss = ts.get_samples()  # sample sets
         ss_sizes = np.array([len(ss)], dtype=np.uint32)
         row_sites = np.arange(ts.get_num_sites(), dtype=np.int32)
@@ -2007,10 +2030,9 @@ def stat_func(X, n):
             return pAB - (pA * pB)
 
         def norm_func(X, n, nA, nB):
-            return np.expand_dims(X[0].sum() / n.sum(), axis=0)
-
-        method = ts.two_locus_count_stat
+            return X[0].sum(keepdims=True) / n.sum()
 
+        method = ts.two_locus_count_stat  # most tests on biallelic
         site_args = row_sites, col_sites, None, None, "site"
         branch_args = None, None, row_pos, col_pos, "branch"
         a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_args)
@@ -2019,10 +2041,20 @@ def norm_func(X, n, nA, nB):
         assert a.shape == (2, 2, 1)
         site_list_args = row_sites_list, col_sites_list, None, None, "site"
         branch_list_args = None, None, row_pos_list, col_pos_list, "branch"
+
+        # happy path
         a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *site_list_args)
-        assert a.shape == (10, 10, 1)
+        assert a.shape == (10, 10, 1)  # ts has 10 sites
         a = method(ss_sizes, ss, stat_func, norm_func, 1, True, *branch_list_args)
-        assert a.shape == (2, 2, 1)
+        assert a.shape == (2, 2, 1)  # ts has 2 trees
+        a = ts_multi.two_locus_count_stat(
+            ss_sizes, ss, stat_func, norm_func, 1, True, None, None, None, None, "site"
+        )
+        assert a.shape == (56, 56, 1)  # ts has 56 sites
+        a = ts_multi.two_locus_count_stat(
+            ss_sizes, ss, stat_func, norm_func, 1, True, None, None, None, None, "branch"
+        )
+        assert a.shape == (48, 48, 1)  # ts has 48 trees
         # CPython API errors
         with pytest.raises(ValueError, match="Sum of sample_set_sizes"):
             bad_ss = np.array([], dtype=np.int32)
@@ -2094,10 +2126,55 @@ def norm_func(X, n, nA, nB):
         with pytest.raises(TypeError, match="norm_func must be callable"):
             method(ss_sizes, ss, stat_func, "uncallable", 1, True, *site_args)
         with pytest.raises(ValueError, match="summary function.*must be 1D"):
-            method(ss_sizes, ss, lambda a, b: 1, norm_func, 1, True, *site_args)
-        with pytest.raises(ValueError, match="length 2; must be 1"):
-            method(ss_sizes, ss, lambda a, b: [1, 2], norm_func, 1, True, *site_args)
+            method(ss_sizes, ss, lambda *_: 1, norm_func, 1, True, *site_args)
+        with pytest.raises(ValueError, match="summary function.*length 2; must be 1"):
+            method(ss_sizes, ss, lambda *_: [1, 2], norm_func, 1, True, *site_args)
+        with pytest.raises(ValueError, match="could not convert string to float"):
+            method(ss_sizes, ss, lambda *_: ["nonfloat"], norm_func, 1, True, *site_args)
+        with pytest.raises(ValueError, match="norm function.*must be 1D"):
+            ts_multi.two_locus_count_stat(
+                ss_sizes, ss, stat_func, lambda *_: 1, 1, True, *site_args
+            )
+        with pytest.raises(
+            TypeError, match="takes 1 positional argument but 2 were given"
+        ):
+            ts_multi.two_locus_count_stat(
+                ss_sizes, ss, lambda _: 1, norm_func, 1, True, *site_args
+            )
+        with pytest.raises(ValueError, match="norm function.*length 2; must be 1"):
+            ts_multi.two_locus_count_stat(
+                ss_sizes, ss, stat_func, lambda *_: [1, 2], 1, True, *site_args
+            )
+        with pytest.raises(
+            TypeError, match="takes 1 positional argument but 4 were given"
+        ):
+            ts_multi.two_locus_count_stat(
+                ss_sizes, ss, stat_func, lambda _: [1, 2], 1, True, *site_args
+            )
+        with pytest.raises(ValueError, match="could not convert string to float"):
+            ts_multi.two_locus_count_stat(
+                ss_sizes, ss, stat_func, lambda *_: ["nonfloat"], 1, True, *site_args
+            )
+        # Exceptions within stat_func and norm_func are correctly raised.
+        for exception in [ValueError, TypeError]:
+
+            def stat_func_except(*_):
+                raise exception("test")
+
+            def norm_func_except(*_):
+                raise exception("test")
+
+            with pytest.raises(exception, match="test"):
+                method(
+                    ss_sizes, ss, stat_func_except, norm_func, 1, True, *site_list_args
+                )
+            with pytest.raises(exception, match="test"):
+                ts_multi.two_locus_count_stat(
+                    ss_sizes, ss, stat_func, norm_func_except, 1, True, *site_list_args
+                )
         # C API errors
+        with pytest.raises(tskit.LibraryError, match="TSK_ERR_BAD_RESULT_DIMS"):
+            method(ss_sizes, ss, stat_func, norm_func, 0, True, *site_list_args)
         with pytest.raises(tskit.LibraryError, match="TSK_ERR_STAT_UNSORTED_SITES"):
             bad_sites = np.array([1, 0, 2], dtype=np.int32)
             bad_site_args = bad_sites, col_sites, None, None, "site"
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -10936,7 +10936,7 @@ def two_locus_count_stat(
         sample_sets,
         f,
         result_dim,
-        norm_f=lambda X, n, nA, nB: 1 / (nA * nB)[np.newaxis,],
+        norm_f=None,
         polarised=False,
         sites=None,
         positions=None,
@@ -10949,7 +10949,7 @@ def two_locus_count_stat(
             sample_set_sizes,
             sample_sets,
             f,
-            norm_f,
+            norm_f or (lambda X, n, nA, nB: 1 / (nA * nB)[np.newaxis,]),
             result_dim,
             polarised,
             row_sites,