From 251ad37752da07c3c23d808dcd068934abe05dc5 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 7 May 2025 10:18:32 +0100 Subject: [PATCH 1/3] Fix CI linting not finding clang --- .github/workflows/tests.yml | 7 +++--- c/tests/test_core.c | 2 +- c/tests/test_stats.c | 6 ++--- c/tests/testlib.c | 46 ++++++++++++++++++------------------- c/tskit/core.c | 3 ++- python/_tskitmodule.c | 45 ++++++++++++++++++++++-------------- 6 files changed, 60 insertions(+), 49 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fb443fbc1a..99169f1b7c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -22,9 +22,10 @@ jobs: - name: install clang-format if: steps.clang_format.outputs.cache-hit != 'true' run: | - python -m venv env - source env/bin/activate - pip install clang-format==6.0.1 + pip install setuptools clang-format==6.0.1 + clang-format -version + sudo cp /opt/hostedtoolcache/Python/3.12.10/x64/bin/clang-format /opt/hostedtoolcache/Python/3.12.10/x64/bin/clang-format-6.0 + clang-format-6.0 -version - uses: pre-commit/action@v3.0.1 benchmark: diff --git a/c/tests/test_core.c b/c/tests/test_core.c index eed22fc1ca..6a14ecc655 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -532,7 +532,7 @@ test_bit_arrays(void) // NB: This test is only valid for the 32 bit implementation of bit arrays. If we // were to change the chunk size of a bit array, we'd need to update these tests tsk_bit_array_t arr; - tsk_id_t items_truth[64] = {0}, items[64] = {0}; + tsk_id_t items_truth[64] = { 0 }, items[64] = { 0 }; tsk_size_t n_items = 0, n_items_truth = 0; // test item retrieval diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index f55cdcee50..22faf2256c 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -348,7 +348,7 @@ verify_pair_coalescence_counts(tsk_treeseq_t *ts, tsk_flags_t options) /* test various bin assignments */ for (i = 0; i < N; i++) { - node_bin_map[i] = ((tsk_id_t) (i % B)); + node_bin_map[i] = ((tsk_id_t)(i % B)); } ret = tsk_treeseq_pair_coalescence_counts(ts, P, sample_set_sizes, sample_sets, I, index_tuples, T, breakpoints, B, node_bin_map, options, C_B); @@ -3694,8 +3694,8 @@ static void test_pair_coalescence_counts_missing(void) { tsk_treeseq_t ts; - tsk_treeseq_from_text(&ts, 5, missing_ex_nodes, missing_ex_edges, NULL, - NULL, NULL, NULL, NULL, 0); + tsk_treeseq_from_text( + &ts, 5, missing_ex_nodes, missing_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); verify_pair_coalescence_counts(&ts, 0); verify_pair_coalescence_counts(&ts, TSK_STAT_SPAN_NORMALISE); tsk_treeseq_free(&ts); diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 1f05178627..637a59e681 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -373,31 +373,29 @@ const char *empty_ex_nodes = "1 0.0 0 -1\n" const char *empty_ex_edges = ""; /*** An example of a tree sequence with missing marginal trees. ***/ -/* - | 4 | | 4 | - | / \ | | / \ | - | 3 \ | | / 3 | - | / \ \ | | / / \ | - | 0 1 2 | | 0 1 2 | - |-|-----------|-|-----------|-| - 0 1 2 3 4 5 +/* + | 4 | | 4 | + | / \ | | / \ | + | 3 \ | | / 3 | + | / \ \ | | / / \ | + | 0 1 2 | | 0 1 2 | + |-|-----------|-|-----------|-| + 0 1 2 3 4 5 */ -const char *missing_ex_nodes = - "1 0.0 0 -1\n" - "1 0.0 0 -1\n" - "1 0.0 0 -1\n" - "0 1.0 0 -1\n" - "0 2.0 0 -1\n"; - -const char *missing_ex_edges = - "1.0 2.0 3 0\n" - "1.0 2.0 3 1\n" - "3.0 4.0 3 1\n" - "3.0 4.0 3 2\n" - "3.0 4.0 4 0\n" - "1.0 2.0 4 2\n" - "1.0 2.0 4 3\n" - "3.0 4.0 4 3\n"; +const char *missing_ex_nodes = "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "0 1.0 0 -1\n" + "0 2.0 0 -1\n"; + +const char *missing_ex_edges = "1.0 2.0 3 0\n" + "1.0 2.0 3 1\n" + "3.0 4.0 3 1\n" + "3.0 4.0 3 2\n" + "3.0 4.0 4 0\n" + "1.0 2.0 4 2\n" + "1.0 2.0 4 3\n" + "3.0 4.0 4 3\n"; /* Simple utilities to parse text so we can write declaritive * tests. This is not intended as a robust general input mechanism. diff --git a/c/tskit/core.c b/c/tskit/core.c index 2b0078bf08..175505439e 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -168,7 +168,8 @@ tsk_strerror_internal(int err) break; case TSK_ERR_FILE_VERSION_TOO_OLD: ret = "tskit file version too old. Please upgrade using the " - "'tskit upgrade' command from tskit version<0.6.2. (TSK_ERR_FILE_VERSION_TOO_OLD)"; + "'tskit upgrade' command from tskit version<0.6.2. " + "(TSK_ERR_FILE_VERSION_TOO_OLD)"; break; case TSK_ERR_FILE_VERSION_TOO_NEW: ret = "tskit file version is too new for this instance. " diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index e79a876467..a9ee1ba188 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -223,7 +223,8 @@ handle_library_error(int err) const char *not_kas_format_msg = "File not in kastore format. Either the file is corrupt or it is not a " "tskit tree sequence file. It may be a legacy HDF file upgradable with " - "`tskit upgrade` from tskit version<0.6.2 or a compressed tree sequence file that can be decompressed " + "`tskit upgrade` from tskit version<0.6.2 or a compressed tree sequence file " + "that can be decompressed " "with `tszip`."; const char *ibd_pairs_not_stored_msg = "Sample pairs are not stored by default " @@ -9715,12 +9716,12 @@ TreeSequence_weighted_stat_vector_method( if (result_array == NULL) { goto out; } - Py_BEGIN_ALLOW_THREADS - err = method(self->tree_sequence, w_shape[1], PyArray_DATA(weights_array), - num_windows, PyArray_DATA(windows_array), num_focal_nodes, - PyArray_DATA(focal_nodes_array), PyArray_DATA(result_array), options); - Py_END_ALLOW_THREADS - if (err != 0) { + Py_BEGIN_ALLOW_THREADS err + = method(self->tree_sequence, w_shape[1], PyArray_DATA(weights_array), + num_windows, PyArray_DATA(windows_array), num_focal_nodes, + PyArray_DATA(focal_nodes_array), PyArray_DATA(result_array), options); + Py_END_ALLOW_THREADS if (err != 0) + { handle_library_error(err); goto out; } @@ -10744,7 +10745,8 @@ TreeSequence_get_individuals_metadata(TreeSequence *self, void *closure) goto out; } individuals = self->tree_sequence->tables->individuals; - ret = TreeSequence_make_array(self, individuals.metadata_length, NPY_UINT8, individuals.metadata); + ret = TreeSequence_make_array( + self, individuals.metadata_length, NPY_UINT8, individuals.metadata); out: return ret; } @@ -10835,7 +10837,8 @@ TreeSequence_get_nodes_metadata(TreeSequence *self, void *closure) goto out; } nodes = self->tree_sequence->tables->nodes; - ret = TreeSequence_make_array(self, nodes.metadata_length, NPY_UINT8, nodes.metadata); + ret = TreeSequence_make_array( + self, nodes.metadata_length, NPY_UINT8, nodes.metadata); out: return ret; } @@ -10926,7 +10929,8 @@ TreeSequence_get_edges_metadata(TreeSequence *self, void *closure) goto out; } edges = self->tree_sequence->tables->edges; - ret = TreeSequence_make_array(self, edges.metadata_length, NPY_UINT8, edges.metadata); + ret = TreeSequence_make_array( + self, edges.metadata_length, NPY_UINT8, edges.metadata); out: return ret; } @@ -10941,7 +10945,8 @@ TreeSequence_get_edges_metadata_offset(TreeSequence *self, void *closure) goto out; } edges = self->tree_sequence->tables->edges; - ret = TreeSequence_make_array(self, edges.num_rows + 1, NPY_UINT64, edges.metadata_offset); + ret = TreeSequence_make_array( + self, edges.num_rows + 1, NPY_UINT64, edges.metadata_offset); out: return ret; } @@ -10971,7 +10976,8 @@ TreeSequence_get_sites_metadata(TreeSequence *self, void *closure) goto out; } sites = self->tree_sequence->tables->sites; - ret = TreeSequence_make_array(self, sites.metadata_length, NPY_UINT8, sites.metadata); + ret = TreeSequence_make_array( + self, sites.metadata_length, NPY_UINT8, sites.metadata); out: return ret; } @@ -10986,7 +10992,8 @@ TreeSequence_get_sites_metadata_offset(TreeSequence *self, void *closure) goto out; } sites = self->tree_sequence->tables->sites; - ret = TreeSequence_make_array(self, sites.num_rows + 1, NPY_UINT64, sites.metadata_offset); + ret = TreeSequence_make_array( + self, sites.num_rows + 1, NPY_UINT64, sites.metadata_offset); out: return ret; } @@ -11061,7 +11068,8 @@ TreeSequence_get_mutations_metadata(TreeSequence *self, void *closure) goto out; } mutations = self->tree_sequence->tables->mutations; - ret = TreeSequence_make_array(self, mutations.metadata_length, NPY_UINT8, mutations.metadata); + ret = TreeSequence_make_array( + self, mutations.metadata_length, NPY_UINT8, mutations.metadata); out: return ret; } @@ -11076,7 +11084,8 @@ TreeSequence_get_mutations_metadata_offset(TreeSequence *self, void *closure) goto out; } mutations = self->tree_sequence->tables->mutations; - ret = TreeSequence_make_array(self, mutations.num_rows + 1, NPY_UINT64, mutations.metadata_offset); + ret = TreeSequence_make_array( + self, mutations.num_rows + 1, NPY_UINT64, mutations.metadata_offset); out: return ret; } @@ -11185,7 +11194,8 @@ TreeSequence_get_migrations_metadata(TreeSequence *self, void *closure) goto out; } migrations = self->tree_sequence->tables->migrations; - ret = TreeSequence_make_array(self, migrations.metadata_length, NPY_UINT8, migrations.metadata); + ret = TreeSequence_make_array( + self, migrations.metadata_length, NPY_UINT8, migrations.metadata); out: return ret; } @@ -11216,7 +11226,8 @@ TreeSequence_get_populations_metadata(TreeSequence *self, void *closure) goto out; } populations = self->tree_sequence->tables->populations; - ret = TreeSequence_make_array(self, populations.metadata_length, NPY_UINT8, populations.metadata); + ret = TreeSequence_make_array( + self, populations.metadata_length, NPY_UINT8, populations.metadata); out: return ret; } From 9bf6a7f3bc3278df42b9631763159f175498dd63 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Sat, 3 May 2025 01:49:08 +0100 Subject: [PATCH 2/3] Add individuals nodes method --- python/CHANGELOG.rst | 10 ++++ python/_tskitmodule.c | 59 +++++++++++++++++++++ python/tests/test_highlevel.py | 93 ++++++++++++++++++++++++++++++++++ python/tests/test_lowlevel.py | 25 +++++++++ python/tskit/trees.py | 15 ++++++ 5 files changed, 202 insertions(+) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 1219d49689..7f1b29ce8d 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,3 +1,13 @@ +-------------------- +[0.6.4] - 2025-XX-XX +-------------------- + +**Features** + +- Add ``TreeSequence.individuals_nodes`` attribute to return the nodes + associated with each individual as a numpy array. + (:user:`benjeffery`, :pr:`3153`) + -------------------- [0.6.3] - 2025-04-28 -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index a9ee1ba188..66d53a6489 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -8725,6 +8725,61 @@ TreeSequence_get_individuals_time(TreeSequence *self) return ret; } +static PyObject * +TreeSequence_get_individuals_nodes(TreeSequence *self) +{ + PyObject *ret = NULL; + PyArrayObject *ret_array = NULL; + npy_intp dims[2]; + tsk_size_t ploidy; + tsk_size_t max_ploidy = 0; + tsk_id_t *node_mem; + tsk_size_t j; + tsk_size_t num_individuals; + tsk_id_t *const *individual_nodes; + const tsk_size_t *individual_nodes_length; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + + num_individuals = tsk_treeseq_get_num_individuals(self->tree_sequence); + individual_nodes = self->tree_sequence->individual_nodes; + individual_nodes_length = self->tree_sequence->individual_nodes_length; + + for (tsk_id_t i = 0; i < (tsk_id_t) num_individuals; i++) { + ploidy = individual_nodes_length[i]; + if (ploidy > max_ploidy) { + max_ploidy = ploidy; + } + } + + dims[0] = (npy_intp) num_individuals; + dims[1] = (npy_intp) max_ploidy; + ret_array = (PyArrayObject *) PyArray_SimpleNew(2, dims, NPY_INT32); + if (ret_array == NULL) { + goto out; + } + + /* Fill with -1 (TSK_NULL) */ + node_mem = (tsk_id_t *) PyArray_DATA(ret_array); + memset(node_mem, 0xFF, PyArray_NBYTES(ret_array)); + + for (tsk_id_t i = 0; i < (tsk_id_t) num_individuals; i++) { + ploidy = individual_nodes_length[i]; + for (j = 0; j < ploidy; j++) { + node_mem[i * max_ploidy + j] = individual_nodes[i][j]; + } + } + + ret = (PyObject *) ret_array; + ret_array = NULL; + +out: + Py_XDECREF(ret_array); + return ret; +} + static PyObject * TreeSequence_genealogical_nearest_neighbours( TreeSequence *self, PyObject *args, PyObject *kwds) @@ -11425,6 +11480,10 @@ static PyMethodDef TreeSequence_methods[] = { .ml_meth = (PyCFunction) TreeSequence_get_individuals_time, .ml_flags = METH_NOARGS, .ml_doc = "Returns the vector of per-individual times." }, + { .ml_name = "get_individuals_nodes", + .ml_meth = (PyCFunction) TreeSequence_get_individuals_nodes, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns an array of the node ids for each individual" }, { .ml_name = "genealogical_nearest_neighbours", .ml_meth = (PyCFunction) TreeSequence_genealogical_nearest_neighbours, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 00b8bca3e0..f703408013 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -5491,3 +5491,96 @@ def test_error_if_no_schema(self, table_name): ts = msprime.simulate(10) with pytest.raises(NotImplementedError): getattr(ts, f"{table_name}_metadata") + + +class TestIndividualsNodes: + def test_basic_individuals_nodes(self, tmp_path): + # Create a basic tree sequence with two individuals + tables = tskit.TableCollection(sequence_length=100) + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + ts = tables.tree_sequence() + + result = ts.individuals_nodes + assert result.shape == (2, 2) + assert_array_equal(result, [[0, 1], [2, 3]]) + + def test_variable_ploidy(self, tmp_path): + tables = tskit.TableCollection(sequence_length=100) + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") # Diploid + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") # Haploid + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") # Triploid + + # Diploid individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + + # Haploid individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=1) + + # Triploid individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=2) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=2) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=2) + + ts = tables.tree_sequence() + + result = ts.individuals_nodes + + assert result.shape == (3, 3) + + expected = np.array( + [[0, 1, -1], [2, -1, -1], [3, 4, 5]] # Diploid # Haploid # Triploid + ) + assert_array_equal(result, expected) + + def test_no_individuals(self): + tables = tskit.TableCollection(sequence_length=100) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + result = ts.individuals_nodes + expected = np.array([], dtype=np.int32).reshape(0, 0) + assert result.shape == (0, 0) + assert_array_equal(result, expected) + + def test_no_nodes_with_individuals(self): + tables = tskit.TableCollection(sequence_length=100) + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + # Node without individual reference + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + ts = tables.tree_sequence() + + result = ts.individuals_nodes + expected = np.array([[]]) + assert result.shape == (1, 0) + assert_array_equal(result, expected) + + def test_individual_with_no_nodes(self): + tables = tskit.TableCollection(sequence_length=100) + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + # Only add nodes for first individual + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + ts = tables.tree_sequence() + + result = ts.individuals_nodes + expected = np.array([[0], [-1]]) + assert result.shape == (2, 1) + assert_array_equal(result, expected) + + def test_mixed_sample_status(self): + tables = tskit.TableCollection(sequence_length=100) + tables.individuals.add_row(flags=0, location=(0, 0), metadata=b"") + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0, individual=0) + tables.nodes.add_row(flags=0, time=0, individual=0) + ts = tables.tree_sequence() + + result = ts.individuals_nodes + expected = np.array([[0, 1]]) + assert result.shape == (1, 2) + assert_array_equal(result, expected) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 7d0efae1c1..49dadd301f 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1949,6 +1949,31 @@ def test_array_lifetime(self, name, ts_fixture): a2[:] = 0 assert a3 is not a2 + def test_individuals_nodes(self, ts_fixture): + ts_fixture = ts_fixture.ll_tree_sequence + + # Properties + a = ts_fixture.get_individuals_nodes() + assert a.flags.aligned + assert a.flags.c_contiguous + assert a.flags.owndata + b = ts_fixture.get_individuals_nodes() + assert a is not b + assert np.all(a == b) + + # Lifetime + a1 = ts_fixture.get_individuals_nodes() + a2 = a1.copy() + assert a1 is not a2 + del ts_fixture + # Do some memory operations + a3 = np.ones(10**6) + assert np.all(a1 == a2) + del a1 + # Just do something to touch memory + a2[:] = 0 + assert a3 is not a2 + class StatsInterfaceMixin: """ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index ca03d22d3c..822ba78281 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -4123,6 +4123,7 @@ def __init__(self, ll_tree_sequence): self._individuals_time = None self._individuals_population = None self._individuals_location = None + self._individuals_nodes = None # NOTE: when we've implemented read-only access via the underlying # tables we can replace these arrays with reference to the read-only # tables here (and remove the low-level boilerplate). @@ -5814,6 +5815,20 @@ def individuals_metadata(self): self._individuals_metadata ) + @property + def individuals_nodes(self): + """ + Return an array of node IDs for each individual in the tree sequence. + + :return: Array of shape (num_individuals, max_ploidy) containing node IDs. + Values of -1 indicate unused slots for individuals with ploidy + less than the maximum. + :rtype: numpy.ndarray (dtype=np.int32) + """ + if self._individuals_nodes is None: + self._individuals_nodes = self._ll_tree_sequence.get_individuals_nodes() + return self._individuals_nodes + @property def nodes_metadata(self): """ From 0ab28259e8daf02cae0d7bee2064d1dd46ea8efe Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 6 May 2025 14:59:17 +0100 Subject: [PATCH 3/3] Update dev version --- python/tskit/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tskit/_version.py b/python/tskit/_version.py index d5e22a2f3f..8750fe5d4e 100644 --- a/python/tskit/_version.py +++ b/python/tskit/_version.py @@ -1,4 +1,4 @@ # Definitive location for the version number. # During development, should be x.y.z.devN # For beta should be x.y.zbN -tskit_version = "0.6.3" +tskit_version = "0.6.4.dev0"