Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions RcppTskit/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ and releases adhere to [Semantic Versioning](https://semver.org/spec/v2.0.0.html
- Added `rtsk_mutation_table_add_row()` and
`TableCollection$mutation_table_add_row()` to append mutation rows from
\code{R}, mirroring `tsk_mutation_table_add_row()`.
- Added `rtsk_node_table_get_row()` and `TableCollection$node_table_get_row()`
to retrieve node-table rows by 0-based row ID.
- Added `rtsk_table_collection_sort()` and `TableCollection$sort()` to sort
table collections with 0-based `edge_start` semantics.
- Added low-level variant iterators
(`rtsk_variant_iterator_init()`/`rtsk_variant_iterator_next()`) and a
user-facing `TreeSequence$variants()` method to iterate over decoded
site-by-site variants from \code{R}, aligned with `tskit` Python API
semantics for `samples`, `isolated_as_missing`, `alleles`, and
`left`/`right` intervals.
- Added `rtsk_treeseq_get_samples()` and `TreeSequence$samples()` to retrieve
sample node IDs from a tree sequence.
- TODO

### Changed
Expand Down
65 changes: 65 additions & 0 deletions RcppTskit/R/Class-TableCollection.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,50 @@ TableCollection <- R6Class(
TreeSequence$new(xptr = ts_xptr)
},

# TODO: add site_start and mutation_start
#' @description Sort this table collection in place.
#' @param edge_start integer scalar edge-table start row ID (0-based).
#' # TODO: remove this arg since Python code does not have it
#' @param no_check_integrity logical; when \code{TRUE}, pass
#' \code{TSK_NO_CHECK_INTEGRITY} to \code{tskit C}.
#' @details See the \code{tskit Python} equivalent at
#' \url{https://tskit.dev/tskit/docs/latest/python-api.html#tskit.TableCollection.sort}.
#' @return No return value; called for side effects.
#' @examples
#' ts_file <- system.file("examples/test.trees", package = "RcppTskit")
#' tc <- tc_load(ts_file)
#' tc$sort()
sort = function(edge_start = 0L, no_check_integrity = FALSE) {
Comment thread
LynxJinyangii marked this conversation as resolved.
Outdated
if (
is.null(edge_start) ||
!is.integer(edge_start) ||
length(edge_start) != 1L ||
is.na(edge_start) ||
edge_start < 0L
) {
stop("edge_start must be a non-NA positive integer scalar!")
}
# TODO: remove this arg since Python code does not have it
if (
!is.logical(no_check_integrity) ||
length(no_check_integrity) != 1L ||
is.na(no_check_integrity)
) {
stop("no_check_integrity must be TRUE/FALSE!")
}
options <- if (isTRUE(no_check_integrity)) {
as.integer(rtsk_const_tsk_no_check_integrity())
} else {
0L
}
rtsk_table_collection_sort(
tc = self$xptr,
edge_start = as.integer(edge_start),
# TODO: remove this arg since Python code does not have it
options = options
)
},

#' @description Get the number of provenances in a table collection.
#' @return A signed 64 bit integer \code{bit64::integer64}.
#' @examples
Expand Down Expand Up @@ -218,6 +262,27 @@ TableCollection <- R6Class(
rtsk_table_collection_get_num_nodes(self$xptr)
},

#' @description Get one row from the nodes table.
#' @param row_id integer scalar node row ID (0-based).
#' @details The ID is 0-based, matching \code{tskit C/Python} semantics.
#' @return A named list with fields \code{id}, \code{flags}, \code{time},
#' \code{population}, \code{individual}, and \code{metadata}.
#' @examples
#' ts_file <- system.file("examples/test.trees", package = "RcppTskit")
#' tc <- tc_load(ts_file)
#' tc$node_table_get_row(0L)
node_table_get_row = function(row_id) {
if (
is.null(row_id) || length(row_id) != 1L || is.na(as.integer(row_id))
) {
stop("row_id must be a non-NA integer scalar (0-based)!")
}
if (as.integer(row_id) < 0L) {
stop("row_id must be >= 0 (0-based)!")
}
rtsk_node_table_get_row(self$xptr, row_id = as.integer(row_id))
},

#' @description Add a row to the nodes table.
#' @param flags integer scalar flags for the new node.
#' @param time numeric scalar time value for the new node.
Expand Down
96 changes: 96 additions & 0 deletions RcppTskit/R/Class-TreeSequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,90 @@ TreeSequence <- R6Class(
)
},

#' @description Iterate over sites as decoded variants.
#' @param samples Optional integer vector of sample node IDs to decode.
#' @param isolated_as_missing Logical; decode isolated samples as missing
#' data (\code{TRUE}, default) or as ancestral state (\code{FALSE}).
#' @param alleles Optional character vector of allele states; when set,
#' genotypes are indexed to this allele order.
#' @param impute_missing_data Deprecated alias for
#' \code{!isolated_as_missing}.
#' @param copy Logical; currently only \code{TRUE} is supported.
#' @param left Left genomic coordinate (inclusive).
#' @param right Right genomic coordinate (exclusive). \code{NULL} means
#' sequence length.
#' @details See the \code{tskit Python} equivalent at
#' \url{https://tskit.dev/tskit/docs/latest/python-api.html#tskit.TreeSequence.variants}.
#' @return A simple iterator object with methods \code{next()} and
#' \code{next_variant()} that each return either a variant list or
#' \code{NULL} at end.
#' @examples
#' ts_file <- system.file("examples/test.trees", package = "RcppTskit")
#' ts <- ts_load(ts_file)
#' it <- ts$variants()
#' v1 <- it$next_variant()
#' v2 <- it$next_variant()
#' is.list(v1)
#' is.list(v2)
variants = function(
samples = NULL,
isolated_as_missing = TRUE,
alleles = NULL,
impute_missing_data = NULL,
copy = TRUE,
left = 0,
right = NULL
) {
if (!is.logical(copy) || length(copy) != 1 || is.na(copy)) {
stop("copy must be TRUE/FALSE!")
}
if (!copy) {
stop("copy = FALSE is not supported yet!")
}
if (!is.null(impute_missing_data)) {
if (
!is.logical(impute_missing_data) ||
length(impute_missing_data) != 1 ||
is.na(impute_missing_data)
) {
stop("impute_missing_data must be TRUE/FALSE or NULL!")
}
mapped <- !impute_missing_data
if (
!missing(isolated_as_missing) &&
!identical(isolated_as_missing, mapped)
) {
stop(
"isolated_as_missing and impute_missing_data are inconsistent!"
)
}
warning(
"impute_missing_data is deprecated; use isolated_as_missing",
call. = FALSE
)
isolated_as_missing <- mapped
}

iter_xptr <- rtsk_variant_iterator_init(
ts = self$xptr,
samples = samples,
isolated_as_missing = isolated_as_missing,
alleles = alleles,
left = left,
right = if (is.null(right)) NA_real_ else right
)

env <- new.env(parent = emptyenv())
env$iter_xptr <- iter_xptr
next_fun <- function() {
rtsk_variant_iterator_next(env$iter_xptr)
}
structure(
list(`next` = next_fun, next_variant = next_fun),
class = "rtsk_variant_iterator"
)
},

#' @description Get the number of provenances in a tree sequence.
#' @return A signed 64 bit integer \code{bit64::integer64}.
#' @details See the \code{tskit Python} equivalent at
Expand Down Expand Up @@ -221,6 +305,18 @@ TreeSequence <- R6Class(
rtsk_treeseq_get_num_samples(self$xptr)
},

#' @description Get sample node IDs in this tree sequence.
#' @return An integer vector with sample node IDs (0-based).
#' @details See the \code{tskit Python} equivalent at
#' \url{https://tskit.dev/tskit/docs/latest/python-api.html#tskit.TreeSequence.samples}.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Python method has population, population_id, and time arguments, so if we implement the samples method in R we should mirror what the Python method does.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this implementation was driven by the C method, which only accepts TreeSequence, but let's ensure R API matches Python API - looking at the source of https://tskit.dev/tskit/docs/latest/_modules/tskit/trees.html#TreeSequence.samples I see that these args are all handled on the Python side (first all sample IDs are obtained by calling the low-level C function and then subsetting is done in Python - suggesting we should handle these also on R side).

Looking at the docs I now also see that population_id is deprecated, so best skipping it in our implementation.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@LynxJinyangii this one also has TODO that is not addressed

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

population and time added as arguments.

#' @examples
#' ts_file <- system.file("examples/test.trees", package = "RcppTskit")
#' ts <- ts_load(ts_file)
#' ts$samples()
samples = function() {
rtsk_treeseq_get_samples(self$xptr)
},

#' @description Get the number of nodes in a tree sequence.
#' @return A signed 64 bit integer \code{bit64::integer64}.
#' @details See the \code{tskit Python} equivalent at
Expand Down
36 changes: 36 additions & 0 deletions RcppTskit/R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

rtsk_variant_iterator_init <- function(ts, samples = NULL, isolated_as_missing = TRUE, alleles = NULL, left = 0.0, right = NA_real_) {
.Call(`_RcppTskit_rtsk_variant_iterator_init`, ts, samples, isolated_as_missing, alleles, left, right)
}

rtsk_variant_iterator_next <- function(iterator) {
.Call(`_RcppTskit_rtsk_variant_iterator_next`, iterator)
}

test_rtsk_variant_iterator_force_null_first_allele <- function(enabled) {
invisible(.Call(`_RcppTskit_test_rtsk_variant_iterator_force_null_first_allele`, enabled))
}

test_rtsk_variant_iterator_set_site_bounds <- function(iterator, next_site_id, stop_site_id) {
invisible(.Call(`_RcppTskit_test_rtsk_variant_iterator_set_site_bounds`, iterator, next_site_id, stop_site_id))
}

test_variant_site_index_range <- function(start, stop) {
invisible(.Call(`_RcppTskit_test_variant_site_index_range`, start, stop))
}

test_validate_options <- function(options, supported) {
.Call(`_RcppTskit_test_validate_options`, options, supported)
}
Expand Down Expand Up @@ -31,6 +51,10 @@ tskit_version <- function() {
.Call(`_RcppTskit_tskit_version`)
}

rtsk_const_tsk_no_check_integrity <- function() {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since Python sort method does not have options I assume we do not need rtsk_const_tsk_no_check_integrity at all?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removed

.Call(`_RcppTskit_rtsk_const_tsk_no_check_integrity`)
}

rtsk_treeseq_load <- function(filename, options = 0L) {
.Call(`_RcppTskit_rtsk_treeseq_load`, filename, options)
}
Expand Down Expand Up @@ -75,6 +99,10 @@ rtsk_treeseq_get_num_samples <- function(ts) {
.Call(`_RcppTskit_rtsk_treeseq_get_num_samples`, ts)
}

rtsk_treeseq_get_samples <- function(ts) {
.Call(`_RcppTskit_rtsk_treeseq_get_samples`, ts)
}

rtsk_treeseq_get_num_nodes <- function(ts) {
.Call(`_RcppTskit_rtsk_treeseq_get_num_nodes`, ts)
}
Expand Down Expand Up @@ -195,6 +223,10 @@ rtsk_table_collection_drop_index <- function(tc, options = 0L) {
invisible(.Call(`_RcppTskit_rtsk_table_collection_drop_index`, tc, options))
}

rtsk_table_collection_sort <- function(tc, edge_start = 0L, options = 0L) {
invisible(.Call(`_RcppTskit_rtsk_table_collection_sort`, tc, edge_start, options))
}

rtsk_table_collection_summary <- function(tc) {
.Call(`_RcppTskit_rtsk_table_collection_summary`, tc)
}
Expand All @@ -211,6 +243,10 @@ rtsk_node_table_add_row <- function(tc, flags = 0L, time = 0, population = -1L,
.Call(`_RcppTskit_rtsk_node_table_add_row`, tc, flags, time, population, individual, metadata)
}

rtsk_node_table_get_row <- function(tc, row_id) {
.Call(`_RcppTskit_rtsk_node_table_get_row`, tc, row_id)
}

rtsk_edge_table_add_row <- function(tc, left, right, parent, child, metadata = NULL) {
.Call(`_RcppTskit_rtsk_edge_table_add_row`, tc, left, right, parent, child, metadata)
}
Expand Down
10 changes: 10 additions & 0 deletions RcppTskit/inst/include/RcppTskit_public.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
// RcppTskit.cpp
Rcpp::IntegerVector kastore_version();
Rcpp::IntegerVector tskit_version();
int rtsk_const_tsk_no_check_integrity();

// sync default options with .cpp!
SEXP rtsk_treeseq_load(const std::string &filename, int options = 0);
Expand All @@ -18,12 +19,19 @@ void rtsk_table_collection_dump(SEXP tc, const std::string &filename,
int options = 0);
SEXP rtsk_treeseq_copy_tables(SEXP ts, int options = 0);
SEXP rtsk_treeseq_init(SEXP tc, int options = 0);
SEXP rtsk_variant_iterator_init(
SEXP ts, Rcpp::Nullable<Rcpp::IntegerVector> samples = R_NilValue,
bool isolated_as_missing = true,
Rcpp::Nullable<Rcpp::CharacterVector> alleles = R_NilValue,
double left = 0.0, double right = NA_REAL);
SEXP rtsk_variant_iterator_next(SEXP iterator);

SEXP rtsk_treeseq_get_num_provenances(SEXP ts);
SEXP rtsk_treeseq_get_num_populations(SEXP ts);
SEXP rtsk_treeseq_get_num_migrations(SEXP ts);
SEXP rtsk_treeseq_get_num_individuals(SEXP ts);
SEXP rtsk_treeseq_get_num_samples(SEXP ts);
Rcpp::IntegerVector rtsk_treeseq_get_samples(SEXP ts);
SEXP rtsk_treeseq_get_num_nodes(SEXP ts);
SEXP rtsk_treeseq_get_num_edges(SEXP ts);
SEXP rtsk_treeseq_get_num_trees(SEXP ts);
Expand Down Expand Up @@ -55,6 +63,7 @@ Rcpp::String rtsk_table_collection_get_file_uuid(SEXP tc);
bool rtsk_table_collection_has_index(SEXP tc, int options = 0);
void rtsk_table_collection_build_index(SEXP tc, int options = 0);
void rtsk_table_collection_drop_index(SEXP tc, int options = 0);
void rtsk_table_collection_sort(SEXP tc, int edge_start = 0, int options = 0);
Rcpp::List rtsk_table_collection_summary(SEXP tc);
Rcpp::List rtsk_table_collection_metadata_length(SEXP tc);
int rtsk_individual_table_add_row(
Expand All @@ -65,6 +74,7 @@ int rtsk_individual_table_add_row(
int rtsk_node_table_add_row(
SEXP tc, int flags = 0, double time = 0, int population = -1,
int individual = -1, Rcpp::Nullable<Rcpp::RawVector> metadata = R_NilValue);
Rcpp::List rtsk_node_table_get_row(SEXP tc, int row_id);
int rtsk_edge_table_add_row(
SEXP tc, double left, double right, int parent, int child,
Rcpp::Nullable<Rcpp::RawVector> metadata = R_NilValue);
Expand Down
Loading
Loading