Skip to content

Commit d328606

Browse files
authored
Merge pull request #10 from DavZim/describe
add first draft of describe (#5)
2 parents 2118754 + febca9e commit d328606

7 files changed

Lines changed: 526 additions & 32 deletions

File tree

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ S3method("+",ruleset)
55
S3method(print,rule)
66
S3method(print,ruleset)
77
export(check_data)
8+
export(describe)
89
export(detect_backend)
910
export(filter_fails)
1011
export(plot_res)

R/check_data.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -103,15 +103,15 @@ detect_backend <- function(x) {
103103

104104
} else if ("tbl_sql" %in% cc) {
105105

106-
if (!has_pkg("DBI"))
107-
stop("The DBI package needs to be installed in order to test a tbl_sql.")
106+
if (!has_pkg("DBI") || !has_pkg("dbplyr"))
107+
stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")
108108

109109
backend <- "collectibles"
110110

111111
} else if ("ArrowObject" %in% cc) {
112112

113-
if (!has_pkg("arrow"))
114-
stop("The arrow package needs to be installed in order to test an ArrowObject.")
113+
if (!has_pkg("arrow") || !has_pkg("dbplyr"))
114+
stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")
115115

116116
backend <- "collectibles"
117117

R/describe.R

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
#' Describes a dataset
2+
#'
3+
#' Note that the current version is in the beta stadium at best, that means the
4+
#' R-native formats (data.frame, dplyr/tibble, or data.table) are a lot faster
5+
#' than arrow or SQL-based datasets.
6+
#'
7+
#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
8+
#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
9+
#'
10+
#' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing
11+
#' a summary of the dataset given
12+
#' @export
13+
#'
14+
#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
15+
#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
16+
#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
17+
#'
18+
#' @examples
19+
#' describe(mtcars)
20+
describe <- function(x) {
21+
22+
backend <- detect_backend(x)
23+
24+
# make sure the input dataset has the right class
25+
if (class(x)[[1]] == "data.frame") {
26+
if (backend == "data.table") {
27+
x <- data.table::as.data.table(x)
28+
} else if (backend == "dplyr") {
29+
x <- dplyr::as_tibble(x)
30+
}
31+
}
32+
33+
if (backend == "base-r") {
34+
describe_base_r(x)
35+
} else if (backend == "dplyr") {
36+
describe_dplyr(x)
37+
} else if (backend == "data.table") {
38+
describe_data.table(x)
39+
} else if (backend == "collectibles") {
40+
if ("tbl_sql" %in% class(x)) {
41+
describe_sql(x)
42+
} else if ("ArrowObject" %in% class(x)) {
43+
describe_arrow(x)
44+
}
45+
} else {
46+
stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", ")))
47+
}
48+
}
49+
50+
51+
# internal function to see which values should use the min/max etc part
52+
is_numeric <- function(v) {
53+
any(class(v) %in% c("integer", "numeric", "POSIXt"))
54+
}
55+
56+
# x <- mtcars
57+
describe_base_r <- function(x, max_n = 3) {
58+
ll <- lapply(
59+
seq(ncol(x)),
60+
function(i) {
61+
v <- x[[i]]
62+
type <- class(v)[[1]]
63+
is_num <- is_numeric(v)
64+
65+
tbl <- table(v)
66+
uv <- unique(v)
67+
tab <- tabulate(match(v, uv))
68+
tab_max <- which(tab == max(tab))
69+
# get the indices of the three highest counts
70+
od <- order(tab, decreasing = TRUE)[seq(min(max_n, length(tab)))]
71+
72+
nz <- if (!is_num) nchar(as.character(v))
73+
74+
data.frame(
75+
var = names(x)[[i]],
76+
type = type,
77+
n = length(v),
78+
n_distinct = length(unique(v)),
79+
n_na = sum(is.na(v)),
80+
most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]),
81+
collapse = ", "),
82+
83+
min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)),
84+
mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)),
85+
median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)),
86+
max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)),
87+
sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE))
88+
)
89+
}
90+
)
91+
92+
do.call(rbind, ll)
93+
}
94+
95+
# x <- mtcars |> tibble::as_tibble()
96+
describe_dplyr <- function(x, max_n = 3) {
97+
ll <- lapply(
98+
names(x),
99+
function(v) {
100+
mc <- x |>
101+
dplyr::count(.data[[v]]) |>
102+
dplyr::slice_max(n, n = max_n, with_ties = FALSE)
103+
104+
type <- class(mc[[1]])[[1]]
105+
is_num <- is_numeric(mc[[1]])
106+
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
107+
108+
nz <- if (!is_num) nchar(as.character(x[[v]]))
109+
x |>
110+
dplyr::summarise(
111+
var = v,
112+
type = type,
113+
n = dplyr::n(),
114+
n_distinct = dplyr::n_distinct(.data[[v]]),
115+
n_na = sum(is.na(.data[[v]])),
116+
most_frequent = mf,
117+
min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
118+
mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
119+
median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
120+
max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
121+
sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
122+
)
123+
}
124+
)
125+
126+
dplyr::bind_rows(ll)
127+
}
128+
129+
# x <- mtcars |> data.table::as.data.table()
130+
describe_data.table <- function(x, max_n = 3) {
131+
ll <- lapply(
132+
names(x),
133+
function(v) {
134+
mc <- x[, .(n = .N), by = v][order(n, decreasing = TRUE)][seq(max_n)]
135+
136+
type <- class(mc[[1]])[[1]]
137+
is_num <- is_numeric(mc[[1]])
138+
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
139+
140+
nz <- if (!is_num) nchar(as.character(x[[v]]))
141+
142+
x[, .(
143+
var = v,
144+
type = type,
145+
n = .N,
146+
n_distinct = data.table::uniqueN(get(v)),
147+
n_na = sum(is.na(get(v))),
148+
most_frequent = mf,
149+
150+
min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)),
151+
mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)),
152+
median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)),
153+
max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)),
154+
sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE))
155+
)]
156+
}
157+
)
158+
159+
data.table::rbindlist(ll)
160+
}
161+
162+
163+
# RSQLite, duckdb etc
164+
describe_sql <- function(x, max_n = 3) {
165+
ll <- lapply(names(x), function(v) {
166+
mc <- x |>
167+
dplyr::count(.data[[v]]) |>
168+
dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
169+
dplyr::collect()
170+
171+
type <- class(mc[[1]])[[1]]
172+
is_num <- is_numeric(mc[[1]])
173+
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
174+
nn <- x |>
175+
dplyr::distinct(.data[[v]]) |>
176+
dplyr::summarise(n = dplyr::n()) |>
177+
dplyr::collect()
178+
nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
179+
180+
r <- dplyr::tibble(
181+
var = v,
182+
type = type,
183+
n_distinct = nn[[1]],
184+
n_na = nna,
185+
most_frequent = mf
186+
)
187+
188+
xx <- x |>
189+
dplyr::select(dplyr::all_of(v)) |>
190+
dplyr::rename(x := dplyr::all_of(v))
191+
if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x)))
192+
193+
194+
rr <- try(
195+
xx |>
196+
dplyr::summarise(
197+
min = min(x, na.rm = TRUE),
198+
mean = mean(x, na.rm = TRUE),
199+
median = median(x, na.rm = TRUE),
200+
max = max(x, na.rm = TRUE),
201+
sd = sd(x, na.rm = TRUE)
202+
) |>
203+
dplyr::collect(),
204+
silent = TRUE
205+
)
206+
if (inherits(rr, "try-error")) {
207+
rr <- dplyr::tibble(
208+
min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
209+
sd = NA_real_
210+
)
211+
}
212+
213+
dplyr::bind_cols(r, rr)
214+
})
215+
216+
dplyr::bind_rows(ll)
217+
}
218+
219+
# arrow::write_parquet(nycflights13::flights, "flights.parquet")
220+
# x <- arrow::open_dataset("flights.parquet")
221+
describe_arrow <- function(x, max_n = 3) {
222+
# if x is a dbplyr connection string
223+
ll <- lapply(names(x), function(v) {
224+
mc <- x |>
225+
dplyr::count(.data[[v]]) |>
226+
dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
227+
dplyr::collect()
228+
229+
type <- class(mc[[1]])[[1]]
230+
is_num <- is_numeric(mc[[1]])
231+
mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
232+
nn <- x |>
233+
dplyr::distinct(.data[[v]]) |>
234+
dplyr::summarise(n = dplyr::n()) |>
235+
dplyr::collect()
236+
nna <- x |> dplyr::filter(is.na(.data[[v]])) |> nrow()
237+
238+
r <- dplyr::tibble(
239+
var = v,
240+
type = type,
241+
n_distinct = nn[[1]],
242+
n_na = nna,
243+
most_frequent = mf
244+
)
245+
246+
if (is_num) {
247+
xx <- x |> dplyr::transmute(x = get(v))
248+
} else {
249+
xx <- x |> dplyr::transmute(x = nchar(as.character(get(v))))
250+
}
251+
252+
suppressWarnings({
253+
rr <- try(
254+
xx |>
255+
dplyr::summarise(
256+
min = min(x, na.rm = TRUE),
257+
mean = mean(x, na.rm = TRUE),
258+
median = median(x, na.rm = TRUE),
259+
max = max(x, na.rm = TRUE),
260+
sd = sd(x, na.rm = TRUE)
261+
) |>
262+
dplyr::collect(),
263+
silent = TRUE)
264+
})
265+
if (inherits(rr, "try-error")) {
266+
rr <- dplyr::tibble(
267+
min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
268+
sd = NA_real_
269+
)
270+
}
271+
272+
dplyr::bind_cols(r, rr)
273+
})
274+
275+
dplyr::bind_rows(ll)
276+
}

README.Rmd

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ That way, you can concentrate on writing the rules and making sure that your dat
3232

3333
The package is lightweight as all the heavy dependencies are Suggests-only, that means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc) unless you explicitly tell R to install all suggested packages as well when installing the package.
3434

35-
The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages.
35+
The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages (see also `?detect_backend(data)`).
3636
By using the underlying technologies and handing over all evaluation of code to the backend, this package can deal with all sizes of data the backends can deal with.
3737

38+
The package also has a helper function to describe a dataset, see `?describe()`.
3839

3940
## Installation
4041

@@ -63,6 +64,9 @@ At the moment rules work in a window/vectorized approach only, that means that a
6364
```{r example, message=FALSE}
6465
library(dataverifyr)
6566
67+
# create a dataset
68+
data <- mtcars
69+
6670
# define a rule set within our R code; alternatively in a yaml file
6771
rules <- ruleset(
6872
rule(mpg > 10 & mpg < 30), # mpg goes up to 34
@@ -73,8 +77,11 @@ rules <- ruleset(
7377
# print the rules
7478
rules
7579
80+
# describe the dataset
81+
describe(data)
82+
7683
# check if the data matches our rules
77-
res <- check_data(mtcars, rules)
84+
res <- check_data(data, rules)
7885
res
7986
```
8087

@@ -206,9 +213,8 @@ if (!file.exists(file)) download.file(url, file, method = "curl")
206213
file.size(file) / 1e6 # in MB
207214
208215
# quick check of the filesize
209-
d <- read_parquet(file)
210-
dim(d)
211-
names(d)
216+
d <- open_dataset(file)
217+
describe(d)
212218
213219
# write the dataset to disk
214220
write_dataset(d, "nyc-taxi-data")

0 commit comments

Comments
 (0)