|
| 1 | +#' Describes a dataset |
| 2 | +#' |
| 3 | +#' Note that the current version is in the beta stadium at best, that means the |
| 4 | +#' R-native formats (data.frame, dplyr/tibble, or data.table) are a lot faster |
| 5 | +#' than arrow or SQL-based datasets. |
| 6 | +#' |
| 7 | +#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`], |
| 8 | +#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection) |
| 9 | +#' |
| 10 | +#' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing |
| 11 | +#' a summary of the dataset given |
| 12 | +#' @export |
| 13 | +#' |
| 14 | +#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html), |
| 15 | +#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary), |
| 16 | +#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html) |
| 17 | +#' |
| 18 | +#' @examples |
| 19 | +#' describe(mtcars) |
| 20 | +describe <- function(x) { |
| 21 | + |
| 22 | + backend <- detect_backend(x) |
| 23 | + |
| 24 | + # make sure the input dataset has the right class |
| 25 | + if (class(x)[[1]] == "data.frame") { |
| 26 | + if (backend == "data.table") { |
| 27 | + x <- data.table::as.data.table(x) |
| 28 | + } else if (backend == "dplyr") { |
| 29 | + x <- dplyr::as_tibble(x) |
| 30 | + } |
| 31 | + } |
| 32 | + |
| 33 | + if (backend == "base-r") { |
| 34 | + describe_base_r(x) |
| 35 | + } else if (backend == "dplyr") { |
| 36 | + describe_dplyr(x) |
| 37 | + } else if (backend == "data.table") { |
| 38 | + describe_data.table(x) |
| 39 | + } else if (backend == "collectibles") { |
| 40 | + if ("tbl_sql" %in% class(x)) { |
| 41 | + describe_sql(x) |
| 42 | + } else if ("ArrowObject" %in% class(x)) { |
| 43 | + describe_arrow(x) |
| 44 | + } |
| 45 | + } else { |
| 46 | + stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", "))) |
| 47 | + } |
| 48 | +} |
| 49 | + |
| 50 | + |
| 51 | +# internal function to see which values should use the min/max etc part |
| 52 | +is_numeric <- function(v) { |
| 53 | + any(class(v) %in% c("integer", "numeric", "POSIXt")) |
| 54 | +} |
| 55 | + |
| 56 | +# x <- mtcars |
| 57 | +describe_base_r <- function(x, max_n = 3) { |
| 58 | + ll <- lapply( |
| 59 | + seq(ncol(x)), |
| 60 | + function(i) { |
| 61 | + v <- x[[i]] |
| 62 | + type <- class(v)[[1]] |
| 63 | + is_num <- is_numeric(v) |
| 64 | + |
| 65 | + tbl <- table(v) |
| 66 | + uv <- unique(v) |
| 67 | + tab <- tabulate(match(v, uv)) |
| 68 | + tab_max <- which(tab == max(tab)) |
| 69 | + # get the indices of the three highest counts |
| 70 | + od <- order(tab, decreasing = TRUE)[seq(min(max_n, length(tab)))] |
| 71 | + |
| 72 | + nz <- if (!is_num) nchar(as.character(v)) |
| 73 | + |
| 74 | + data.frame( |
| 75 | + var = names(x)[[i]], |
| 76 | + type = type, |
| 77 | + n = length(v), |
| 78 | + n_distinct = length(unique(v)), |
| 79 | + n_na = sum(is.na(v)), |
| 80 | + most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]), |
| 81 | + collapse = ", "), |
| 82 | + |
| 83 | + min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)), |
| 84 | + mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)), |
| 85 | + median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)), |
| 86 | + max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)), |
| 87 | + sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE)) |
| 88 | + ) |
| 89 | + } |
| 90 | + ) |
| 91 | + |
| 92 | + do.call(rbind, ll) |
| 93 | +} |
| 94 | + |
| 95 | +# x <- mtcars |> tibble::as_tibble() |
| 96 | +describe_dplyr <- function(x, max_n = 3) { |
| 97 | + ll <- lapply( |
| 98 | + names(x), |
| 99 | + function(v) { |
| 100 | + mc <- x |> |
| 101 | + dplyr::count(.data[[v]]) |> |
| 102 | + dplyr::slice_max(n, n = max_n, with_ties = FALSE) |
| 103 | + |
| 104 | + type <- class(mc[[1]])[[1]] |
| 105 | + is_num <- is_numeric(mc[[1]]) |
| 106 | + mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ") |
| 107 | + |
| 108 | + nz <- if (!is_num) nchar(as.character(x[[v]])) |
| 109 | + x |> |
| 110 | + dplyr::summarise( |
| 111 | + var = v, |
| 112 | + type = type, |
| 113 | + n = dplyr::n(), |
| 114 | + n_distinct = dplyr::n_distinct(.data[[v]]), |
| 115 | + n_na = sum(is.na(.data[[v]])), |
| 116 | + most_frequent = mf, |
| 117 | + min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)), |
| 118 | + mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)), |
| 119 | + median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)), |
| 120 | + max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)), |
| 121 | + sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE)) |
| 122 | + ) |
| 123 | + } |
| 124 | + ) |
| 125 | + |
| 126 | + dplyr::bind_rows(ll) |
| 127 | +} |
| 128 | + |
| 129 | +# x <- mtcars |> data.table::as.data.table() |
| 130 | +describe_data.table <- function(x, max_n = 3) { |
| 131 | + ll <- lapply( |
| 132 | + names(x), |
| 133 | + function(v) { |
| 134 | + mc <- x[, .(n = .N), by = v][order(n, decreasing = TRUE)][seq(max_n)] |
| 135 | + |
| 136 | + type <- class(mc[[1]])[[1]] |
| 137 | + is_num <- is_numeric(mc[[1]]) |
| 138 | + mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ") |
| 139 | + |
| 140 | + nz <- if (!is_num) nchar(as.character(x[[v]])) |
| 141 | + |
| 142 | + x[, .( |
| 143 | + var = v, |
| 144 | + type = type, |
| 145 | + n = .N, |
| 146 | + n_distinct = data.table::uniqueN(get(v)), |
| 147 | + n_na = sum(is.na(get(v))), |
| 148 | + most_frequent = mf, |
| 149 | + |
| 150 | + min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)), |
| 151 | + mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)), |
| 152 | + median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)), |
| 153 | + max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)), |
| 154 | + sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE)) |
| 155 | + )] |
| 156 | + } |
| 157 | + ) |
| 158 | + |
| 159 | + data.table::rbindlist(ll) |
| 160 | +} |
| 161 | + |
| 162 | + |
| 163 | +# RSQLite, duckdb etc |
| 164 | +describe_sql <- function(x, max_n = 3) { |
| 165 | + ll <- lapply(names(x), function(v) { |
| 166 | + mc <- x |> |
| 167 | + dplyr::count(.data[[v]]) |> |
| 168 | + dplyr::slice_max(n, n = max_n, with_ties = FALSE) |> |
| 169 | + dplyr::collect() |
| 170 | + |
| 171 | + type <- class(mc[[1]])[[1]] |
| 172 | + is_num <- is_numeric(mc[[1]]) |
| 173 | + mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ") |
| 174 | + nn <- x |> |
| 175 | + dplyr::distinct(.data[[v]]) |> |
| 176 | + dplyr::summarise(n = dplyr::n()) |> |
| 177 | + dplyr::collect() |
| 178 | + nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow() |
| 179 | + |
| 180 | + r <- dplyr::tibble( |
| 181 | + var = v, |
| 182 | + type = type, |
| 183 | + n_distinct = nn[[1]], |
| 184 | + n_na = nna, |
| 185 | + most_frequent = mf |
| 186 | + ) |
| 187 | + |
| 188 | + xx <- x |> |
| 189 | + dplyr::select(dplyr::all_of(v)) |> |
| 190 | + dplyr::rename(x := dplyr::all_of(v)) |
| 191 | + if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x))) |
| 192 | + |
| 193 | + |
| 194 | + rr <- try( |
| 195 | + xx |> |
| 196 | + dplyr::summarise( |
| 197 | + min = min(x, na.rm = TRUE), |
| 198 | + mean = mean(x, na.rm = TRUE), |
| 199 | + median = median(x, na.rm = TRUE), |
| 200 | + max = max(x, na.rm = TRUE), |
| 201 | + sd = sd(x, na.rm = TRUE) |
| 202 | + ) |> |
| 203 | + dplyr::collect(), |
| 204 | + silent = TRUE |
| 205 | + ) |
| 206 | + if (inherits(rr, "try-error")) { |
| 207 | + rr <- dplyr::tibble( |
| 208 | + min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_, |
| 209 | + sd = NA_real_ |
| 210 | + ) |
| 211 | + } |
| 212 | + |
| 213 | + dplyr::bind_cols(r, rr) |
| 214 | + }) |
| 215 | + |
| 216 | + dplyr::bind_rows(ll) |
| 217 | +} |
| 218 | + |
| 219 | +# arrow::write_parquet(nycflights13::flights, "flights.parquet") |
| 220 | +# x <- arrow::open_dataset("flights.parquet") |
| 221 | +describe_arrow <- function(x, max_n = 3) { |
| 222 | + # if x is a dbplyr connection string |
| 223 | + ll <- lapply(names(x), function(v) { |
| 224 | + mc <- x |> |
| 225 | + dplyr::count(.data[[v]]) |> |
| 226 | + dplyr::slice_max(n, n = max_n, with_ties = FALSE) |> |
| 227 | + dplyr::collect() |
| 228 | + |
| 229 | + type <- class(mc[[1]])[[1]] |
| 230 | + is_num <- is_numeric(mc[[1]]) |
| 231 | + mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ") |
| 232 | + nn <- x |> |
| 233 | + dplyr::distinct(.data[[v]]) |> |
| 234 | + dplyr::summarise(n = dplyr::n()) |> |
| 235 | + dplyr::collect() |
| 236 | + nna <- x |> dplyr::filter(is.na(.data[[v]])) |> nrow() |
| 237 | + |
| 238 | + r <- dplyr::tibble( |
| 239 | + var = v, |
| 240 | + type = type, |
| 241 | + n_distinct = nn[[1]], |
| 242 | + n_na = nna, |
| 243 | + most_frequent = mf |
| 244 | + ) |
| 245 | + |
| 246 | + if (is_num) { |
| 247 | + xx <- x |> dplyr::transmute(x = get(v)) |
| 248 | + } else { |
| 249 | + xx <- x |> dplyr::transmute(x = nchar(as.character(get(v)))) |
| 250 | + } |
| 251 | + |
| 252 | + suppressWarnings({ |
| 253 | + rr <- try( |
| 254 | + xx |> |
| 255 | + dplyr::summarise( |
| 256 | + min = min(x, na.rm = TRUE), |
| 257 | + mean = mean(x, na.rm = TRUE), |
| 258 | + median = median(x, na.rm = TRUE), |
| 259 | + max = max(x, na.rm = TRUE), |
| 260 | + sd = sd(x, na.rm = TRUE) |
| 261 | + ) |> |
| 262 | + dplyr::collect(), |
| 263 | + silent = TRUE) |
| 264 | + }) |
| 265 | + if (inherits(rr, "try-error")) { |
| 266 | + rr <- dplyr::tibble( |
| 267 | + min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_, |
| 268 | + sd = NA_real_ |
| 269 | + ) |
| 270 | + } |
| 271 | + |
| 272 | + dplyr::bind_cols(r, rr) |
| 273 | + }) |
| 274 | + |
| 275 | + dplyr::bind_rows(ll) |
| 276 | +} |
0 commit comments