diff --git a/.Rbuildignore b/.Rbuildignore
index 6ae4005..46f8de9 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -10,3 +10,7 @@
 ^\.github$
 ^CRAN-SUBMISSION$
 ^cran-comments\.md$
+^.lintr$
+^yellow_tripdata_2018-01.parquet$
+^AGENTS.md$
+^duckdb_*.tar.gz$
diff --git a/.gitignore b/.gitignore
index f264c84..3bfb922 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 .Rhistory
 .RData
 .Ruserdata
+.Renviron
 
 *.parquet
 nyc-taxi-data/*
diff --git a/.lintr b/.lintr
new file mode 100644
index 0000000..b366d6f
--- /dev/null
+++ b/.lintr
@@ -0,0 +1,6 @@
+linters: linters_with_defaults(
+    line_length_linter(100),
+    object_usage_linter = NULL
+  )
+exclusions: list(
+  )
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..4d64a87
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,19 @@
+# Dataverifyr
+
+An R Library to verify that data exists and is valid, see comprehensive README.md.
+
+All code must be thoroughly tested and follow modern R standards.
+Test first: Before writing any code write the expected tests!
+
+All material updates to the code base itself must be registered in the NEWS.md file.
+
+Never change .Rd files, instead change the roxygen2 documentation above the function (use `devtools::document()` to create the .Rd documentation).
+
+The package is build with efficiency in mind.
+This shows for example that most operations are pushed to the database layer if possible.
+Also, additional dependencies need to be explicity allowed before they are added!
+
+When invoking R, use `/opt/R/4.4.0/bin/R` or `/opt/R/4.4.0/bin/Rscript`.
+
+To check, use `rcmdcheck::rcmdcheck()`.
+
diff --git a/DESCRIPTION b/DESCRIPTION
index e3f307a..812050f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dataverifyr
 Type: Package
 Title: A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data
-Version: 0.1.9
+Version: 0.1.10
 Authors@R: c(
       person(given = "David",
              family = "Zimmermann-Kollenda",
@@ -35,4 +35,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.3
diff --git a/NAMESPACE b/NAMESPACE
index 465c8fc..581f863 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,13 +5,16 @@ S3method("+",ruleset)
 S3method(print,rule)
 S3method(print,ruleset)
 export(check_data)
+export(data_column)
 export(describe)
 export(detect_backend)
 export(filter_fails)
 export(plot_res)
 export(read_rules)
+export(reference_rule)
 export(rule)
 export(ruleset)
+export(sample_data)
 export(write_rules)
 importFrom(graphics,axis)
 importFrom(graphics,barplot)
diff --git a/NEWS.md b/NEWS.md
index 5809429..8dc6b52 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,16 @@
+# dataverifyr 0.1.10
+
+* Add `describe()` to describe a dataset
+* `check_data()` now includes schema checks in the output by default (`check_type` as first result column), including explicit rows for column existence and declared type checks
+* add `stop_on_schema_fail` to `check_data()` to optionally stop when schema checks fail
+* update `filter_fails()` to ignore schema/reference rows and only process row rules from `check_data()` results
+* Add explicit regression test for `detect_backend()` fallback to `dplyr` when input is a `data.frame` and `data.table` is unavailable
+* Add structured ruleset internals for schema metadata (`data_column()`, `rule_meta()`) and reference checks (`reference_rule()`)
+* Extend `ruleset()`, `check_data()`, `read_rules()`, and `write_rules()` for v1 schema-aware workflows; keep `rule()` as row-level API (no `col_rule()`)
+* Add exported `sample_data` dataset (mixed types, NAs, datetime) for examples and tests
+* export `reference_rule()` and extend examples in `ruleset()`, `check_data()`, `reference_rule()`, and `data_column()` to show combined schema + relational workflows
+* Require DuckDB version `>= 1.5.1.9002` in all DuckDB-backed tests via `skip_if_not_installed("duckdb", "1.5.1.9002")`
+
 # dataverifyr 0.1.9
 
 * fix tests for new duckdb version (fixes #17, thanks @krlmlr for reporting)
diff --git a/R/check_data.R b/R/check_data.R
index f1b3559..8eb01ba 100644
--- a/R/check_data.R
+++ b/R/check_data.R
@@ -1,260 +1,380 @@
-#' Checks if a dataset confirms to a given set of rules
-#'
-#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
-#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
-#' @param rules a list of [`rule`]s
-#' @param xname optional, a name for the x variable (only used for errors)
-#' @param stop_on_fail when any of the rules fail, throw an error with stop
-#' @param stop_on_warn when a warning is found in the code execution, throw an error with stop
-#' @param stop_on_error when an error is found in the code execution, throw an error with stop
-#'
-#' @return a data.frame-like object with one row for each rule and its results
-#' @export
-#'
-#' @seealso [detect_backend()]
-#'
-#' @examples
-#' rs <- ruleset(
-#'   rule(mpg > 10),
-#'   rule(cyl %in% c(4, 6)), # missing 8
-#'   rule(qsec >= 14.5 & qsec <= 22.9)
-#' )
-#' rs
-#'
-#' check_data(mtcars, rs)
-check_data <- function(x, rules, xname = deparse(substitute(x)),
-                       stop_on_fail = FALSE, stop_on_warn = FALSE,
-                       stop_on_error = FALSE) {
-
-  # if rules is a yaml file, read it in
-  if (length(rules) == 1 && is.character(rules)) rules <- read_rules(rules)
-  # treat single rule if needed
-  if (hasName(rules, "expr")) rules <- ruleset(rules)
-
-  backend <- detect_backend(x)
-
-  # make sure the input dataset has the right class
-  if (class(x)[[1]] == "data.frame") {
-    if (backend == "data.table") {
-      x <- data.table::as.data.table(x)
-    } else if (backend == "dplyr") {
-      x <- dplyr::as_tibble(x)
-    }
-  }
-
-  res <- check_(x, rules, backend = backend)
-
-  # stops on fail, warning and/or error
-  fail <- stop_on_fail && any(res$fail != 0)
-  warn <- stop_on_warn && any(res$warn != "")
-  err <- stop_on_error && any(res$error != "")
-
-  if (fail || warn || err) {
-    tt <- paste(c(
-      if (fail) sprintf("%s rule fails", sum(res$fail != 0)),
-      if (warn) sprintf("%s warnings", sum(res$warn != "")),
-      if (err) sprintf("%s errors", sum(res$error != ""))
-    ), collapse = ", ")
-
-    stop(sprintf("In dataset '%s' found %s", xname, tt))
-  }
-
-  res
-}
-
-#' Detects the backend which will be used for checking the rules
-#'
-#' @description
-#' The detection will be made based on the class of the object as well as the packages installed.
-#' For example, if a `data.frame` is used, it will look if `data.table` or `dplyr` are installed on the system, as they provide more speed.
-#' Note the main functions will revert the
-#'
-#' @param x The data object, ie a data.frame, tibble, data.table, arrow, or DBI object
-#'
-#' @return a single character element with the name of the backend to use.
-#' One of `base-r`, `data.table`, `dplyr`, `collectibles` (for arrow or DBI objects)
-#' @export
-#'
-#' @seealso [check_data()]
-#' @examples
-#' data <- mtcars
-#' detect_backend(data)
-detect_backend <- function(x) {
-  cc <- class(x)
-  if ("data.table" %in% cc) {
-    if (!has_pkg("data.table"))
-      stop("The data.table package needs to be installed in order to test a data.table OR you can convert the data to a data.frame first!")
-    backend <- "data.table"
-
-  } else if (any(c("tibble", "tbl_df") %in% cc)) {
-    if (!has_pkg("dplyr"))
-      stop("The dplyr package needs to be installed in order to test a tibble OR you can convert the data to a data.frame first!")
-    backend <- "dplyr"
-
-  } else if (length(cc) == 1 && cc == "data.frame") {
-
-    if (has_pkg("data.table")) {
-      backend <- "data.table"
-    } else if (has_pkg("dplyr")) {
-      backend <- "dplyr"
-    } else {
-      backend <- "base-r"
-    }
-
-  } else if ("tbl_sql" %in% cc) {
-
-    if (!has_pkg("DBI") || !has_pkg("dbplyr"))
-      stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")
-
-    backend <- "collectibles"
-
-  } else if ("ArrowObject" %in% cc) {
-
-    if (!has_pkg("arrow") || !has_pkg("dbplyr"))
-      stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")
-
-    backend <- "collectibles"
-
-  } else {
-
-    stop(sprintf(paste("Unknown class of x found: '%s'.",
-                       "x must be a data.frame/tibble/data.table or a tbl (SQL table) or ArrowObject."),
-                 paste(cc, collapse = ",  ")))
-  }
-  backend
-}
-
-# small helper to check if a package is installed
-has_pkg <- function(p) requireNamespace(p, quietly = TRUE)
-
-
-# helper function that collects all warnings
-get_warnings <- function(code) {
-  out <- c()
-  suppressWarnings(withCallingHandlers(code, warning = function(c) out <<- c(out, conditionMessage(c))))
-  strip_dplyr_errors(paste(unique(out), collapse = ", "))
-}
-
-# helper function that checks the rules given a specific backend (~package)
-check_ <- function(x, rules, backend = c("base-r", "dplyr", "data.table", "collectibles")) {
-
-  backend <- match.arg(backend)
-
-  # function to create a data.frame
-  to_df <- switch(backend,
-                  "base-r" = data.frame,
-                  dplyr = dplyr::tibble,
-                  data.table = data.table::data.table,
-                  collectibles = dplyr::tibble)
-  # function to combine multiple data.frames into a single one by row
-  br <- switch(backend,
-               "base-r" = function(l) do.call(rbind, l),
-               dplyr = dplyr::bind_rows,
-               data.table = data.table::rbindlist,
-               collectibles = dplyr::bind_rows)
-
-  ll <- lapply(rules, function(r) {
-
-    e <- r$expr
-
-    # negate expression if needed
-    if ("negate" %in% names(r) && r$negate) {
-      e <- paste0("!(", e, ")")
-    }
-
-    # add the is.na guards when allow_na = TRUE
-    if ("allow_na" %in% names(r) && r$allow_na) {
-      v <- get_symbols(e)
-      e <- paste("(", e, ")", "|", paste("is.na(", v, ")",
-                                         sep = "", collapse = " | "))
-    }
-
-
-    # evaluate the expression and collect warnings and errors...
-    t0 <- Sys.time()
-    nr <- if ("tbl_sql" %in% class(x)) {
-      dplyr::pull(dplyr::collect(dplyr::summarise(x, n = dplyr::n())), n)
-    } else {
-      nrow(x)
-    }
-    warns <- ""
-    err <- ""
-    oc <- Sys.getenv("NO_COLOR") # to turn off colors in errors/warnings
-    Sys.setenv(NO_COLOR = "OFF")
-    pos <- tryCatch({
-      warns <- get_warnings({
-        pos <- filter_data_(x, backend, e)
-      })
-      pos
-    }, error = function(err) {
-      strip_dplyr_errors(conditionMessage(err))
-    })
-
-    Sys.setenv("NO_COLOR" = oc) # to turn on colors in errors/warnings
-
-    if (is.character(pos)) {
-      err <- pos
-      pos <- 0
-    }
-    tt <- difftime(Sys.time(), t0, units = "secs")
-    to_df(
-      name = r$name,
-      expr = r$expr,
-      allow_na = r$allow_na,
-      negate = r$negate,
-      tests = nr,
-      pass = pos,
-      fail = nr - pos,
-      warn = warns,
-      error = err,
-      time = tt
-    )
-  })
-
-  br(ll)
-}
-
-# internal helper function that filters a dataset x
-# when return_n = FALSE the data is returned, otherwise the number of rows
-filter_data_ <- function(x, backend, e, return_n = TRUE) {
-  if (backend == "base-r") {
-    # note that the nrow(with(x, x[eval(parse(text = e)), ]))
-    # includes NA rows and therefore returns the wrong number of rows
-    pos <- with(x, eval(parse(text = e)))
-    pos <- if (return_n) sum(pos, na.rm = TRUE) else x[pos, ]
-  } else if (backend == "dplyr" | backend == "collectibles") {
-    rr <- dplyr::filter(x, !!str2lang(e))
-    if (backend == "collectibles") {
-      if (return_n) {
-        pos <- dplyr::pull(
-          dplyr::collect(dplyr::summarise(rr, n = dplyr::n())),
-          n
-        )
-      } else {
-        pos <- dplyr::collect(rr)
-      }
-    } else { # dplyr
-      pos <- if (return_n) nrow(rr) else rr
-    }
-  } else if (backend == "data.table") {
-    pos <- x[eval(parse(text = e)), ]
-    if (return_n) pos <- nrow(pos)
-  }
-  pos
-}
-
-# strips a dplyr/cli error message of its formatting
-# x <- "\033[37;48;5;19m\033[38;5;232mProblem while computing ... .\033[39m\n\033[1mCaused by error in xxx:\033[22m\n\033[33m!\033[39m object 'does_not_exist' not found\033[39;49m"
-# x <- "Problem while computing `..1 = eval(parse(text = e))`.\nCaused by error in `does_not_exist %in% c(\"a\", \"b\", \"c\")`:\n! object 'does_not_exist' not found"
-# x <- "\033[38;5;232mThere were 2 warnings in `dplyr::filter()`.\nThe first warning was:\033[39m\n\033[38;5;232m\033[36mℹ\033[38;5;232m In argument: `as.numeric(hp) > 0 & as.numeric(hp) < 400`.\033[39m\nCaused by warning:\n\033[33m!\033[39m NAs introduced by coercion\n\033[38;5;232m\033[36mℹ\033[38;5;232m Run \033]8;;ide:run:dplyr::last_dplyr_warnings()\adplyr::last_dplyr_warnings()\033]8;;\a to see the 1 remaining warning.\033[39m"
-strip_dplyr_errors <- function(x) {
-  if (substr(x, 1, 1) == "\033") {
-    r <- gsub("\033.*\033\\[33m\\!\033\\[39m ", "", x)
-    r <- gsub("\033.*$", "", r)
-  } else {
-    # testthat automatically turns off the colorful errors...
-    r <- gsub(".*\\n\\! ", "", x)
-    r <- gsub("i Run.*$", "", r) # remove multiline warnings
-  }
-  gsub("\\n$", "", r)
-}
+#' Checks if a dataset confirms to a given set of rules
+#'
+#' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
+#' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection).
+#' Can also be a named list of datasets when using reference rules.
+#' @param rules a list of [`rule`]s
+#' @param xname optional, a name for the x variable (only used for errors)
+#' @param stop_on_fail when any of the rules fail, throw an error with stop
+#' @param stop_on_warn when a warning is found in the code execution, throw an error with stop
+#' @param stop_on_error when an error is found in the code execution, throw an error with stop
+#' @param stop_on_schema_fail when any schema checks fail, throw an error with stop
+#' @param extra_columns how to treat columns in `x` that are not declared in
+#' optional `data_columns` attached to a ruleset. One of `"ignore"` (default),
+#' `"warn"`, or `"fail"`.
+#'
+#' @return a data.frame-like object with one row for each rule and its results
+#' @export
+#'
+#' @seealso [detect_backend()]
+#'
+#' @examples
+#' rs <- ruleset(
+#'   rule(mpg > 10),
+#'   rule(cyl %in% c(4, 6)), # missing 8
+#'   rule(qsec >= 14.5 & qsec <= 22.9)
+#' )
+#' rs
+#'
+#' check_data(mtcars, rs)
+#'
+#' # schema + relation checks in one output
+#' orders <- data.frame(order_id = 1:3, customer_id = c(10, 99, NA), amount = c(10, -5, 20))
+#' customers <- data.frame(customer_id = c(10, 11))
+#'
+#' rs2 <- ruleset(
+#'   rule(amount >= 0, name = "amount non-negative"),
+#'   reference_rule(
+#'     local_col = "customer_id",
+#'     ref_dataset = "customers",
+#'     ref_col = "customer_id",
+#'     allow_na = TRUE
+#'   ),
+#'   data_columns = list(
+#'     data_column("order_id", type = "int", optional = FALSE),
+#'     data_column("customer_id", type = "double", optional = FALSE),
+#'     data_column("amount", type = "double", optional = FALSE)
+#'   ),
+#'   data_name = "orders"
+#' )
+#'
+#' check_data(list(orders = orders, customers = customers), rs2)
+check_data <- function(x, rules, xname = deparse(substitute(x)),
+                       stop_on_fail = FALSE, stop_on_warn = FALSE,
+                       stop_on_error = FALSE, stop_on_schema_fail = FALSE,
+                       extra_columns = c("ignore", "warn", "fail")) {
+
+  extra_columns <- match.arg(extra_columns)
+  # if rules is a yaml file, read it in
+  if (length(rules) == 1 && is.character(rules)) rules <- read_rules(rules)
+  # treat single rule if needed
+  if (hasName(rules, "expr")) rules <- ruleset(rules)
+
+  datasets <- NULL
+  if (is.list(x) && !inherits(x, "data.frame")) {
+    datasets <- x
+    data_name <- attr(rules, "data_name", exact = TRUE)
+    if (is.null(data_name)) {
+      data_name <- names(datasets)[[1]]
+    }
+    if (is.null(data_name) || is.na(data_name) || !nzchar(data_name)) {
+      stop(paste(
+        "When `x` is a list, datasets must be named or",
+        "`ruleset(..., data_name=...)` must be set."
+      ))
+    }
+    if (!data_name %in% names(datasets)) {
+      stop(sprintf("The primary dataset '%s' was not found in `x`.", data_name))
+    }
+    x <- datasets[[data_name]]
+  }
+
+  schema_res <- validate_rules_against_schema(
+    x, rules, extra_columns = extra_columns
+  )
+
+  backend <- detect_backend(x)
+
+  # make sure the input dataset has the right class
+  if (class(x)[[1]] == "data.frame") {
+    if (backend == "data.table") {
+      x <- data.table::as.data.table(x)
+    } else if (backend == "dplyr") {
+      x <- dplyr::as_tibble(x)
+    }
+  }
+
+  expr_rules <- Filter(function(r) !inherits(r, "reference_rule"), rules)
+  ref_rules <- Filter(function(r) inherits(r, "reference_rule"), rules)
+
+  if (length(expr_rules)) {
+    res <- check_(x, expr_rules, backend = backend)
+  } else {
+    res <- data.frame(
+      check_type = character(),
+      name = character(),
+      expr = character(),
+      allow_na = logical(),
+      negate = logical(),
+      tests = integer(),
+      pass = integer(),
+      fail = integer(),
+      warn = character(),
+      error = character(),
+      time = as.difftime(numeric(), units = "secs"),
+      stringsAsFactors = FALSE
+    )
+  }
+
+  if (length(ref_rules)) {
+    if (is.null(datasets)) {
+      stop("Reference rules require `x` to be a named list of datasets.")
+    }
+    ref_res <- do.call(rbind, lapply(ref_rules, function(r) {
+      check_reference_rule(x, r, datasets)
+    }))
+
+    res <- switch(
+      backend,
+      "base-r" = rbind(res, ref_res),
+      dplyr = dplyr::bind_rows(res, ref_res),
+      data.table = data.table::rbindlist(list(res, ref_res), fill = TRUE),
+      collectibles = dplyr::bind_rows(res, ref_res)
+    )
+  }
+
+  if (nrow(schema_res)) {
+    res <- switch(
+      backend,
+      "base-r" = rbind(schema_res, res),
+      dplyr = dplyr::bind_rows(schema_res, res),
+      data.table = data.table::rbindlist(list(schema_res, res), fill = TRUE),
+      collectibles = dplyr::bind_rows(schema_res, res)
+    )
+  }
+
+  # stops on fail, warning and/or error
+  is_schema <- if ("check_type" %in% names(res)) {
+    res$check_type == "schema"
+  } else {
+    rep(FALSE, nrow(res))
+  }
+  is_rule <- !is_schema
+
+  fail <- stop_on_fail && any(res$fail[is_rule] != 0)
+  schema_fail <- stop_on_schema_fail && any(res$fail[is_schema] != 0)
+  warn <- stop_on_warn && any(res$warn != "")
+  err <- stop_on_error && any(res$error != "")
+
+  if (fail || schema_fail || warn || err) {
+    tt <- paste(c(
+      if (fail) sprintf("%s rule fails", sum(res$fail[is_rule] != 0)),
+      if (schema_fail) sprintf("%s schema fails", sum(res$fail[is_schema] != 0)),
+      if (warn) sprintf("%s warnings", sum(res$warn != "")),
+      if (err) sprintf("%s errors", sum(res$error != ""))
+    ), collapse = ", ")
+
+    stop(sprintf("In dataset '%s' found %s", xname, tt))
+  }
+
+  res
+}
+
+#' Detects the backend which will be used for checking the rules
+#'
+#' @description
+#' The detection will be made based on the class of the object as well as the packages installed.
+#' For example, if a `data.frame` is used, it will look if `data.table` or `dplyr` are installed
+#' on the system, as they provide more speed.
+#' Note the main functions will revert the
+#'
+#' @param x The data object, ie a data.frame, tibble, data.table, arrow, or DBI object
+#'
+#' @return a single character element with the name of the backend to use.
+#' One of `base-r`, `data.table`, `dplyr`, `collectibles` (for arrow or DBI objects)
+#' @export
+#'
+#' @seealso [check_data()]
+#' @examples
+#' data <- mtcars
+#' detect_backend(data)
+detect_backend <- function(x) {
+  cc <- class(x)
+  if ("data.table" %in% cc) {
+    if (!has_pkg("data.table"))
+      stop(paste(
+        "The data.table package needs to be installed in order to test a data.table",
+        "OR you can convert the data to a data.frame first!"
+      ))
+    backend <- "data.table"
+
+  } else if (any(c("tibble", "tbl_df") %in% cc)) {
+    if (!has_pkg("dplyr"))
+      stop(paste(
+        "The dplyr package needs to be installed in order to test a tibble OR",
+        "you can convert the data to a data.frame first!"
+      ))
+    backend <- "dplyr"
+
+  } else if (length(cc) == 1 && cc == "data.frame") {
+
+    if (has_pkg("data.table")) {
+      backend <- "data.table"
+    } else if (has_pkg("dplyr")) {
+      backend <- "dplyr"
+    } else {
+      backend <- "base-r"
+    }
+
+  } else if ("tbl_sql" %in% cc) {
+
+    if (!has_pkg("DBI") || !has_pkg("dbplyr"))
+      stop("The DBI and dbplyr packages need to be installed in order to test a tbl_sql.")
+
+    backend <- "collectibles"
+
+  } else if ("ArrowObject" %in% cc) {
+
+    if (!has_pkg("arrow") || !has_pkg("dbplyr"))
+      stop("The arrow and dbplyr packages need to be installed in order to test an ArrowObject.")
+
+    backend <- "collectibles"
+
+  } else {
+
+    stop(sprintf(paste(
+      "Unknown class of x found: '%s'.",
+      "x must be a data.frame/tibble/data.table or a tbl (SQL table) or ArrowObject."
+    ), paste(cc, collapse = ",  ")))
+  }
+  backend
+}
+
+# small helper to check if a package is installed
+has_pkg <- function(p) requireNamespace(p, quietly = TRUE)
+
+
+# helper function that collects all warnings
+get_warnings <- function(code) {
+  out <- c()
+  suppressWarnings(withCallingHandlers(code,
+                                       warning = function(c) out <<- c(out, conditionMessage(c))))
+  strip_dplyr_errors(paste(unique(out), collapse = ", "))
+}
+
+# helper function that checks the rules given a specific backend (~package)
+check_ <- function(x, rules, backend = c("base-r", "dplyr", "data.table", "collectibles")) {
+
+  backend <- match.arg(backend)
+
+  # function to create a data.frame
+  to_df <- switch(backend,
+                  "base-r" = data.frame,
+                  dplyr = dplyr::tibble,
+                  data.table = data.table::data.table,
+                  collectibles = dplyr::tibble)
+  # function to combine multiple data.frames into a single one by row
+  br <- switch(backend,
+               "base-r" = function(l) do.call(rbind, l),
+               dplyr = dplyr::bind_rows,
+               data.table = data.table::rbindlist,
+               collectibles = dplyr::bind_rows)
+
+  ll <- lapply(rules, function(r) {
+
+    e <- r$expr
+
+    # negate expression if needed
+    if ("negate" %in% names(r) && r$negate) {
+      e <- paste0("!(", e, ")")
+    }
+
+    # add the is.na guards when allow_na = TRUE
+    if ("allow_na" %in% names(r) && r$allow_na) {
+      v <- get_symbols(e)
+      e <- paste("(", e, ")", "|", paste("is.na(", v, ")",
+                                         sep = "", collapse = " | "))
+    }
+
+
+    # evaluate the expression and collect warnings and errors...
+    t0 <- Sys.time()
+    nr <- if ("tbl_sql" %in% class(x)) {
+      dplyr::pull(dplyr::collect(dplyr::summarise(x, n = dplyr::n())), n)
+    } else {
+      nrow(x)
+    }
+    warns <- ""
+    err <- ""
+    oc <- Sys.getenv("NO_COLOR") # to turn off colors in errors/warnings
+    Sys.setenv(NO_COLOR = "OFF")
+    pos <- tryCatch({
+      warns <- get_warnings({
+        pos <- filter_data_(x, backend, e)
+      })
+      pos
+    }, error = function(err) {
+      strip_dplyr_errors(conditionMessage(err))
+    })
+
+    Sys.setenv("NO_COLOR" = oc) # to turn on colors in errors/warnings
+
+    if (is.character(pos)) {
+      err <- pos
+      pos <- 0
+    }
+    tt <- difftime(Sys.time(), t0, units = "secs")
+    to_df(
+      check_type = "row_rule",
+      name = r$name,
+      expr = r$expr,
+      allow_na = r$allow_na,
+      negate = r$negate,
+      tests = nr,
+      pass = pos,
+      fail = nr - pos,
+      warn = warns,
+      error = err,
+      time = tt
+    )
+  })
+
+  br(ll)
+}
+
+# internal helper function that filters a dataset x
+# when return_n = FALSE the data is returned, otherwise the number of rows
+filter_data_ <- function(x, backend, e, return_n = TRUE) {
+  if (backend == "base-r") {
+    # note that the nrow(with(x, x[eval(parse(text = e)), ]))
+    # includes NA rows and therefore returns the wrong number of rows
+    pos <- with(x, eval(parse(text = e)))
+    pos <- if (return_n) sum(pos, na.rm = TRUE) else x[pos, ]
+  } else if (backend == "dplyr" || backend == "collectibles") {
+    rr <- dplyr::filter(x, !!str2lang(e))
+    if (backend == "collectibles") {
+      if (return_n) {
+        pos <- dplyr::pull(
+          dplyr::collect(dplyr::summarise(rr, n = dplyr::n())),
+          n
+        )
+      } else {
+        pos <- dplyr::collect(rr)
+      }
+    } else { # dplyr
+      pos <- if (return_n) nrow(rr) else rr
+    }
+  } else if (backend == "data.table") {
+    pos <- x[eval(parse(text = e)), ]
+    if (return_n) pos <- nrow(pos)
+  }
+  pos
+}
+
+# strips a dplyr/cli error message of its formatting
+# nolint start
+# x <- "\033[37;48;5;19m\033[38;5;232mProblem while computing ... .\033[39m\n\033[1mCaused by error in xxx:\033[22m\n\033[33m!\033[39m object 'does_not_exist' not found\033[39;49m"
+# x <- "Problem while computing `..1 = eval(parse(text = e))`.\nCaused by error in `does_not_exist %in% c(\"a\", \"b\", \"c\")`:\n! object 'does_not_exist' not found"
+# x <- "\033[38;5;232mThere were 2 warnings in `dplyr::filter()`.\nThe first warning was:\033[39m\n\033[38;5;232m\033[36mℹ\033[38;5;232m In argument: `as.numeric(hp) > 0 & as.numeric(hp) < 400`.\033[39m\nCaused by warning:\n\033[33m!\033[39m NAs introduced by coercion\n\033[38;5;232m\033[36mℹ\033[38;5;232m Run \033]8;;ide:run:dplyr::last_dplyr_warnings()\adplyr::last_dplyr_warnings()\033]8;;\a to see the 1 remaining warning.\033[39m"
+# nolint end
+strip_dplyr_errors <- function(x) {
+  if (substr(x, 1, 1) == "\033") {
+    r <- gsub("\033.*\033\\[33m\\!\033\\[39m ", "", x)
+    r <- gsub("\033.*$", "", r)
+  } else {
+    # testthat automatically turns off the colorful errors...
+    r <- gsub(".*\\n\\! ", "", x)
+    r <- gsub("i Run.*$", "", r) # remove multiline warnings
+  }
+  gsub("\\n$", "", r)
+}
diff --git a/R/data_column.R b/R/data_column.R
new file mode 100644
index 0000000..50c9a54
--- /dev/null
+++ b/R/data_column.R
@@ -0,0 +1,379 @@
+# Internal constructors and validators for structured rule specs (v1).
+
+#' Define a Column Specification for Schema Checks
+#'
+#' @description
+#' Creates a single column declaration used in `ruleset(..., data_columns = ...)`.
+#' Column declarations are schema checks (column existence, optionality, and
+#' declared type), whereas [`rule()`] is for row-wise value checks.
+#'
+#' @param col column name.
+#' @param type optional declared type (for example `"int"`, `"double"`,
+#' `"str"`, `"logical"`). Use `NA_character_` for no type declaration.
+#' @param optional logical; if `FALSE`, the column is required.
+#' @param description optional free-text description.
+#'
+#' @return A `data_column` object (list) that can be passed in
+#' `ruleset(..., data_columns = list(...))`.
+#' @export
+#'
+#' @examples
+#' rs <- ruleset(
+#'   rule(price >= 0),
+#'   data_columns = list(
+#'     data_column("price", type = "double", optional = FALSE),
+#'     data_column("note", type = "str", optional = TRUE)
+#'   )
+#' )
+#' rs
+#'
+#' # combined with row rules and strict schema stopping
+#' order_rules <- ruleset(
+#'   rule(price >= 0, allow_na = FALSE),
+#'   data_columns = list(
+#'     data_column("order_id", type = "int", optional = FALSE),
+#'     data_column("price", type = "double", optional = FALSE),
+#'     data_column("note", type = "str", optional = TRUE)
+#'   )
+#' )
+#'
+#' check_data(
+#'   data.frame(order_id = 1:3, price = c(10, 20, 30), note = c("ok", NA, "ok")),
+#'   order_rules,
+#'   stop_on_schema_fail = TRUE
+#' )
+data_column <- function(col, type = NA_character_, optional = FALSE,
+                        description = NA_character_) {
+  if (!is.character(col) || length(col) != 1 || is.na(col) || !nzchar(col)) {
+    stop("`col` must be a single non-empty character value.")
+  }
+  if (!is.character(type) || length(type) != 1) {
+    stop("`type` must be a single character value (or NA).")
+  }
+  if (!is.logical(optional) || length(optional) != 1 || is.na(optional)) {
+    stop("`optional` must be a single non-missing logical value.")
+  }
+  if (!is.character(description) || length(description) != 1) {
+    stop("`description` must be a single character value (or NA).")
+  }
+
+  structure(
+    list(
+      col = col,
+      type = type,
+      optional = optional,
+      description = description
+    ),
+    class = "data_column"
+  )
+}
+
+
+rule_meta <- function(title = NA_character_, version = NA_character_,
+                      description = NA_character_, ...) {
+  if (!is.character(title) || length(title) != 1) {
+    stop("`title` must be a single character value (or NA).")
+  }
+  if (!is.character(version) || length(version) != 1) {
+    stop("`version` must be a single character value (or NA).")
+  }
+  if (!is.character(description) || length(description) != 1) {
+    stop("`description` must be a single character value (or NA).")
+  }
+
+  structure(
+    c(
+      list(
+        title = title,
+        version = version,
+        description = description
+      ),
+      list(...)
+    ),
+    class = "rule_meta"
+  )
+}
+
+
+#' Define a Relational Reference Rule
+#'
+#' @description
+#' Creates a rule that checks whether values in a local column exist in a
+#' column of a referenced dataset. Use with `check_data()` by supplying `x` as
+#' a named list of datasets and setting `data_name` in `ruleset()` (or by
+#' ordering the list so the first entry is the primary dataset).
+#'
+#' @param local_col column name in the primary dataset.
+#' @param ref_dataset name of the referenced dataset in the `x` list.
+#' @param ref_col column name in the referenced dataset.
+#' @param name optional display name for the rule.
+#' @param allow_na logical; if `TRUE`, missing values in `local_col` are treated
+#' as passing.
+#' @param negate logical; if `TRUE`, inverts the rule (values must *not* be in
+#' the referenced column).
+#' @param ... additional fields attached to the rule object.
+#'
+#' @return A `reference_rule` object that can be included in `ruleset()`.
+#' @export
+#'
+#' @examples
+#' flights <- data.frame(carrier = c("AA", "BB", NA_character_))
+#' carriers <- data.frame(carrier_id = c("AA"))
+#'
+#' rs <- ruleset(
+#'   reference_rule(
+#'     local_col = "carrier",
+#'     ref_dataset = "carriers",
+#'     ref_col = "carrier_id",
+#'     allow_na = TRUE
+#'   ),
+#'   data_name = "flights"
+#' )
+#'
+#' check_data(list(flights = flights, carriers = carriers), rs)
+#'
+#' # negated relation: value must NOT exist in blacklist
+#' blacklist <- data.frame(carrier_id = c("XX", "YY"))
+#' rs_neg <- ruleset(
+#'   reference_rule(
+#'     local_col = "carrier",
+#'     ref_dataset = "blacklist",
+#'     ref_col = "carrier_id",
+#'     negate = TRUE,
+#'     allow_na = TRUE
+#'   ),
+#'   data_name = "flights"
+#' )
+#'
+#' check_data(list(flights = flights, blacklist = blacklist), rs_neg)
+reference_rule <- function(local_col, ref_dataset, ref_col, name = NA,
+                           allow_na = FALSE, negate = FALSE, ...) {
+  if (!is.character(local_col) || length(local_col) != 1 || is.na(local_col)) {
+    stop("`local_col` must be a single character value.")
+  }
+  if (!is.character(ref_dataset) || length(ref_dataset) != 1 || is.na(ref_dataset)) {
+    stop("`ref_dataset` must be a single character value.")
+  }
+  if (!is.character(ref_col) || length(ref_col) != 1 || is.na(ref_col)) {
+    stop("`ref_col` must be a single character value.")
+  }
+  if (!is.logical(allow_na) || length(allow_na) != 1 || is.na(allow_na)) {
+    stop("`allow_na` must be a single non-missing logical value.")
+  }
+  if (!is.logical(negate) || length(negate) != 1 || is.na(negate)) {
+    stop("`negate` must be a single non-missing logical value.")
+  }
+  if (is.na(name)) {
+    name <- sprintf(
+      "Reference rule: %s in %s$%s",
+      local_col, ref_dataset, ref_col
+    )
+  }
+
+  out <- list(
+    name = name,
+    expr = sprintf("%s %%in%% %s$%s", local_col, ref_dataset, ref_col),
+    allow_na = allow_na,
+    negate = negate,
+    local_col = local_col,
+    ref_dataset = ref_dataset,
+    ref_col = ref_col,
+    ...
+  )
+  class(out) <- c("reference_rule", "rule")
+  out
+}
+
+
+validate_data_columns <- function(data_columns) {
+  if (is.null(data_columns)) {
+    return(invisible(NULL))
+  }
+  if (!is.list(data_columns)) {
+    stop("`data_columns` must be a list of `data_column()` specs.")
+  }
+
+  cols <- vapply(data_columns, function(dc) dc$col, character(1))
+  if (anyDuplicated(cols)) {
+    stop("`data_columns` contains duplicate column names.")
+  }
+  invisible(NULL)
+}
+
+
+validate_rules_against_schema <- function(x, rules, extra_columns = "ignore") {
+  extra_columns <- match.arg(extra_columns, c("ignore", "warn", "fail"))
+  data_columns <- attr(rules, "data_columns", exact = TRUE)
+  if (is.null(data_columns) || length(data_columns) == 0) {
+    return(empty_schema_results())
+  }
+
+  declared <- vapply(data_columns, function(dc) dc$col, character(1))
+  present <- names(x)
+  schema_rows <- list()
+
+  extra <- setdiff(present, declared)
+  if (length(extra)) {
+    msg <- sprintf(
+      "Found extra columns not declared in `data_columns`: %s",
+      paste(extra, collapse = ", ")
+    )
+    if (extra_columns == "warn") {
+      warning(msg)
+    } else if (extra_columns == "fail") {
+      stop(msg)
+    }
+  }
+
+  expr_rules <- Filter(function(r) !inherits(r, "reference_rule"), rules)
+  used_symbols <- unique(unlist(lapply(expr_rules, function(r) get_symbols(r$expr))))
+  unknown <- setdiff(used_symbols, declared)
+  if (length(unknown)) {
+    stop(sprintf(
+      "Unknown symbols found in rules not present in `data_columns`: %s",
+      paste(unknown, collapse = ", ")
+    ))
+  }
+
+  for (dc in data_columns) {
+    is_present <- dc$col %in% present
+    exists_ok <- is_present || isTRUE(dc$optional)
+    schema_rows <- c(
+      schema_rows,
+      list(schema_result_row(
+        name = sprintf("Schema: column '%s' exists", dc$col),
+        expr = sprintf("column_exists('%s')", dc$col),
+        pass = exists_ok,
+        error = if (!exists_ok) sprintf("Required column '%s' is missing.", dc$col) else ""
+      ))
+    )
+
+    if (is_present && !is.na(dc$type) && nzchar(dc$type)) {
+      vv <- x[[dc$col]]
+      type_ok <- column_matches_type(vv, dc$type)
+      schema_rows <- c(
+        schema_rows,
+        list(schema_result_row(
+          name = sprintf("Schema: column '%s' has type '%s'", dc$col, dc$type),
+          expr = sprintf("column_type('%s') == '%s'", dc$col, dc$type),
+          pass = type_ok,
+          error = if (!type_ok) {
+            sprintf("Column '%s' does not match declared type '%s'.", dc$col, dc$type)
+          } else {
+            ""
+          }
+        ))
+      )
+    }
+  }
+
+  if (!length(schema_rows)) {
+    return(empty_schema_results())
+  }
+
+  do.call(rbind, schema_rows)
+}
+
+
+schema_result_row <- function(name, expr, pass, error = "") {
+  data.frame(
+    check_type = "schema",
+    name = name,
+    expr = expr,
+    allow_na = FALSE,
+    negate = FALSE,
+    tests = 1L,
+    pass = as.integer(isTRUE(pass)),
+    fail = as.integer(!isTRUE(pass)),
+    warn = "",
+    error = error,
+    time = as.difftime(0, units = "secs"),
+    stringsAsFactors = FALSE
+  )
+}
+
+
+empty_schema_results <- function() {
+  data.frame(
+    check_type = character(),
+    name = character(),
+    expr = character(),
+    allow_na = logical(),
+    negate = logical(),
+    tests = integer(),
+    pass = integer(),
+    fail = integer(),
+    warn = character(),
+    error = character(),
+    time = as.difftime(numeric(), units = "secs"),
+    stringsAsFactors = FALSE
+  )
+}
+
+
+column_matches_type <- function(v, type) {
+  type <- tolower(type)
+  if (type %in% c("int", "integer")) {
+    return(is.integer(v))
+  }
+  if (type %in% c("dbl", "double", "numeric", "float")) {
+    return(is.numeric(v))
+  }
+  if (type %in% c("str", "string", "chr", "character")) {
+    return(is.character(v))
+  }
+  if (type %in% c("bool", "boolean", "logical")) {
+    return(is.logical(v))
+  }
+  TRUE
+}
+
+
+check_reference_rule <- function(data, r, datasets) {
+  if (!r$local_col %in% names(data)) {
+    stop(sprintf("Column '%s' used in reference rule was not found.", r$local_col))
+  }
+  if (!r$ref_dataset %in% names(datasets)) {
+    stop(sprintf(
+      "In reference rule '%s' referenced dataset '%s' was not supplied.",
+      r$name, r$ref_dataset
+    ))
+  }
+
+  ref_data <- datasets[[r$ref_dataset]]
+  if (!r$ref_col %in% names(ref_data)) {
+    stop(sprintf(
+      "Column '%s' in referenced dataset '%s' was not found.",
+      r$ref_col, r$ref_dataset
+    ))
+  }
+
+  t0 <- Sys.time()
+  local_values <- data[[r$local_col]]
+  ref_values <- ref_data[[r$ref_col]]
+  is_ok <- local_values %in% ref_values
+  if (isTRUE(r$allow_na)) {
+    is_ok <- is_ok | is.na(local_values)
+  }
+  if (isTRUE(r$negate)) {
+    is_ok <- !is_ok
+  }
+
+  nr <- length(local_values)
+  pass <- sum(is_ok, na.rm = TRUE)
+
+  data.frame(
+    check_type = "reference_rule",
+    name = r$name,
+    expr = r$expr,
+    allow_na = r$allow_na,
+    negate = r$negate,
+    tests = nr,
+    pass = pass,
+    fail = nr - pass,
+    warn = "",
+    error = "",
+    time = difftime(Sys.time(), t0, units = "secs"),
+    stringsAsFactors = FALSE
+  )
+}
diff --git a/R/describe.R b/R/describe.R
index 44ff9e9..c2b27ea 100644
--- a/R/describe.R
+++ b/R/describe.R
@@ -6,18 +6,38 @@
 #'
 #' @param x a dataset, either a [`data.frame`], [`dplyr::tibble`], [`data.table::data.table`],
 #' [`arrow::arrow_table`], [`arrow::open_dataset`], or [`dplyr::tbl`] (SQL connection)
+#' @param skip_ones logical, whether values that occur exactly once should be omitted
+#' from `most_frequent`
+#' @param digits integer, number of digits to round numeric values in `most_frequent`
+#' @param top_n integer, number of most frequent values to include in `most_frequent`;
+#' set to `0` to skip the `most_frequent` computation
+#' @param fast logical, when `TRUE` skip expensive fields (`n_distinct`, `median`)
+#' by returning `NA` for them
 #'
 #' @return a `data.frame`, `dplyr::tibble`, or `data.table::data.table` containing
 #' a summary of the dataset given
+#' @details
+#' Numeric values in `most_frequent` are rounded to `digits` (default: 4).
+#' If a variable has at most 1 distinct value, `most_frequent` is left empty.
+#' By default, values with count 1 are omitted from `most_frequent`.
 #' @export
 #'
-#' @seealso Similar to [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
-#'   [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
-#'   and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
+#' @seealso Similar to
+#' [skimr::skim()](https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html),
+#' [summarytools::dfSummary()](https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary),
+#' and [gtExtras::gt_plt_summary()](https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html)
 #'
 #' @examples
 #' describe(mtcars)
-describe <- function(x) {
+describe <- function(x, skip_ones = TRUE, digits = 4, top_n = 3, fast = FALSE) {
+
+  if (!is.numeric(top_n) || length(top_n) != 1 || is.na(top_n) || top_n < 0) {
+    stop("`top_n` must be a single number >= 0.")
+  }
+  top_n <- as.integer(top_n)
+  if (!is.logical(fast) || length(fast) != 1 || is.na(fast)) {
+    stop("`fast` must be a single non-missing logical value.")
+  }
 
   backend <- detect_backend(x)
 
@@ -31,43 +51,57 @@ describe <- function(x) {
   }
 
   if (backend == "base-r") {
-    describe_base_r(x)
+    describe_base_r(x, max_n = top_n, skip_ones = skip_ones, digits = digits, fast = fast)
   } else if (backend == "dplyr") {
-    describe_dplyr(x)
+    describe_dplyr(x, max_n = top_n, skip_ones = skip_ones, digits = digits, fast = fast)
   } else if (backend == "data.table") {
-    describe_data.table(x)
+    describe_data.table(x, max_n = top_n, skip_ones = skip_ones, digits = digits, fast = fast)
   } else if (backend == "collectibles") {
-    if ("tbl_sql" %in% class(x)) {
-      describe_sql(x)
-    } else if ("ArrowObject" %in% class(x)) {
-      describe_arrow(x)
-    }
+    describe_collectibles(x, max_n = top_n, skip_ones = skip_ones, digits = digits, fast = fast)
   } else {
     stop(sprintf("Could not detect backend to describe %s", paste(class(x), collapse = ", ")))
   }
 }
 
-
 # internal function to see which values should use the min/max etc part
 is_numeric <- function(v) {
   any(class(v) %in% c("integer", "numeric", "POSIXt"))
 }
 
-# x <- mtcars
-describe_base_r <- function(x, max_n = 3) {
+is_roundable_numeric <- function(v) {
+  any(class(v) %in% c("integer", "numeric"))
+}
+
+format_most_frequent <- function(values, counts, skip_ones = TRUE, digits = 4) {
+  if (length(unique(values)) <= 1) return("")
+  if (skip_ones) {
+    keep <- counts > 1
+    values <- values[keep]
+    counts <- counts[keep]
+  }
+  if (length(values) == 0) return("")
+  if (is_roundable_numeric(values)) values <- round(values, digits = digits)
+  paste(sprintf("%s (%s)", values, counts), collapse = ", ")
+}
+
+top_counts <- function(v, max_n = 3) {
+  uv <- unique(v)
+  if (max_n <= 0) return(list(values = uv[0], counts = integer(0)))
+  tab <- tabulate(match(v, uv))
+  od <- order(tab, decreasing = TRUE)[seq(min(max_n, length(tab)))]
+  list(values = uv[od], counts = tab[od])
+}
+
+# eg x <- mtcars
+describe_base_r <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) {
   ll <- lapply(
-    seq(ncol(x)),
+    seq_len(ncol(x)),
     function(i) {
       v <- x[[i]]
       type <- class(v)[[1]]
       is_num <- is_numeric(v)
 
-      tbl <- table(v)
-      uv <- unique(v)
-      tab <- tabulate(match(v, uv))
-      tab_max <- which(tab == max(tab))
-      # get the indices of the three highest counts
-      od <- order(tab, decreasing = TRUE)[seq(min(max_n, length(tab)))]
+      tc <- top_counts(v, max_n = max_n)
 
       nz <- if (!is_num) nchar(as.character(v))
 
@@ -75,16 +109,23 @@ describe_base_r <- function(x, max_n = 3) {
         var = names(x)[[i]],
         type = type,
         n = length(v),
-        n_distinct = length(unique(v)),
+        n_distinct = if (fast) NA_integer_ else length(unique(v)),
         n_na = sum(is.na(v)),
-        most_frequent = paste(sprintf("%s (%s)", uv[od], tab[od]),
-                              collapse = ", "),
+        most_frequent = if (fast) {
+          NA_character_
+        } else {
+          format_most_frequent(tc$values, tc$counts, skip_ones = skip_ones, digits = digits)
+        },
 
         min = as.numeric(min(if (is_num) v else nz, na.rm = TRUE)),
         mean = as.numeric(mean(if (is_num) v else nz, na.rm = TRUE)),
-        median = as.numeric(median(if (is_num) v else nz, na.rm = TRUE)),
+        median = if (fast) {
+          NA_real_
+        } else {
+          as.numeric(stats::median(if (is_num) v else nz, na.rm = TRUE))
+        },
         max = as.numeric(max(if (is_num) v else nz, na.rm = TRUE)),
-        sd = as.numeric(sd(if (is_num) v else nz, na.rm = TRUE))
+        sd = as.numeric(stats::sd(if (is_num) v else nz, na.rm = TRUE))
       )
     }
   )
@@ -92,50 +133,75 @@ describe_base_r <- function(x, max_n = 3) {
   do.call(rbind, ll)
 }
 
-# x <- mtcars |> tibble::as_tibble()
-describe_dplyr <- function(x, max_n = 3) {
+# eg x <- mtcars |> tibble::as_tibble()
+describe_dplyr <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) {
   ll <- lapply(
     names(x),
     function(v) {
-      mc <- x |>
-        dplyr::count(.data[[v]]) |>
-        dplyr::slice_max(n, n = max_n, with_ties = FALSE)
+      vv <- x[[v]]
+      tc <- top_counts(vv, max_n = max_n)
 
-      type <- class(mc[[1]])[[1]]
-      is_num <- is_numeric(mc[[1]])
-      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+      type <- class(vv)[[1]]
+      is_num <- is_numeric(vv)
+      mf <- if (fast) {
+        NA_character_
+      } else {
+        format_most_frequent(tc$values, tc$counts, skip_ones = skip_ones, digits = digits)
+      }
 
       nz <- if (!is_num) nchar(as.character(x[[v]]))
-      x |>
-        dplyr::summarise(
-          var = v,
-          type = type,
-          n = dplyr::n(),
-          n_distinct = dplyr::n_distinct(.data[[v]]),
-          n_na = sum(is.na(.data[[v]])),
-          most_frequent = mf,
-          min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
-          mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
-          median = as.numeric(median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
-          max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
-          sd = as.numeric(sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
-        )
+      if (fast) {
+        x |>
+          dplyr::summarise(
+            var = v,
+            type = type,
+            n = dplyr::n(),
+            n_distinct = as.integer(NA),
+            n_na = sum(is.na(.data[[v]])),
+            most_frequent = mf,
+            min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            median = as.numeric(NA),
+            max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            sd = as.numeric(stats::sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
+          )
+      } else {
+        x |>
+          dplyr::summarise(
+            var = v,
+            type = type,
+            n = dplyr::n(),
+            n_distinct = dplyr::n_distinct(.data[[v]]),
+            n_na = sum(is.na(.data[[v]])),
+            most_frequent = mf,
+            min = as.numeric(min(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            mean = as.numeric(mean(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            median = as.numeric(stats::median(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            max = as.numeric(max(if (is_num) .data[[v]] else nz, na.rm = TRUE)),
+            sd = as.numeric(stats::sd(if (is_num) .data[[v]] else nz, na.rm = TRUE))
+          )
+      }
     }
   )
 
   dplyr::bind_rows(ll)
 }
 
-# x <- mtcars |> data.table::as.data.table()
-describe_data.table <- function(x, max_n = 3) {
+# eg x <- mtcars |> data.table::as.data.table()
+describe_data.table <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) { # nolint
   ll <- lapply(
     names(x),
     function(v) {
-      mc <- x[, .(n = .N), by = v][order(n, decreasing = TRUE)][seq(max_n)]
+      vv <- x[[v]]
+      tc <- top_counts(vv, max_n = max_n)
 
-      type <- class(mc[[1]])[[1]]
-      is_num <- is_numeric(mc[[1]])
-      mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
+      type <- class(vv)[[1]]
+      is_num <- is_numeric(vv)
+      mf <- if (fast) {
+        NA_character_
+      } else {
+        format_most_frequent(tc$values, tc$counts, skip_ones = skip_ones, digits = digits)
+      }
 
       nz <- if (!is_num) nchar(as.character(x[[v]]))
 
@@ -143,15 +209,19 @@ describe_data.table <- function(x, max_n = 3) {
         var = v,
         type = type,
         n = .N,
-        n_distinct = data.table::uniqueN(get(v)),
+        n_distinct = if (fast) NA_integer_ else data.table::uniqueN(get(v)),
         n_na = sum(is.na(get(v))),
         most_frequent = mf,
 
         min = as.numeric(min(if (is_num) get(v) else nz, na.rm = TRUE)),
         mean = as.numeric(mean(if (is_num) get(v) else nz, na.rm = TRUE)),
-        median = as.numeric(median(if (is_num) get(v) else nz, na.rm = TRUE)),
+        median = if (fast) {
+          NA_real_
+        } else {
+          as.numeric(stats::median(if (is_num) get(v) else nz, na.rm = TRUE))
+        },
         max = as.numeric(max(if (is_num) get(v) else nz, na.rm = TRUE)),
-        sd = as.numeric(sd(if (is_num) get(v) else nz, na.rm = TRUE))
+        sd = as.numeric(stats::sd(if (is_num) get(v) else nz, na.rm = TRUE))
       )]
     }
   )
@@ -160,117 +230,393 @@ describe_data.table <- function(x, max_n = 3) {
 }
 
 
-# RSQLite, duckdb etc
-describe_sql <- function(x, max_n = 3) {
-  ll <- lapply(names(x), function(v) {
-    mc <- x |>
-      dplyr::count(.data[[v]]) |>
-      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
-      dplyr::collect()
+# RSQLite, duckdb, arrow etc
+describe_collectibles <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) {
+  colmeta <- describe_collectibles_colmeta(x)
+  vars <- colmeta$var
 
-    type <- class(mc[[1]])[[1]]
-    is_num <- is_numeric(mc[[1]])
-    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
-    nn <- x |>
-      dplyr::distinct(.data[[v]]) |>
-      dplyr::summarise(n = dplyr::n()) |>
-      dplyr::collect()
-    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> dplyr::collect() |> nrow()
+  stats_by_var <- describe_collectibles_stats_with_fallback(x, colmeta, fast = fast)
 
-    r <- dplyr::tibble(
-      var = v,
-      type = type,
-      n_distinct = nn[[1]],
-      n_na = nna,
-      most_frequent = mf
+  mf_by_var <- if (fast) {
+    stats::setNames(rep(NA_character_, length(vars)), vars)
+  } else if (max_n == 0L) {
+    stats::setNames(rep("", length(vars)), vars)
+  } else {
+    describe_collectibles_most_frequent(
+      x,
+      vars = vars,
+      max_n = max_n,
+      skip_ones = skip_ones,
+      digits = digits
     )
+  }
 
-    xx <- x |>
-      dplyr::select(dplyr::all_of(v)) |>
-      dplyr::rename(x := dplyr::all_of(v))
-    if (!is_num) xx <- xx |> dplyr::mutate(x = nchar(as.character(x)))
-
-
-    rr <- try(
-      xx |>
-        dplyr::summarise(
-          min = min(x, na.rm = TRUE),
-          mean = mean(x, na.rm = TRUE),
-          median = median(x, na.rm = TRUE),
-          max = max(x, na.rm = TRUE),
-          sd = sd(x, na.rm = TRUE)
-        ) |>
-        dplyr::collect(),
-      silent = TRUE
+  ll <- lapply(seq_len(nrow(colmeta)), function(i) {
+    v <- colmeta$var[[i]]
+    stats <- stats_by_var[[v]]
+    dplyr::tibble(
+      var = v,
+      type = colmeta$type[[i]],
+      n = stats$n,
+      n_distinct = stats$n_distinct,
+      n_na = stats$n_na,
+      most_frequent = mf_by_var[[v]],
+      min = stats$min,
+      mean = stats$mean,
+      median = stats$median,
+      max = stats$max,
+      sd = stats$sd
     )
-    if (inherits(rr, "try-error")) {
-      rr <- dplyr::tibble(
-        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
-        sd = NA_real_
-      )
-    }
-
-    dplyr::bind_cols(r, rr)
   })
 
   dplyr::bind_rows(ll)
 }
 
-# arrow::write_parquet(nycflights13::flights, "flights.parquet")
-# x <- arrow::open_dataset("flights.parquet")
-describe_arrow <- function(x, max_n = 3) {
-  # if x is a dbplyr connection string
-  ll <- lapply(names(x), function(v) {
-    mc <- x |>
-      dplyr::count(.data[[v]]) |>
-      dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
-      dplyr::collect()
 
-    type <- class(mc[[1]])[[1]]
-    is_num <- is_numeric(mc[[1]])
-    mf <- paste(sprintf("%s (%s)", mc[[1]], mc[[2]]), collapse = ", ")
-    nn <- x |>
-      dplyr::distinct(.data[[v]]) |>
-      dplyr::summarise(n = dplyr::n()) |>
+describe_collectibles_stats <- function(x, v, is_num, fast = FALSE) {
+  out <- try(
+    describe_collectibles_stats_single_query(x, v, is_num = is_num, fast = fast),
+    silent = TRUE
+  )
+  if (!inherits(out, "try-error")) return(out)
+
+  vals <- x |>
+    dplyr::select(dplyr::all_of(v)) |>
+    dplyr::collect() |>
+    dplyr::pull(1)
+  describe_vector_stats(vals, is_num = is_num, fast = fast)
+}
+
+
+describe_collectibles_stats_with_fallback <- function(x, colmeta, fast = FALSE) {
+  out <- try(
+    describe_collectibles_stats_batched(x, colmeta = colmeta, fast = fast),
+    silent = TRUE
+  )
+  if (!inherits(out, "try-error")) return(out)
+
+  out <- list()
+  failed <- character()
+  for (i in seq_len(nrow(colmeta))) {
+    v <- colmeta$var[[i]]
+    is_num <- colmeta$is_num[[i]]
+    stats <- try(describe_collectibles_stats_single_query(x, v, is_num = is_num, fast = fast),
+                 silent = TRUE)
+    if (inherits(stats, "try-error")) {
+      failed <- c(failed, v)
+    } else {
+      out[[v]] <- stats
+    }
+  }
+
+  if (length(failed)) {
+    vals <- x |>
+      dplyr::select(dplyr::all_of(failed)) |>
       dplyr::collect()
-    nna <- x |> dplyr::filter(is.na(.data[[v]])) |> nrow()
 
-    r <- dplyr::tibble(
-      var = v,
-      type = type,
-      n_distinct = nn[[1]],
-      n_na = nna,
-      most_frequent = mf
+    for (v in failed) {
+      is_num <- colmeta$is_num[[which(colmeta$var == v)[[1]]]]
+      out[[v]] <- describe_vector_stats(vals[[v]], is_num = is_num, fast = fast)
+    }
+  }
+
+  out[colmeta$var]
+}
+
+
+describe_collectibles_stats_batched <- function(x, colmeta, fast = FALSE) {
+  vars <- colmeta$var
+  num_vars <- colmeta$var[colmeta$is_num]
+  non_num_vars <- colmeta$var[!colmeta$is_num]
+
+  exprs <- list(
+    n = dplyr::n(),
+    dplyr::across(
+      dplyr::all_of(vars),
+      ~ sum(is.na(.x), na.rm = TRUE),
+      .names = "n_na__{.col}"
     )
+  )
+  if (!fast) {
+    exprs <- c(exprs, list(
+      dplyr::across(
+        dplyr::all_of(vars),
+        ~ dplyr::n_distinct(.x),
+        .names = "n_distinct__{.col}"
+      )
+    ))
+  }
 
-    if (is_num) {
-      xx <- x |> dplyr::transmute(x = get(v))
-    } else {
-      xx <- x |> dplyr::transmute(x = nchar(as.character(get(v))))
+  if (length(num_vars)) {
+    exprs <- c(exprs, list(
+      dplyr::across(dplyr::all_of(num_vars), ~ min(.x, na.rm = TRUE), .names = "min__{.col}"),
+      dplyr::across(dplyr::all_of(num_vars), ~ mean(.x, na.rm = TRUE), .names = "mean__{.col}"),
+      dplyr::across(dplyr::all_of(num_vars), ~ max(.x, na.rm = TRUE), .names = "max__{.col}"),
+      dplyr::across(dplyr::all_of(num_vars), ~ stats::sd(.x, na.rm = TRUE), .names = "sd__{.col}")
+    ))
+    if (!fast) {
+      exprs <- c(exprs, list(
+        dplyr::across(
+          dplyr::all_of(num_vars),
+          ~ stats::median(.x, na.rm = TRUE),
+          .names = "median__{.col}"
+        )
+      ))
     }
+  }
 
-    suppressWarnings({
-      rr <- try(
-        xx |>
-          dplyr::summarise(
-            min = min(x, na.rm = TRUE),
-            mean = mean(x, na.rm = TRUE),
-            median = median(x, na.rm = TRUE),
-            max = max(x, na.rm = TRUE),
-            sd = sd(x, na.rm = TRUE)
-          ) |>
-          dplyr::collect(),
-        silent = TRUE)
-    })
-    if (inherits(rr, "try-error")) {
-      rr <- dplyr::tibble(
-        min = NA_real_, mean = NA_real_, median = NA_real_, max = NA_real_,
-        sd = NA_real_
+  if (length(non_num_vars)) {
+    exprs <- c(exprs, list(
+      dplyr::across(
+        dplyr::all_of(non_num_vars),
+        ~ min(nchar(as.character(.x)), na.rm = TRUE),
+        .names = "min__{.col}"
+      ),
+      dplyr::across(
+        dplyr::all_of(non_num_vars),
+        ~ mean(nchar(as.character(.x)), na.rm = TRUE),
+        .names = "mean__{.col}"
+      ),
+      dplyr::across(
+        dplyr::all_of(non_num_vars),
+        ~ max(nchar(as.character(.x)), na.rm = TRUE),
+        .names = "max__{.col}"
+      ),
+      dplyr::across(
+        dplyr::all_of(non_num_vars),
+        ~ stats::sd(nchar(as.character(.x)), na.rm = TRUE),
+        .names = "sd__{.col}"
       )
+    ))
+    if (!fast) {
+      exprs <- c(exprs, list(
+        dplyr::across(
+          dplyr::all_of(non_num_vars),
+          ~ stats::median(nchar(as.character(.x)), na.rm = TRUE),
+          .names = "median__{.col}"
+        )
+      ))
     }
+  }
 
-    dplyr::bind_cols(r, rr)
+  summed <- dplyr::summarise(x, !!!exprs) |>
+    dplyr::collect()
+
+  n <- as.integer(summed$n[[1]])
+  out <- lapply(seq_len(nrow(colmeta)), function(i) {
+    v <- colmeta$var[[i]]
+    data.frame(
+      n = n,
+      n_distinct = if (fast) as.integer(NA) else as.integer(summed[[paste0("n_distinct__", v)]][[1]]),
+      n_na = as.integer(summed[[paste0("n_na__", v)]][[1]]),
+      min = as.numeric(summed[[paste0("min__", v)]][[1]]),
+      mean = as.numeric(summed[[paste0("mean__", v)]][[1]]),
+      median = if (fast) as.numeric(NA) else as.numeric(summed[[paste0("median__", v)]][[1]]),
+      max = as.numeric(summed[[paste0("max__", v)]][[1]]),
+      sd = as.numeric(summed[[paste0("sd__", v)]][[1]])
+    )
   })
+  names(out) <- colmeta$var
+  out
+}
 
-  dplyr::bind_rows(ll)
+
+describe_collectibles_stats_single_query <- function(x, v, is_num, fast = FALSE) {
+  if (is_num) {
+    if (fast) {
+      dplyr::summarise(
+        x,
+        n = dplyr::n(),
+        n_na = sum(is.na(.data[[v]]), na.rm = TRUE),
+        min = min(.data[[v]], na.rm = TRUE),
+        mean = mean(.data[[v]], na.rm = TRUE),
+        max = max(.data[[v]], na.rm = TRUE),
+        sd = stats::sd(.data[[v]], na.rm = TRUE)
+      ) |>
+        dplyr::collect() |>
+        coerce_collectibles_stats(fast = fast)
+    } else {
+      dplyr::summarise(
+        x,
+        n = dplyr::n(),
+        n_distinct = dplyr::n_distinct(.data[[v]]),
+        n_na = sum(is.na(.data[[v]]), na.rm = TRUE),
+        min = min(.data[[v]], na.rm = TRUE),
+        mean = mean(.data[[v]], na.rm = TRUE),
+        median = stats::median(.data[[v]], na.rm = TRUE),
+        max = max(.data[[v]], na.rm = TRUE),
+        sd = stats::sd(.data[[v]], na.rm = TRUE)
+      ) |>
+        dplyr::collect() |>
+        coerce_collectibles_stats(fast = fast)
+    }
+  } else {
+    if (fast) {
+      dplyr::summarise(
+        x,
+        n = dplyr::n(),
+        n_na = sum(is.na(.data[[v]]), na.rm = TRUE),
+        min = min(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        mean = mean(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        max = max(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        sd = stats::sd(nchar(as.character(.data[[v]])), na.rm = TRUE)
+      ) |>
+        dplyr::collect() |>
+        coerce_collectibles_stats(fast = fast)
+    } else {
+      dplyr::summarise(
+        x,
+        n = dplyr::n(),
+        n_distinct = dplyr::n_distinct(.data[[v]]),
+        n_na = sum(is.na(.data[[v]]), na.rm = TRUE),
+        min = min(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        mean = mean(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        median = stats::median(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        max = max(nchar(as.character(.data[[v]])), na.rm = TRUE),
+        sd = stats::sd(nchar(as.character(.data[[v]])), na.rm = TRUE)
+      ) |>
+        dplyr::collect() |>
+        coerce_collectibles_stats(fast = fast)
+    }
+  }
+}
+
+
+describe_collectibles_most_frequent <- function(x, vars, max_n = 3, skip_ones = TRUE, digits = 4) {
+  out <- stats::setNames(rep("", length(vars)), vars)
+  failed <- character()
+
+  for (v in vars) {
+    mc <- try(
+      x |>
+        dplyr::count(.data[[v]]) |>
+        dplyr::slice_max(n, n = max_n, with_ties = FALSE) |>
+        dplyr::collect(),
+      silent = TRUE
+    )
+    if (inherits(mc, "try-error")) {
+      failed <- c(failed, v)
+      next
+    }
+    out[[v]] <- format_most_frequent(mc[[1]], mc[[2]], skip_ones = skip_ones, digits = digits)
+  }
+
+  if (length(failed)) {
+    vals <- x |>
+      dplyr::select(dplyr::all_of(failed)) |>
+      dplyr::collect()
+    for (v in failed) {
+      tc <- top_counts(vals[[v]], max_n = max_n)
+      out[[v]] <- format_most_frequent(tc$values, tc$counts, skip_ones = skip_ones, digits = digits)
+    }
+  }
+
+  out
+}
+
+
+describe_collectibles_colmeta <- function(x) {
+  meta <- try(describe_collectibles_colmeta_from_arrow_schema(x), silent = TRUE)
+  if (!inherits(meta, "try-error")) return(meta)
+
+  proto <- dplyr::collect(utils::head(x, 0))
+  vars <- colnames(proto)
+  data.frame(
+    var = vars,
+    type = vapply(vars, function(v) class(proto[[v]])[[1]], character(1)),
+    is_num = vapply(vars, function(v) is_numeric(proto[[v]]), logical(1)),
+    stringsAsFactors = FALSE
+  )
+}
+
+
+describe_collectibles_colmeta_from_arrow_schema <- function(x) {
+  if (!is_arrow_collectible(x)) stop("Not an Arrow collectible")
+
+  schema <- try(x$schema, silent = TRUE)
+  if (inherits(schema, "try-error") || is.null(schema)) stop("No Arrow schema available")
+
+  vars <- try(schema$names, silent = TRUE)
+  fields <- try(schema$fields, silent = TRUE)
+  if (inherits(vars, "try-error") || inherits(fields, "try-error")) {
+    stop("Arrow schema metadata unavailable")
+  }
+  if (length(vars) != length(fields)) stop("Arrow schema mismatch")
+
+  type_strings <- vapply(fields, function(field) {
+    tt <- try(field$type$ToString(), silent = TRUE)
+    if (inherits(tt, "try-error")) return(NA_character_)
+    as.character(tt)
+  }, character(1))
+  if (anyNA(type_strings)) stop("Arrow type metadata unavailable")
+
+  type <- vapply(type_strings, map_arrow_type_to_r_type, character(1))
+  data.frame(
+    var = vars,
+    type = type,
+    is_num = type %in% c("integer", "numeric", "POSIXct"),
+    stringsAsFactors = FALSE
+  )
+}
+
+
+is_arrow_collectible <- function(x) {
+  any(class(x) %in% c("ArrowObject", "arrow_dplyr_query", "Dataset", "FileSystemDataset"))
+}
+
+
+map_arrow_type_to_r_type <- function(type_string) {
+  tt <- tolower(type_string)
+  if (grepl("dictionary", tt)) return("factor")
+  if (grepl("timestamp|date|time", tt)) return("POSIXct")
+  if (grepl("int", tt)) return("integer")
+  if (grepl("float|double|decimal", tt)) return("numeric")
+  if (grepl("bool", tt)) return("logical")
+  if (grepl("utf8|string|binary", tt)) return("character")
+  "character"
+}
+
+coerce_collectibles_stats <- function(stats, fast = FALSE) {
+  if ("n" %in% names(stats)) stats$n <- as.integer(stats$n)
+  if ("n_distinct" %in% names(stats)) stats$n_distinct <- as.integer(stats$n_distinct)
+  if ("n_na" %in% names(stats)) stats$n_na <- as.integer(stats$n_na)
+
+  stat_cols <- intersect(c("min", "mean", "median", "max", "sd"), names(stats))
+  for (nm in stat_cols) stats[[nm]] <- suppressWarnings(as.numeric(stats[[nm]]))
+
+  if (isTRUE(fast)) {
+    stats$n_distinct <- as.integer(NA)
+    stats$median <- as.numeric(NA)
+  }
+
+  stats
+}
+
+
+describe_vector_stats <- function(vv, is_num, fast = FALSE) {
+  nz <- if (!is_num) nchar(as.character(vv))
+  data.frame(
+    n = length(vv),
+    n_distinct = if (fast) NA_integer_ else length(unique(vv)),
+    n_na = sum(is.na(vv)),
+    min = as.numeric(min(if (is_num) vv else nz, na.rm = TRUE)),
+    mean = as.numeric(mean(if (is_num) vv else nz, na.rm = TRUE)),
+    median = if (fast) {
+      NA_real_
+    } else {
+      as.numeric(stats::median(if (is_num) vv else nz, na.rm = TRUE))
+    },
+    max = as.numeric(max(if (is_num) vv else nz, na.rm = TRUE)),
+    sd = as.numeric(stats::sd(if (is_num) vv else nz, na.rm = TRUE))
+  )
+}
+
+
+describe_sql <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) {
+  describe_collectibles(x, max_n = max_n, skip_ones = skip_ones, digits = digits, fast = fast)
+}
+
+
+describe_arrow <- function(x, max_n = 3, skip_ones = TRUE, digits = 4, fast = FALSE) {
+  describe_collectibles(x, max_n = max_n, skip_ones = skip_ones, digits = digits, fast = fast)
 }
diff --git a/R/filters.R b/R/filters.R
index e2c9582..97880bb 100644
--- a/R/filters.R
+++ b/R/filters.R
@@ -39,7 +39,12 @@ filter_fails <- function(res, x, per_rule = FALSE) {
       c("name", "expr", "allow_na", "negate", "pass", "fail", "tests") %in% names(res)
     ))
 
-    fails <- res$fail != 0
+    is_row_rule <- if ("check_type" %in% names(res)) {
+      res$check_type == "row_rule"
+    } else {
+      rep(TRUE, nrow(res))
+    }
+    fails <- is_row_rule & res$fail != 0
     eorig <- res$expr[fails]
     negated <- res$negate[fails]
     allow_na <- res$allow_na[fails]
diff --git a/R/im-export.R b/R/im-export.R
index 592c0a7..36b6816 100644
--- a/R/im-export.R
+++ b/R/im-export.R
@@ -1,49 +1,107 @@
-#' Read and write rules to a yaml file
-#'
-#' @param x a list of rules
-#' @param file a filename
-#'
-#' @return the filename invisibly
-#' @export
-#'
-#' @examples
-#' rr <- ruleset(
-#'   rule(mpg > 10),
-#'   rule(cyl %in% c(4, 6, 8))
-#' )
-#' file <- tempfile(fileext = ".yml")
-#' write_rules(rr, file)
-write_rules <- function(x, file) {
-  # if a single rule is supplied: wrap it in a list...
-  if (inherits(x, "rule")) x <- ruleset(x)
-
-  ftype <- gsub(".*\\.([^.]+)$", "\\1", file)
-  if (ftype %in% c("yml", "yaml")) {
-    yaml::write_yaml(x, file)
-  } else {
-    stop("At the moment only .yaml files are supported")
-  }
-}
-
-
-#' @describeIn write_rules reads a ruleset back in
-#' @export
-read_rules <- function(file) {
-  ftype <- gsub(".*\\.([^.]+)$", "\\1", file)
-  if (ftype %in% c("yml", "yaml")) {
-    res <- yaml::read_yaml(file)
-  } else {
-    stop("At the moment only .yaml files are supported")
-  }
-
-  res <- lapply(res, function(r) {
-    r$expr <- paste(r$expr, collapse = "\n")
-    class(r) <- "rule"
-    r
-  })
-  class(res) <- "ruleset"
-
-  # if a single rule was supplied: unpack it again
-  if (length(res) == 1) res <- res[[1]]
-  res
-}
+#' Read and write rules to a yaml file
+#'
+#' @param x a list of rules
+#' @param file a filename
+#' @param format output format. `"v1"` writes structured YAML with `meta`,
+#' `data-columns`, and `data-rules`. `"pre_v1"` keeps the pre package version 1.0
+#' flat-list structure.
+#'
+#' @return the filename invisibly
+#' @export
+#'
+#' @examples
+#' rr <- ruleset(
+#'   rule(mpg > 10),
+#'   rule(cyl %in% c(4, 6, 8))
+#' )
+#' file <- tempfile(fileext = ".yml")
+#' write_rules(rr, file)
+write_rules <- function(x, file, format = c("v1", "pre_v1")) {
+  # if a single rule is supplied: wrap it in a list...
+  if (inherits(x, "rule")) x <- ruleset(x)
+  if (identical(format, "legacy")) {
+    format <- "pre_v1"
+  } else {
+    format <- match.arg(format)
+  }
+
+  ftype <- gsub(".*\\.([^.]+)$", "\\1", file)
+  if (ftype %in% c("yml", "yaml")) {
+    if (format == "pre_v1") {
+      yaml::write_yaml(x, file)
+    } else {
+      out <- list(
+        meta = attr(x, "meta", exact = TRUE),
+        "data-columns" = lapply(attr(x, "data_columns", exact = TRUE), unclass),
+        "data-rules" = lapply(x, unclass)
+      )
+      yaml::write_yaml(out, file)
+    }
+  } else {
+    stop("At the moment only .yaml files are supported")
+  }
+}
+
+
+#' @describeIn write_rules reads a ruleset back in
+#' @export
+read_rules <- function(file) {
+  ftype <- gsub(".*\\.([^.]+)$", "\\1", file)
+  if (ftype %in% c("yml", "yaml")) {
+    res <- yaml::read_yaml(file)
+  } else {
+    stop("At the moment only .yaml files are supported")
+  }
+
+  if (is.list(res) && "data-rules" %in% names(res)) {
+    if (!"data-rules" %in% names(res)) {
+      stop("Malformed v1 yaml: missing `data-rules` section.")
+    }
+
+    rr <- lapply(res[["data-rules"]], function(r) {
+      r$expr <- paste(r$expr, collapse = "\n")
+      if (all(c("local_col", "ref_dataset", "ref_col") %in% names(r))) {
+        class(r) <- c("reference_rule", "rule")
+      } else {
+        class(r) <- "rule"
+      }
+      r
+    })
+    class(rr) <- "ruleset"
+
+    if ("meta" %in% names(res)) {
+      attr(rr, "meta") <- res[["meta"]]
+    }
+    if ("data-columns" %in% names(res)) {
+      attr(rr, "data_columns") <- lapply(res[["data-columns"]], function(dc) {
+        data_column(
+          col = dc$col,
+          type = if (is.null(dc$type)) NA_character_ else dc$type,
+          optional = if (is.null(dc$optional)) FALSE else dc$optional,
+          description = if (is.null(dc$description)) NA_character_ else dc$description
+        )
+      })
+    }
+    return(rr)
+  }
+
+  # detect malformed structured format (has sections but not data-rules)
+  is_error <- is.list(res) &&
+    any(c("meta", "data-columns", "data-rules") %in% names(res)) &&
+    !"data-rules" %in% names(res)
+
+  if (is_error) {
+    stop("Malformed v1 yaml: missing `data-rules` section.")
+  }
+
+  rr <- lapply(res, function(r) {
+    r$expr <- paste(r$expr, collapse = "\n")
+    class(r) <- "rule"
+    r
+  })
+  class(rr) <- "ruleset"
+
+  # if a single rule was supplied: unpack it again
+  if (length(rr) == 1) rr <- rr[[1]]
+  rr
+}
diff --git a/R/misc.R b/R/misc.R
index 5169ba8..e29a529 100644
--- a/R/misc.R
+++ b/R/misc.R
@@ -27,10 +27,10 @@ simple_table <- function(d, align = paste(rep("l", ncol(d)), collapse = "")) {
     s('    <th style="text-align:%s;">%s</th>', a, names(d)),
     "  </thead>",
     "  <tbody>",
-    paste(sapply(seq(nrow(d)), function(r) {
+    paste(sapply(seq_len(nrow(d)), function(r) {
       c(
         s('<tr class="%s">', ifelse(r %% 2 == 0, "even", "odd")),
-        paste(sapply(seq(ncol(d)), function(cc) {
+        paste(sapply(seq_len(ncol(d)), function(cc) {
           v <- d[r, cc]
           s('      <td style="text-align:%s;">%s%s%s</td>',
             a[cc],
diff --git a/R/rule.R b/R/rule.R
index 2a3c516..2d79b8e 100644
--- a/R/rule.R
+++ b/R/rule.R
@@ -1,131 +1,163 @@
-#' Creates a single data rule
-#'
-#' @param expr an expression which dictates which determines when a rule is good.
-#' Note that the expression is evaluated in `check_data()`, within the given
-#' framework. That means, for example if a the data given to `check_data()` is
-#' an `arrow` dataset, the expression must be mappable from `arrow` (see also
-#' [arrow documentation](https://arrow.apache.org/docs/r/reference/acero.html#function-mappings)).
-#' The expression can be given as a string as well.
-#' @param name an optional name for the rule for reference
-#' @param allow_na does the rule allow for NA values in the data? default value is FALSE.
-#' Note that when NAs are introduced in the expression, `allow_na` has no effect.
-#' Eg when the rule `as.numeric(vs) %in% c(0, 1)` finds the values of `vs` as
-#' `c("1", "A")`, the rule will throw a fail regardless of the value of `allow_na`
-#' as the NA is introduced in the expression and is not found in the original data.
-#' However, when the values of `vs` are `c("1", NA)`, `allow_na` will have an effect.
-#' @param negate is the rule negated, only applies to the expression not allow_na,
-#' that is, if `expr = mpg > 10`, `allow_na = TRUE`, and `negate = TRUE`, it would
-#' match all `mpg <= 10` as well as NAs.
-#' @param ... additional arguments that are carried along for your documentation,
-#' but are not used. Could be for example date, person, contact, comment, etc
-#'
-#' @return The rule values as a list
-#' @export
-#'
-#' @examples
-#' r <- rule(mpg > 10)
-#' r
-#'
-#' r2 <- rule(mpg > 10, name = "check that mpg is reasonable", allow_na = TRUE,
-#'            negate = FALSE, author = "me", date = Sys.Date())
-#' r2
-#'
-#' check_data(mtcars, r)
-#'
-#' rs <- ruleset(
-#'   rule(mpg > 10),
-#'   rule(cyl %in% c(4, 6)), # missing 8
-#'   rule(qsec >= 14.5 & qsec <= 22.9)
-#' )
-#' rs
-#' check_data(mtcars, rs)
-rule <- function(expr, name = NA, allow_na = FALSE, negate = FALSE, ...) {
-  expr <- paste(deparse(substitute(expr)), collapse = "")
-
-  # allows expressions as well as strings
-  if (substr(expr, 1, 1) == '"' &&
-      substr(expr, nchar(expr), nchar(expr)) == '"')
-    expr <- substr(expr, 2, nchar(expr) - 1)
-
-  if (is.na(name))
-    name <- paste("Rule for:", paste(get_symbols(expr), collapse = ", "))
-
-
-  ll <- list(
-    name = name,
-    expr = expr,
-    allow_na = allow_na,
-    negate = negate,
-    ...
-  )
-  class(ll) <- "rule"
-  ll
-}
-
-#' @describeIn rule Prints a rule
-#' @param x a rule to print
-#' @export
-print.rule <- function(x, ...) {
-  cat(sprintf("<Verification Rule>\n  expr: '%s'\n  name: '%s'\n  allow NA: %s\n  negated:  %s\n",
-              x$expr, x$name, x$allow_na, x$negate))
-  nn <- setdiff(names(x), c("expr", "name", "allow_na", "negate"))
-  for (n in nn) cat(sprintf("  %s: '%s'\n", n, x[[n]]))
-  return(invisible(x))
-}
-
-# small helper function to extract the symbols (var names) from an expression
-# (given as string)
-get_symbols <- function(expr) {
-  vv <- getParseData(parse(text = expr, keep.source = TRUE))
-  unique(vv[vv$token == "SYMBOL", "text"])
-}
-
-#' Creates a set of rules
-#'
-#' @param ... a list of rules
-#'
-#' @return the list of rules as a ruleset
-#' @export
-#'
-#' @examples
-#' r1 <- rule(mpg > 10)
-#' r2 <- rule(mpg < 20)
-#' rs <- ruleset(r1, r2)
-#' rs
-#'
-#' rs <- ruleset(
-#'   rule(cyl %in% c(4, 6, 8)),
-#'   rule(is.numeric(disp))
-#' )
-#' rs
-ruleset <- function(...) {
-  ll <- list(...)
-  ll <- lapply(seq_along(ll), function(i) {
-    l <- ll[[i]]
-    if (!"index" %in% names(l)) l$index <- i
-    l
-  })
-  class(ll) <- "ruleset"
-  ll
-}
-
-
-#' @describeIn ruleset Prints a ruleset
-#' @param x a ruleset to print
-#' @param n a maximum number of rules to print
-#' @export
-print.ruleset <- function(x, n = 3, ...) {
-  cat(sprintf("<Verification Ruleset with %s elements>\n", length(x)))
-
-  nn <- min(length(x), n)
-  for (r in x[seq(nn)]) {
-    cat(sprintf("  [%i] '%s' matching `%s` (allow_na: %s%s)\n",
-                r$index, r$name, r$expr, r$allow_na,
-                if (r$negate) ", negated" else ""))
-  }
-  if (nn != length(x))
-    cat(sprintf("  ... +%s more. Use print(ruleset, n = 10) to print more.\n",
-                length(x) - nn))
-  invisible(x)
-}
-
+#' Creates a single data rule
+#'
+#' @param expr an expression which dictates which determines when a rule is good.
+#' Note that the expression is evaluated in `check_data()`, within the given
+#' framework. That means, for example if a the data given to `check_data()` is
+#' an `arrow` dataset, the expression must be mappable from `arrow` (see also
+#' [arrow documentation](https://arrow.apache.org/docs/r/reference/acero.html#function-mappings)).
+#' The expression can be given as a string as well.
+#' @param name an optional name for the rule for reference
+#' @param allow_na does the rule allow for NA values in the data? default value is FALSE.
+#' Note that when NAs are introduced in the expression, `allow_na` has no effect.
+#' Eg when the rule `as.numeric(vs) %in% c(0, 1)` finds the values of `vs` as
+#' `c("1", "A")`, the rule will throw a fail regardless of the value of `allow_na`
+#' as the NA is introduced in the expression and is not found in the original data.
+#' However, when the values of `vs` are `c("1", NA)`, `allow_na` will have an effect.
+#' @param negate is the rule negated, only applies to the expression not allow_na,
+#' that is, if `expr = mpg > 10`, `allow_na = TRUE`, and `negate = TRUE`, it would
+#' match all `mpg <= 10` as well as NAs.
+#' @param ... additional arguments that are carried along for your documentation,
+#' but are not used. Could be for example date, person, contact, comment, etc
+#'
+#' @return The rule values as a list
+#' @export
+#'
+#' @examples
+#' r <- rule(mpg > 10)
+#' r
+#'
+#' r2 <- rule(mpg > 10, name = "check that mpg is reasonable", allow_na = TRUE,
+#'            negate = FALSE, author = "me", date = Sys.Date())
+#' r2
+#'
+#' check_data(mtcars, r)
+#'
+#' rs <- ruleset(
+#'   rule(mpg > 10),
+#'   rule(cyl %in% c(4, 6)), # missing 8
+#'   rule(qsec >= 14.5 & qsec <= 22.9)
+#' )
+#' rs
+#' check_data(mtcars, rs)
+rule <- function(expr, name = NA, allow_na = FALSE, negate = FALSE, ...) {
+  expr <- paste(deparse(substitute(expr)), collapse = "")
+
+  # allows expressions as well as strings
+  use_substr <- substr(expr, 1, 1) == '"' && substr(expr, nchar(expr), nchar(expr)) == '"'
+  if (use_substr) expr <- substr(expr, 2, nchar(expr) - 1)
+
+  if (is.na(name))
+    name <- paste("Rule for:", paste(get_symbols(expr), collapse = ", "))
+
+
+  ll <- list(
+    name = name,
+    expr = expr,
+    allow_na = allow_na,
+    negate = negate,
+    ...
+  )
+  class(ll) <- "rule"
+  ll
+}
+
+#' @describeIn rule Prints a rule
+#' @param x a rule to print
+#' @export
+print.rule <- function(x, ...) {
+  cat(sprintf("<Verification Rule>\n  expr: '%s'\n  name: '%s'\n  allow NA: %s\n  negated:  %s\n",
+              x$expr, x$name, x$allow_na, x$negate))
+  nn <- setdiff(names(x), c("expr", "name", "allow_na", "negate"))
+  for (n in nn) cat(sprintf("  %s: '%s'\n", n, x[[n]]))
+
+  invisible(x)
+}
+
+# small helper function to extract the symbols (var names) from an expression
+# (given as string)
+get_symbols <- function(expr) {
+  vv <- getParseData(parse(text = expr, keep.source = TRUE))
+  unique(vv[vv$token == "SYMBOL", "text"])
+}
+
+#' Creates a set of rules
+#'
+#' @param ... a list of rules
+#' @param data_columns optional list of schema declarations created with
+#' internal `data_column()` helper.
+#' @param meta optional metadata list for v1 YAML workflows.
+#' @param data_name optional name of the primary dataset when `check_data()`
+#' receives a named list of datasets.
+#'
+#' @return the list of rules as a ruleset
+#' @export
+#'
+#' @examples
+#' r1 <- rule(mpg > 10)
+#' r2 <- rule(mpg < 20)
+#' rs <- ruleset(r1, r2)
+#' rs
+#'
+#' rs <- ruleset(
+#'   rule(cyl %in% c(4, 6, 8)),
+#'   rule(is.numeric(disp))
+#' )
+#' rs
+#'
+#' # combine row, schema, and relational checks
+#' orders <- data.frame(order_id = 1:4, customer_id = c(10, 11, 99, NA), amount = c(10, 20, -5, 30))
+#' customers <- data.frame(customer_id = c(10, 11, 12))
+#'
+#' rs2 <- ruleset(
+#'   rule(amount >= 0, name = "amount must be non-negative"),
+#'   reference_rule(
+#'     local_col = "customer_id",
+#'     ref_dataset = "customers",
+#'     ref_col = "customer_id",
+#'     allow_na = TRUE
+#'   ),
+#'   data_columns = list(
+#'     data_column("order_id", type = "int", optional = FALSE),
+#'     data_column("customer_id", type = "int", optional = FALSE),
+#'     data_column("amount", type = "double", optional = FALSE)
+#'   ),
+#'   data_name = "orders"
+#' )
+#'
+#' check_data(list(orders = orders, customers = customers), rs2)
+ruleset <- function(..., data_columns = NULL, meta = NULL, data_name = NULL) {
+  ll <- list(...)
+  ll <- lapply(seq_along(ll), function(i) {
+    l <- ll[[i]]
+    if (!"index" %in% names(l)) l$index <- i
+    l
+  })
+
+  validate_data_columns(data_columns)
+  if (!is.null(data_columns)) attr(ll, "data_columns") <- data_columns
+  if (!is.null(meta)) attr(ll, "meta") <- meta
+  if (!is.null(data_name)) attr(ll, "data_name") <- data_name
+
+  class(ll) <- "ruleset"
+  ll
+}
+
+
+#' @describeIn ruleset Prints a ruleset
+#' @param x a ruleset to print
+#' @param n a maximum number of rules to print
+#' @export
+print.ruleset <- function(x, n = 3, ...) {
+  cat(sprintf("<Verification Ruleset with %s elements>\n", length(x)))
+
+  nn <- min(length(x), n)
+  for (r in x[seq(nn)]) {
+    cat(sprintf("  [%i] '%s' matching `%s` (allow_na: %s%s)\n",
+                r$index, r$name, r$expr, r$allow_na,
+                if (r$negate) ", negated" else ""))
+  }
+  if (nn != length(x))
+    cat(sprintf("  ... +%s more. Use print(ruleset, n = 10) to print more.\n",
+                length(x) - nn))
+  invisible(x)
+}
diff --git a/R/ruleset_construction.R b/R/ruleset_construction.R
index 0db3e4b..ccf4916 100644
--- a/R/ruleset_construction.R
+++ b/R/ruleset_construction.R
@@ -12,11 +12,11 @@ datavarifyr_plus <- function(a, b) {
   # [double dispatch](https://yutani.rbind.io/post/double-dispatch-of-s3-method/)
   # semantics.
 
-  if (inherits(a, "rule")  & inherits(b, "rule")) {
+  if (inherits(a, "rule") && inherits(b, "rule")) {
     out <- list(a, b)
   } else {
     if (inherits(a, "rule")) a <- list(a)
-    if (inherits(b,"rule")) b <- list(b)
+    if (inherits(b, "rule")) b <- list(b)
     out <- c(a, b)
   }
 
@@ -27,7 +27,8 @@ datavarifyr_plus <- function(a, b) {
   }
 
   class(out) <- "ruleset"
-  return(out)
+
+  out
 }
 
 #' @export
diff --git a/R/sample_data.R b/R/sample_data.R
new file mode 100644
index 0000000..acf8864
--- /dev/null
+++ b/R/sample_data.R
@@ -0,0 +1,31 @@
+#' Sample Orders Dataset for Examples and Tests
+#'
+#' A small, human-readable dataset with mixed column types, missing values, and
+#' one datetime column. It is designed for documentation examples and unit tests.
+#'
+#' @format A data frame with 8 rows and 6 variables:
+#' \describe{
+#'   \item{order_id}{Integer order identifier.}
+#'   \item{customer_tier}{Character tier (`"bronze"`, `"silver"`, `"gold"`, etc),
+#'     includes one `NA`.}
+#'   \item{amount}{Numeric order amount, includes one negative value and one `NA`.}
+#'   \item{paid}{Logical payment flag, includes one `NA`.}
+#'   \item{payment_method}{Character payment method, includes one `NA`.}
+#'   \item{order_time}{`POSIXct` order timestamp in UTC, includes one `NA`.}
+#' }
+#' @export
+#' @examples
+#' sample_data
+sample_data <- data.frame(
+  order_id = as.integer(1:8),
+  customer_tier = c("gold", "silver", "bronze", "gold", NA, "silver", "bronze", "unknown"),
+  amount = c(120.50, 80.00, -5.00, 320.25, 45.10, NA, 0.00, 99.99),
+  paid = c(TRUE, TRUE, FALSE, TRUE, FALSE, NA, TRUE, TRUE),
+  payment_method = c("card", "cash", "none", "card", "none", "card", NA, "none"),
+  order_time = as.POSIXct(c(
+    "2025-01-01 09:00:00", "2025-01-02 10:30:00", "2025-01-03 12:15:00",
+    "2025-01-04 15:45:00", NA, "2025-01-06 08:10:00",
+    "2025-01-07 17:20:00", "2025-01-08 11:05:00"
+  ), tz = "UTC"),
+  stringsAsFactors = FALSE
+)
diff --git a/R/zzz.R b/R/zzz.R
index 3ad402c..1a0e9a6 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -3,5 +3,5 @@
 #' @importFrom stats setNames
 NULL
 
-.datatable.aware <- TRUE
-globalVariables(c("n"))
+.datatable.aware <- TRUE # nolint
+globalVariables(c("n", ".", ".data", ".N", ":="))
diff --git a/README.Rmd b/README.Rmd
index 94cbf4a..37070e8 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -1,323 +1,544 @@
----
-output: github_document
----
-
-<!-- README.md is generated from README.Rmd. Please edit that file -->
-
-```{r, include = FALSE}
-knitr::opts_chunk$set(
-  collapse = TRUE,
-  comment = "#>",
-  fig.path = "man/figures/README-",
-  out.width = "100%"
-)
-Sys.setenv(LANG = "en")
-options(pillar.sigfig = 5, width = 100)
-```
-
-# `dataverifyr` - A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data
-
-
-<!-- badges: start -->
-
-[![](https://www.r-pkg.org/badges/version/dataverifyr)](https://www.r-pkg.org/pkg/dataverifyr) [![CRAN RStudio mirror downloads](https://cranlogs.r-pkg.org/badges/dataverifyr)](https://www.r-pkg.org/pkg/dataverifyr) [![R-CMD-check](https://github.com/DavZim/dataverifyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DavZim/dataverifyr/actions/workflows/R-CMD-check.yaml)
-
-<!-- badges: end -->
-
-The goal of `dataverifyr` is to allow a wide variety of flexible data validation checks (verifications).
-That means, you can specify a set of rules (R expressions) and compare any arbitrary dataset against it.
-
-The package is built in such a way, that it adapts to your type of data and choice of data package (data.frame, data.table, tibble, arrow, or SQL connection) and chooses the right data backend automatically, this is especially handy when large or complicated datasets are involved.
-That way, you can concentrate on writing the rules and making sure that your data is valid rather than spending time writing boilerplate code.
-
-The package is lightweight as all the heavy dependencies are Suggests-only, that means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc) unless you explicitly tell R to install all suggested packages as well when installing the package.
-
-The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages (see also `?detect_backend(data)`).
-By using the underlying technologies and handing over all evaluation of code to the backend, this package can deal with all sizes of data the backends can deal with.
-
-The package also has a helper function to describe a dataset, see `?describe()`.
-
-## Installation
-
-You can install the development version of `dataverifyr` like so:
-
-``` r
-# development version
-# devtools::install_github("DavZim/dataverifyr")
-
-# CRAN release
-install.packages("dataverifyr")
-```
-
-## Example
-
-This is a basic example which shows you how to
-
-1.  create a rule set manually, consisting of R expressions
-2.  check if a dataset matches all given rules
-3.  save and load the rules to a yaml-file for better maintainability
-
-Note that each rule is an R expression that is evaluated within the dataset.
-Our first rule, for example, states that we believe all values of the `mpg` variable are in the range 10 to 30 (exclusive).
-At the moment rules work in a window/vectorized approach only, that means that a rule like this will work `mpg > 10 * wt`, whereas a rule like this `sum(mpg) > 0` will not work as it aggregates values.
-
-```{r example, message=FALSE}
-library(dataverifyr)
-
-# create a dataset
-data <- mtcars
-
-# define a rule set within our R code; alternatively in a yaml file
-rules <- ruleset(
-  rule(mpg > 10 & mpg < 30), # mpg goes up to 34
-  rule(cyl %in% c(4, 8)), # missing 6 cyl
-  rule(vs %in% c(0, 1), allow_na = TRUE)
-)
-
-# print the rules
-rules
-
-# describe the dataset
-describe(data)
-
-# check if the data matches our rules
-res <- check_data(data, rules)
-res
-```
-
-As we can see, our dataset `mtcars` does not conform to all of our rules.
-We have four fails (fail=rule is not met) for the first rule `mpg > 10 & mpg < 30` (there are `mpg` values up to 33.9) and seven fails for the second rule `cyl %in% c(4, 8)` (there are `cyl` values of 6), while the third rule `vs %in% c(0, 1)` is always met.
-
-To see which values do not meet our expectations, use the `filter_fails()` function
-```{r filterfails}
-filter_fails(res, mtcars, per_rule = TRUE)
-```
-
-We can also visualize the results using the `plot_res()` function.
-
-```{r plotres}
-plot_res(res)
-```
-
-Note that you can also save and load a ruleset to and from a `yaml` file
-```{r yaml_rules}
-write_rules(rules, "example_rules.yaml")
-r2 <- read_rules("example_rules.yaml")
-identical(rules, r2)
-```
-The resulting `example_rules.yaml` looks like this
-```{r ex_yaml, results="asis", echo=FALSE}
-cat(paste(c("```yaml", readLines("example_rules.yaml"), "```"), collapse = "\n"))
-```
-
-One helpful use case is to use this functionality to assert that your data has the right values in a custom read function like so:
-
-```{r assert, eval=FALSE}
-read_custom <- function(file, rules) {
-  data <- read.csv(file) # or however you read in your data
-  # if the check_data detects a fail: the read_custom function will stop
-  check_data(data, rules, xname = file,
-             stop_on_fail = TRUE, stop_on_warn = TRUE, stop_on_error = TRUE)
-  # ...
-  data
-}
-# nothing happens when the data matches the rules
-data <- read_custom("correct_data.csv", rules)
-
-# an error is thrown when warnings or errors are found
-data <- read_custom("wrong_data.csv", rules)
-#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) : 
-#>   In dataset 'wrong_data.csv' found 2 rule fails, 1 warnings, 1 errors
-```
-
-
-## Backends
-
-At the moment the following backends are supported.
-Note that they are automatically chosen based on data type and package availability.
-Eg, when the dataset is a `dplyr::tbl()` connected to an `SQLite` database, the package will automatically choose `RSQLite`/`DBI`/`dbplyr` for the task.
-To see which backend `dataverifyr` would use for a task, you can use `detect_backend(data)`.
-
-```{r backends, echo=FALSE, results="asis"}
-# setup the table of backends in R... base markdown is not nice with this formatting...
-# create a data.frame needed for the table
-d <- function(b, d, c, cc, status = "✔️") data.frame(b, status, d, c, cc)
-# formats something as a code-block
-code <- function(x) paste0("```R\n", x, "\n```")
-
-r <- do.call(rbind, list(
-  d(
-    "`base-R`", "`data.frame`",
-    code("data <- data.frame(x = 1:10)\ncheck_data(data, rs)"),
-    "When `data.table` or `dplyr` are available, they are used for faster speeds."
-  ),
-  d(
-    "[`dplyr`](https://dplyr.tidyverse.org/)", "`tibble`",
-    code("library(dplyr)\ndata <- tibble(x = 1:10)\ncheck_data(data, rs)"),
-    ""
-  ),
-  d(
-    "[`data.table`](https://r-datatable.com)", "`data.table`",
-    code("library(data.table)\ndata <- data.table(x = 1:10)\ncheck_data(data, rs)"),
-    ""
-  ),
-  d(
-    "[`arrow`](https://arrow.apache.org/docs/r/)", "`Table`, `ArrowTabular`, `ArrowObject`",
-    code("library(arrow)\ndata <- arrow_table(x = 1:10)\n# Alternatively:\ndata <- read_parquet(\n  file,\n  as_data_frame = FALSE\n)\ncheck_data(data, rs)"),
-    ""
-  ),
-  d(
-    "[`arrow`](https://arrow.apache.org/docs/r/)", "`FileSystemDataset`, `Dataset`, `ArrowObject`",
-    code("library(arrow)\ndata <- open_dataset(dir)\ncheck_data(data, rs)"),
-    "Especially handy for large datasets"
-  ),
-  d(
-    "[`RSQLite`](https://rsqlite.r-dbi.org/), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_SQLiteConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
-    code('library(DBI)\ncon <- dbConnect(RSQLite::SQLite())\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con)'),
-    "Note that missing values are converted to `0` when using sqlite by default ([c.f. this SO answer](https://stackoverflow.com/a/57746647/3048453))"
-  ),
-  d(
-    "[`duckdb`](https://duckdb.org/docs/api/r.html), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_duckdb_connection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
-    code('library(DBI)\ncon <- dbConnect(duckdb::duckdb())\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con, shutdown = TRUE)'),
-    ""
-  ),
-  
-  d(
-    "[`RPostgres`](https://rpostgres.r-dbi.org/), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_PqConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
-    code('library(DBI)\ncon <- dbConnect(\n  RPostgres::Postgres(), \n  ...\n)\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con)'),
-"Not tested, but should work out-of-the-box using [`DBI`](https://dbi.r-dbi.org/)", status = "❓"
-  )
-))
-r <- setNames(r, c("Backend / Library", "Status", "Data Type", "Example Code", "Comment"))
-dataverifyr:::simple_table(r, align = "lclll")
-```
-
-Note that the `rs` object in the example code above refers to a `ruleset()`.
-Larger complete examples can be found below.
-
-
-## Larger Example using the `arrow` backend
-
-For a more involved example, using a different backend, let's say we have a larger dataset of taxi trips from NY (see also the official [source of the data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)), that we have saved as a local arrow dataset (using parquet as a data format), where we want to make sure that some variables are in-line with our expectations/rules.
-
-### 1 Download and Prepare Data
-
-First we prepare the data by downloading it and writing the dataset to `.parquet` files.
-This needs to be done only once and is shown for reproducibility reasons only, the actual `dataverifyr` code is shown below the next block
-
-```{r taxi1, eval=requireNamespace("arrow", quietly=TRUE), message=FALSE, warning=FALSE}
-library(arrow)
-url <- "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2018-01.parquet"
-file <- "yellow_tripdata_2018-01.parquet"
-if (!file.exists(file)) download.file(url, file, method = "curl")
-file.size(file) / 1e6 # in MB
-
-# quick check of the filesize
-d <- open_dataset(file)
-describe(d)
-
-# write the dataset to disk
-write_dataset(d, "nyc-taxi-data")
-```
-
-
-### 2 Create Rules in `yaml`
-
-Next, we can create some rules that we will use to check our data.
-As we saw earlier, we can create the rules in R using the `rule()` and `ruleset()` functions, there is however, the (in my opinion) preferred option to separate the code from the rules by writing the rules in a separate yaml file and reading them into R.
-
-First we display the hand-written contents of the `nyc_data_rules.yaml` file.
-
-```{r nycrules, echo=FALSE, results="asis"}
-rs <- ruleset(
-  rule(passenger_count >= 0 & passenger_count <= 10),
-  rule(trip_distance >= 0 & trip_distance <= 1000),
-  rule(payment_type %in% c(0, 1, 2, 3, 4))
-)
-write_rules(rs, "nyc_data_rules.yaml")
-cat(paste(c("```yaml", readLines("nyc_data_rules.yaml"), "```"),
-          collapse = "\n"))
-```
-
-Then, we can load, display, and finally check the rules against the data
-
-```{r taxi2, eval=requireNamespace("arrow", quietly=TRUE)}
-rules <- read_rules("nyc_data_rules.yaml")
-rules
-```
-
-### 3 Verify that the Data matches the given Rules
-
-Now we can check if the data follows our rules or if we have unexpected data points:
-
-```{r taxi3, eval=requireNamespace("arrow", quietly=TRUE)}
-# open the dataset 
-ds <- open_dataset("nyc-taxi-data/")
-
-# perform the data validation check
-res <- check_data(ds, rules)
-res
-
-plot_res(res)
-```
-
-Using the power of `arrow`, we were able to scan 8+mln observations for three rules in about 1.5 seconds (YMMV).
-As we can see from the results, there is one unexpected value, lets quickly investigate using the `filter_fails()` function, which filters a dataset for the failed rule matches
-
-```{r taxi4, eval=requireNamespace("arrow", quietly=TRUE)}
-res |>
-  filter_fails(ds) |> 
-  # only select a couple of variables for brevity
-  dplyr::select(tpep_pickup_datetime, tpep_dropoff_datetime, trip_distance)
-```
-
-As we can see, this is probably a data error (a trip distance of 190k miles in 1 minute seems - ehm stellar...).
-
-## Using a `DBI` Backend
-
-If you have a `SQLite` or `duckdb` database, you can use the package like this
-
-```{r duckdb, eval=requireNamespace("duckdb", quietly = TRUE) & requireNamespace("dplyr", quietly = TRUE) & requireNamespace("DBI", quietly = TRUE), message=FALSE, warning=FALSE}
-library(DBI)
-library(dplyr)
-
-# connect to a duckdb database
-con <- dbConnect(duckdb::duckdb("duckdb-database.duckdb"))
-# for demo purposes write the data once
-dbWriteTable(con, "mtcars", mtcars)
-
-# create a tbl connection, which can be used in the checks
-tbl <- tbl(con, "mtcars")
-
-# create rules
-rules <- ruleset(
-  rule(mpg > 10 & mpg < 30),
-  rule(cyl %in% c(4, 8)),
-  rule(vs %in% c(0, 1), allow_na = TRUE)
-)
-
-# check rules
-res <- check_data(tbl, rules)
-res
-
-filter_fails(res, tbl, per_rule = TRUE)
-
-# lastly disconnect from the database again
-dbDisconnect(con, shutdown = TRUE)
-```
-
-
-# Alternative Data Validation R Libraries
-
-If this library is not what you are looking for, the following might be good alternatives to validate your data:
-
-- [`pointblank`](https://rstudio.github.io/pointblank/)
-- [`validate`](https://github.com/data-cleaning/validate)
-- [`data.validator`](https://github.com/Appsilon/data.validator)
-
-```{r cleanup, echo=FALSE}
-unlink("example_rules.yaml")
-unlink("nyc-taxi-data", recursive = TRUE)
-unlink("nyc_data_rules.yaml")
-unlink("duckdb-database.duckdb")
-```
+---
+output: github_document
+---
+
+<!-- README.md is generated from README.Rmd. Please edit that file -->
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "man/figures/README-",
+  out.width = "100%"
+)
+Sys.setenv(LANG = "en")
+options(pillar.sigfig = 5, width = 100)
+```
+
+# `dataverifyr` - A Lightweight, Flexible, and Fast Data Validation Package that Can Handle All Sizes of Data
+
+
+<!-- badges: start -->
+
+[![](https://www.r-pkg.org/badges/version/dataverifyr)](https://www.r-pkg.org/pkg/dataverifyr) [![CRAN RStudio mirror downloads](https://cranlogs.r-pkg.org/badges/dataverifyr)](https://www.r-pkg.org/pkg/dataverifyr) [![R-CMD-check](https://github.com/DavZim/dataverifyr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/DavZim/dataverifyr/actions/workflows/R-CMD-check.yaml)
+
+<!-- badges: end -->
+
+The goal of `dataverifyr` is to allow a wide variety of flexible data validation checks (verifications).
+That means, you can specify a set of rules (R expressions) and compare any arbitrary dataset against it.
+
+The package is built in such a way, that it adapts to your type of data and choice of data package (data.frame, data.table, tibble, arrow, or SQL connection) and chooses the right data backend automatically, this is especially handy when large or complicated datasets are involved.
+That way, you can concentrate on writing the rules and making sure that your data is valid rather than spending time writing boilerplate code.
+
+The package is lightweight as all the heavy dependencies are Suggests-only, that means if you want to use `data.table` for the task, you don't need to install the other packages (`arrow`, `DBI`, etc) unless you explicitly tell R to install all suggested packages as well when installing the package.
+
+The backend for your analysis is automatically chosen based on the type of input dataset as well as the available packages (see also `?detect_backend(data)`).
+By using the underlying technologies and handing over all evaluation of code to the backend, this package can deal with all sizes of data the backends can deal with.
+
+The package also has a helper function to describe a dataset, see `?describe()`.
+
+## Installation
+
+You can install the development version of `dataverifyr` like so:
+
+``` r
+# development version
+# devtools::install_github("DavZim/dataverifyr")
+
+# CRAN release
+install.packages("dataverifyr")
+```
+
+## Example
+
+This is a basic example which shows you how to
+
+1.  create a rule set manually, consisting of R expressions
+2.  check if a dataset matches all given rules
+3.  save and load the rules to a yaml-file for better maintainability
+
+Note that each rule is an R expression that is evaluated within the dataset.
+Our first rule, for example, states that we expect all `amount` values to be in a sensible range.
+At the moment rules work in a window/vectorized approach only, that means that a rule like this will work `amount > 10 * as.numeric(paid)`, whereas a rule like this `sum(amount) > 0` will not work as it aggregates values.
+
+```{r example, message=FALSE}
+library(dataverifyr)
+
+# use the packaged demo dataset (includes NAs + datetime)
+data <- sample_data
+
+# define a rule set within our R code; alternatively in a yaml file
+rules <- ruleset(
+  rule(amount >= 0 & amount <= 10000, name = "amount in valid range"),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier"),
+  rule(!paid | payment_method != "none", name = "paid orders need payment method")
+)
+
+# print the rules
+rules
+
+# describe the dataset
+describe(data)
+
+# check if the data matches our rules
+res <- check_data(data, rules)
+res
+```
+
+As we can see, this demo dataset does not conform to all of our rules.
+There is one negative amount, one unknown customer tier, and one row where `paid == TRUE` but `payment_method == "none"`.
+The dataset also contains missing values and a datetime column (`order_time`) to mirror common real data inputs.
+
+To see which values do not meet our expectations, use the `filter_fails()` function
+```{r filterfails}
+filter_fails(res, data, per_rule = TRUE)
+```
+
+We can also visualize the results using the `plot_res()` function.
+
+```{r plotres}
+plot_res(res)
+```
+
+Note that you can also save and load a ruleset to and from a structured `yaml` file
+```{r yaml_rules}
+write_rules(rules, "example_rules_v1.yaml", format = "v1")
+r2 <- read_rules("example_rules_v1.yaml")
+identical(rules, r2)
+```
+The resulting `example_rules_v1.yaml` looks like this
+```{r ex_yaml, results="asis", echo=FALSE}
+cat(paste(c("```yaml", readLines("example_rules_v1.yaml"), "```"), collapse = "\n"))
+```
+
+`dataverifyr` uses structured v1 YAML (`meta` + `data-columns` + `data-rules`) in the main workflow.
+
+One helpful use case is to use this functionality to assert that your data has the right values in a custom read function like so:
+
+```{r assert, eval=FALSE}
+read_custom <- function(file, rules) {
+  data <- read.csv(file) # or however you read in your data
+  # if the check_data detects a fail: the read_custom function will stop
+  check_data(data, rules, xname = file,
+             stop_on_fail = TRUE, stop_on_warn = TRUE, stop_on_error = TRUE)
+  # ...
+  data
+}
+# nothing happens when the data matches the rules
+data <- read_custom("correct_data.csv", rules)
+
+# an error is thrown when warnings or errors are found
+data <- read_custom("wrong_data.csv", rules)
+#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) :
+#>   In dataset 'wrong_data.csv' found 2 rule fails, 1 warnings, 1 errors
+```
+
+## Row-Based Checks vs Column-Based Checks
+
+`dataverifyr` supports two complementary layers of validation:
+
+1.  **Row-based checks** with `rule()`  
+    These answer questions like: "Is each value in a valid range?" or
+    "Does each row satisfy a logical condition?".
+2.  **Column-based checks** with `data_columns` in `ruleset()`  
+    These answer questions like: "Does this column exist?", "Is it required?",
+    and "Does it have the expected type?".
+
+The important idea is:
+
+- `rule()` is about **values inside rows**.
+- `data_column()` is about the **declared structure (schema)** of the dataset.
+
+### Row-Based Rules (value checks)
+
+```{r row_based}
+library(dataverifyr)
+
+row_rules <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier", allow_na = TRUE),
+  rule(!paid | payment_method != "none", name = "paid orders require payment method", allow_na = TRUE)
+)
+
+x <- sample_data
+
+check_data(x, row_rules)
+```
+
+The result tells you, for each rule, how many rows passed/failed and whether warnings/errors occurred during evaluation.
+
+### Column-Based Rules (schema checks)
+
+Column checks are attached to the `ruleset()` via `data_columns`.
+
+```{r column_based}
+schema_rules <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE, description = "Primary key"),
+    data_column("customer_tier", type = "str", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE),
+    data_column("paid", type = "logical", optional = FALSE),
+    data_column("payment_method", type = "str", optional = FALSE),
+    data_column("order_time", optional = TRUE)
+  )
+)
+
+x_ok <- sample_data
+
+check_data(x_ok, schema_rules)
+```
+
+In this setup:
+
+- `order_id`, `customer_tier`, `amount`, `paid`, and `payment_method` must exist (`optional = FALSE`)
+- `order_time` is optional in the schema declaration
+- row rules still run as usual (`amount > 0`)
+
+### Handling extra columns
+
+If your input has columns not declared in `data_columns`, use `extra_columns`:
+
+```{r extra_columns}
+x_extra <- sample_data
+x_extra$unexpected_col <- c("a", "b", "c", "d", "e", "f", "g", "h")
+
+# default: ignore undeclared columns
+check_data(x_extra, schema_rules, extra_columns = "ignore")
+
+# warn when undeclared columns are present
+try(check_data(x_extra, schema_rules, extra_columns = "warn"))
+
+# fail immediately when undeclared columns are present
+try(check_data(x_extra, schema_rules, extra_columns = "fail"))
+```
+
+### Missing required columns
+
+```{r missing_required}
+x_missing <- sample_data[, setdiff(names(sample_data), "payment_method")]
+
+try(check_data(x_missing, schema_rules))
+```
+
+### Relational Rules (cross-dataset checks)
+
+You can also validate relationships between datasets, for example ensuring
+foreign keys in one dataset exist in a lookup table.
+
+```{r relational_rules}
+flights <- data.frame(carrier = c("AA", "BB", NA_character_))
+carriers <- data.frame(carrier_id = c("AA"))
+
+rel_rules <- ruleset(
+  reference_rule(
+    local_col = "carrier",
+    ref_dataset = "carriers",
+    ref_col = "carrier_id",
+    name = "carrier exists in carriers",
+    allow_na = TRUE
+  ),
+  data_name = "flights"
+)
+
+check_data(
+  list(
+    flights = flights,
+    carriers = carriers
+  ),
+  rel_rules
+)
+```
+
+This returns a `reference_rule` row in `check_type`, so relational checks are
+visible in the same output table as schema and row rules.
+
+### Structured YAML (`v1`) for schema + rules
+
+`dataverifyr` supports a structured YAML format that separates metadata, schema, and rules.
+
+```{r yaml_v1}
+schema_rules_v1 <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE),
+    data_column("customer_tier", type = "str", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE),
+    data_column("paid", type = "logical", optional = FALSE),
+    data_column("payment_method", type = "str", optional = FALSE),
+    data_column("order_time", optional = TRUE)
+  ),
+  meta = dataverifyr:::rule_meta(
+    title = "Order Validation",
+    version = "1.0",
+    description = "Checks for order exports"
+  )
+)
+
+write_rules(schema_rules_v1, "example_rules_v1.yaml", format = "v1")
+
+rules_back <- read_rules("example_rules_v1.yaml")
+rules_back
+```
+
+Structured v1 example:
+```{r ex_yaml_v1, results="asis", echo=FALSE}
+cat(paste(c("```yaml", readLines("example_rules_v1.yaml"), "```"), collapse = "\n"))
+```
+
+## Backends
+
+At the moment the following backends are supported.
+Note that they are automatically chosen based on data type and package availability.
+Eg, when the dataset is a `dplyr::tbl()` connected to an `SQLite` database, the package will automatically choose `RSQLite`/`DBI`/`dbplyr` for the task.
+To see which backend `dataverifyr` would use for a task, you can use `detect_backend(data)`.
+
+Important: many backend packages are optional (`Suggests`) and may not be installed in all environments (for example CI runners, documentation builders, or minimal local setups). In particular, examples requiring `arrow`, `duckdb`, `DBI`, or `dplyr` may be shown but not executed unless those packages are available.
+
+```{r backends, echo=FALSE, results="asis"}
+# setup the table of backends in R... base markdown is not nice with this formatting...
+# create a data.frame needed for the table
+d <- function(b, d, c, cc, status = "✔️") data.frame(b, status, d, c, cc)
+# formats something as a code-block
+code <- function(x) paste0("```R\n", x, "\n```")
+
+r <- do.call(rbind, list(
+  d(
+    "`base-R`", "`data.frame`",
+    code("data <- data.frame(x = 1:10)\ncheck_data(data, rs)"),
+    "When `data.table` or `dplyr` are available, they are used for faster speeds."
+  ),
+  d(
+    "[`dplyr`](https://dplyr.tidyverse.org/)", "`tibble`",
+    code("library(dplyr)\ndata <- tibble(x = 1:10)\ncheck_data(data, rs)"),
+    ""
+  ),
+  d(
+    "[`data.table`](https://r-datatable.com)", "`data.table`",
+    code("library(data.table)\ndata <- data.table(x = 1:10)\ncheck_data(data, rs)"),
+    ""
+  ),
+  d(
+    "[`arrow`](https://arrow.apache.org/docs/r/)", "`Table`, `ArrowTabular`, `ArrowObject`",
+    code("library(arrow)\ndata <- arrow_table(x = 1:10)\n# Alternatively:\ndata <- read_parquet(\n  file,\n  as_data_frame = FALSE\n)\ncheck_data(data, rs)"),
+    ""
+  ),
+  d(
+    "[`arrow`](https://arrow.apache.org/docs/r/)", "`FileSystemDataset`, `Dataset`, `ArrowObject`",
+    code("library(arrow)\ndata <- open_dataset(dir)\ncheck_data(data, rs)"),
+    "Especially handy for large datasets"
+  ),
+  d(
+    "[`RSQLite`](https://rsqlite.r-dbi.org/), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_SQLiteConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
+    code("library(DBI)\ncon <- dbConnect(RSQLite::SQLite())\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con)"),
+    "Note that missing values are converted to `0` when using sqlite by default ([c.f. this SO answer](https://stackoverflow.com/a/57746647/3048453))"
+  ),
+  d(
+    "[`duckdb`](https://duckdb.org/docs/api/r.html), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_duckdb_connection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
+    code("library(DBI)\ncon <- dbConnect(duckdb::duckdb())\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con, shutdown = TRUE)"),
+    ""
+  ),
+
+  d(
+    "[`RPostgres`](https://rpostgres.r-dbi.org/), [`DBI`](https://dbi.r-dbi.org/), and [`dbplyr`](https://dbplyr.tidyverse.org/)", "`tbl_PqConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`",
+    code("library(DBI)\ncon <- dbConnect(\n  RPostgres::Postgres(), \n  ...\n)\n# dbWriteTable(con, tablename, data)\ntbl <- dplyr::tbl(con, tablename)\ncheck_data(tbl, rs)\n\ndbDisconnect(con)"),
+    "Not tested, but should work out-of-the-box using [`DBI`](https://dbi.r-dbi.org/)", status = "❓"
+  )
+))
+r <- setNames(r, c("Backend / Library", "Status", "Data Type", "Example Code", "Comment"))
+dataverifyr:::simple_table(r, align = "lclll")
+```
+
+Note that the `rs` object in the example code above refers to a `ruleset()`.
+Larger complete examples can be found below.
+
+
+## Larger Example using the `arrow` backend
+
+For a more involved example, using a different backend, let's say we have a larger dataset of taxi trips from NY (see also the official [source of the data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)), that we have saved as a local arrow dataset (using parquet as a data format), where we want to make sure that some variables are in-line with our expectations/rules.
+
+### 1 Download and Prepare Data
+
+First we prepare the data by downloading it and writing the dataset to `.parquet` files.
+This needs to be done only once and is shown for reproducibility reasons only, the actual `dataverifyr` code is shown below the next block
+
+```{r taxi1, eval=FALSE}
+library(arrow)
+url <- "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2018-01.parquet"
+pq_file <- "yellow_tripdata_2018-01.parquet"
+if (!file.exists(pq_file)) download.file(url, file, method = "curl")
+file.size(pq_file) / 1e6 # in MB
+#> [1] 123.6685
+
+# quick check of the filesize and the structure of the file
+d <- open_dataset(pq_file)
+describe(d, fast = TRUE)
+#> # A tibble: 19 × 11
+#>    var                   type            n n_distinct    n_na most_frequent        min      mean median     max       sd
+#>    <chr>                 <chr>       <int>      <int>   <int> <chr>              <dbl>     <dbl>  <dbl>   <dbl>    <dbl>
+#>  1 VendorID              integer   8760687         NA       0 <NA>             1   e+0   1.56e+0     NA  2   e0  4.96e-1
+#>  2 tpep_pickup_datetime  POSIXct   8760687         NA       0 <NA>             9.79e+8   1.52e+9     NA  1.53e9  1.06e+6
+#>  3 tpep_dropoff_datetime POSIXct   8760687         NA       0 <NA>             9.79e+8   1.52e+9     NA  1.53e9  1.06e+6
+#>  4 passenger_count       integer   8760687         NA       0 <NA>             0         1.61e+0     NA  9   e0  1.26e+0
+#>  5 trip_distance         numeric   8760687         NA       0 <NA>             0         2.80e+0     NA  1.89e5  6.41e+1
+#>  6 RatecodeID            integer   8760687         NA       0 <NA>             1   e+0   1.04e+0     NA  9.9 e1  4.45e-1
+#>  7 store_and_fwd_flag    character 8760687         NA 8760687 <NA>            NA       NaN           NA NA      NA
+#>  8 PULocationID          integer   8760687         NA       0 <NA>             1   e+0   1.64e+2     NA  2.65e2  6.64e+1
+#>  9 DOLocationID          integer   8760687         NA       0 <NA>             1   e+0   1.63e+2     NA  2.65e2  7.03e+1
+#> 10 payment_type          integer   8760687         NA       0 <NA>             1   e+0   1.31e+0     NA  4   e0  4.82e-1
+#> 11 fare_amount           numeric   8760687         NA       0 <NA>            -4.5 e+2   1.22e+1     NA  8.02e3  1.17e+1
+#> 12 extra                 numeric   8760687         NA       0 <NA>            -4.47e+1   3.25e-1     NA  6   e1  4.50e-1
+#> 13 mta_tax               numeric   8760687         NA       0 <NA>            -5   e-1   4.98e-1     NA  4.55e1  4.33e-2
+#> 14 tip_amount            numeric   8760687         NA       0 <NA>            -8.88e+1   1.82e+0     NA  4.42e2  2.49e+0
+#> 15 tolls_amount          numeric   8760687         NA       0 <NA>            -1.5 e+1   3.03e-1     NA  9.51e2  1.74e+0
+#> 16 improvement_surcharge numeric   8760687         NA       0 <NA>            -3   e-1   3.00e-1     NA  1   e0  1.44e-2
+#> 17 total_amount          numeric   8760687         NA       0 <NA>            -4.50e+2   1.55e+1     NA  8.02e3  1.42e+1
+#> 18 congestion_surcharge  numeric   8760687         NA 8760675 <NA>             2.5 e+0   2.5 e+0     NA  2.5 e0  0
+#> 19 airport_fee           numeric   8760687         NA 8760675 <NA>             0         0           NA  0       0
+
+# write the dataset to disk
+if (!dir.exists("nyc-taxi-data")) write_dataset(d, "nyc-taxi-data")
+```
+
+
+### 2 Create Rules in `yaml`
+
+Next, we can create some rules that we will use to check our data.
+As we saw earlier, we can create the rules in R using the `rule()` and `ruleset()` functions, there is however, the (in my opinion) preferred option to separate the code from the rules by writing the rules in a separate yaml file and reading them into R.
+
+First we create and write the rules to a `nyc_data_rules.yaml` file, note for larger rulesets, you would most likely write the rules directly in the yml file.
+
+```{r nycrules}
+rs <- ruleset(
+  rule(passenger_count >= 0 & passenger_count <= 10),
+  rule(trip_distance >= 0 & trip_distance <= 1000),
+  rule(payment_type %in% c(0, 1, 2, 3, 4))
+)
+write_rules(rs, "nyc_data_rules.yaml")
+```
+
+Which looks like this in the yaml file:
+```{r nycrules2, results="asis", echo=FALSE}
+cat(paste(c("```yaml", readLines("nyc_data_rules.yaml"), "```"),
+          collapse = "\n"))
+```
+
+Then, we can load, display, and finally check the rules against the data
+
+```{r taxi2}
+rules <- read_rules("nyc_data_rules.yaml")
+rules
+```
+
+### 3 Verify that the Data matches the given Rules
+
+Now we can check if the data follows our rules or if we have unexpected data points:
+
+```{r taxi3, eval=FALSE}
+library(arrow)
+
+# open the dataset
+ds <- open_dataset(pq_file)
+
+# perform the data validation check
+res <- check_data(ds, rules)
+res
+#> # A tibble: 3 × 11
+#>   check_type name                      expr                                         allow_na negate   tests    pass  fail warn  error time
+#>   <chr>      <chr>                     <chr>                                        <lgl>    <lgl>    <int>   <int> <int> <chr> <chr> <drtn>
+#> 1 row_rule   Rule for: passenger_count passenger_count >= 0 & passenger_count <= 10 FALSE    FALSE  8760687 8760687     0 ""    ""    0.4102955 secs
+#> 2 row_rule   Rule for: trip_distance   trip_distance >= 0 & trip_distance <= 1000   FALSE    FALSE  8760687 8760686     1 ""    ""    0.3988464 secs
+#> 3 row_rule   Rule for: payment_type    payment_type %in% c(0, 1, 2, 3, 4)           FALSE    FALSE  8760687 8760687     0 ""    ""    0.3578835 secs
+
+plot_res(res)
+```
+
+```{r, eval=FALSE, echo=FALSE}
+png("man/figures/README-taxi3-1.png", height = 400, width = 500)
+plot_res(res)
+dev.off()
+```
+
+```{r, echo=FALSE}
+knitr::include_graphics("man/figures/README-taxi3-1.png")
+```
+
+Using the power of `arrow`, we were able to scan 8+mln observations for three rules in about 1.5 seconds (YMMV).
+As we can see from the results, there is one unexpected value, lets quickly investigate using the `filter_fails()` function, which filters a dataset for the failed rule matches
+
+```{r taxi4, eval=FALSE}
+res |>
+  filter_fails(ds) |>
+  # only select a couple of variables for brevity
+  dplyr::select(tpep_pickup_datetime, tpep_dropoff_datetime, trip_distance)
+#> # A tibble: 1 × 3
+#>   tpep_pickup_datetime tpep_dropoff_datetime trip_distance
+#>   <dttm>               <dttm>                        <dbl>
+#> 1 2018-01-30 11:41:02  2018-01-30 11:42:09         189484.
+```
+
+As we can see, this is probably a data error (a trip distance of 190k miles in 1 minute seems - ehm stellar...).
+
+## Using a `DBI` Backend
+
+If you have a `SQLite` or `duckdb` database, you can use the package like this
+
+```{r duckdb, eval=FALSE, message=FALSE, warning=FALSE}
+library(DBI)
+library(dplyr)
+
+# connect to a duckdb database
+con <- dbConnect(duckdb::duckdb("duckdb-database.duckdb"))
+# for demo purposes write sample_data once
+dbWriteTable(con, "orders", sample_data)
+
+# create a tbl connection, which can be used in the checks
+tbl <- tbl(con, "orders")
+
+# create rules
+rules <- ruleset(
+  rule(amount >= 0 & amount <= 10000, name = "amount in valid range"),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier"),
+  rule(!paid | payment_method != "none", name = "paid orders need payment method")
+)
+
+# check rules
+res <- check_data(tbl, rules)
+res
+
+filter_fails(res, tbl, per_rule = TRUE)
+
+# lastly disconnect from the database again
+dbDisconnect(con, shutdown = TRUE)
+```
+
+## Pre Package Version 1.0 YAML (Compatibility)
+
+`dataverifyr` still supports the pre package version 1.0 flat-list YAML format for compatibility with existing rule files.
+
+```{r yaml_pre_v1}
+write_rules(rules, "example_rules_pre_v1.yaml", format = "pre_v1")
+rules_pre_v1 <- read_rules("example_rules_pre_v1.yaml")
+identical(rules, rules_pre_v1)
+```
+
+Pre package version 1.0 YAML example:
+```{r ex_yaml_pre_v1_bottom, results="asis", echo=FALSE}
+cat(paste(c("```yaml", readLines("example_rules_pre_v1.yaml"), "```"), collapse = "\n"))
+```
+
+
+# Alternative Data Validation R Libraries
+
+If this library is not what you are looking for, the following might be good alternatives to validate your data:
+
+- [`pointblank`](https://rstudio.github.io/pointblank/)
+- [`validate`](https://github.com/data-cleaning/validate)
+- [`data.validator`](https://github.com/Appsilon/data.validator)
+
+```{r cleanup, echo=FALSE}
+unlink("example_rules_v1.yaml")
+unlink("example_rules_pre_v1.yaml")
+unlink("nyc_data_rules.yaml")
+unlink("duckdb-database.duckdb")
+```
diff --git a/README.md b/README.md
index f7583f7..bc5a3b4 100644
--- a/README.md
+++ b/README.md
@@ -59,95 +59,95 @@ This is a basic example which shows you how to
 3.  save and load the rules to a yaml-file for better maintainability
 
 Note that each rule is an R expression that is evaluated within the
-dataset. Our first rule, for example, states that we believe all values
-of the `mpg` variable are in the range 10 to 30 (exclusive). At the
-moment rules work in a window/vectorized approach only, that means that
-a rule like this will work `mpg > 10 * wt`, whereas a rule like this
-`sum(mpg) > 0` will not work as it aggregates values.
+dataset. Our first rule, for example, states that we expect all `amount`
+values to be in a sensible range. At the moment rules work in a
+window/vectorized approach only, that means that a rule like this will
+work `amount > 10 * as.numeric(paid)`, whereas a rule like this
+`sum(amount) > 0` will not work as it aggregates values.
 
 ``` r
 library(dataverifyr)
 
-# create a dataset
-data <- mtcars
+# use the packaged demo dataset (includes NAs + datetime)
+data <- sample_data
 
 # define a rule set within our R code; alternatively in a yaml file
 rules <- ruleset(
-  rule(mpg > 10 & mpg < 30), # mpg goes up to 34
-  rule(cyl %in% c(4, 8)), # missing 6 cyl
-  rule(vs %in% c(0, 1), allow_na = TRUE)
+  rule(amount >= 0 & amount <= 10000, name = "amount in valid range"),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier"),
+  rule(!paid | payment_method != "none", name = "paid orders need payment method")
 )
 
 # print the rules
 rules
 #> <Verification Ruleset with 3 elements>
-#>   [1] 'Rule for: mpg' matching `mpg > 10 & mpg < 30` (allow_na: FALSE)
-#>   [2] 'Rule for: cyl' matching `cyl %in% c(4, 8)` (allow_na: FALSE)
-#>   [3] 'Rule for: vs' matching `vs %in% c(0, 1)` (allow_na: TRUE)
+#>   [1] 'amount in valid range' matching `amount >= 0 & amount <= 10000` (allow_na: FALSE)
+#>   [2] 'known customer tier' matching `customer_tier %in% c("bronze", "silver", "gold")` (allow_na: FALSE)
+#>   [3] 'paid orders need payment method' matching `!paid | payment_method != "none"` (allow_na: FALSE)
 
 # describe the dataset
 describe(data)
-#>      var    type  n n_distinct n_na                  most_frequent    min       mean  median
-#>  1:  mpg numeric 32         25    0     21 (2), 22.8 (2), 21.4 (2) 10.400  20.090625  19.200
-#>  2:  cyl numeric 32          3    0          8 (14), 4 (11), 6 (7)  4.000   6.187500   6.000
-#>  3: disp numeric 32         27    0    275.8 (3), 160 (2), 360 (2) 71.100 230.721875 196.300
-#>  4:   hp numeric 32         22    0      110 (3), 175 (3), 180 (3) 52.000 146.687500 123.000
-#>  5: drat numeric 32         22    0    3.92 (3), 3.07 (3), 3.9 (2)  2.760   3.596563   3.695
-#>  6:   wt numeric 32         29    0   3.44 (3), 3.57 (2), 2.62 (1)  1.513   3.217250   3.325
-#>  7: qsec numeric 32         30    0 17.02 (2), 18.9 (2), 16.46 (1) 14.500  17.848750  17.710
-#>  8:   vs numeric 32          2    0        0 (18), 1 (14), NA (NA)  0.000   0.437500   0.000
-#>  9:   am numeric 32          2    0        0 (19), 1 (13), NA (NA)  0.000   0.406250   0.000
-#> 10: gear numeric 32          3    0          3 (15), 4 (12), 5 (5)  3.000   3.687500   4.000
-#> 11: carb numeric 32          6    0          4 (10), 2 (10), 1 (7)  1.000   2.812500   2.000
-#>         max          sd
-#>  1:  33.900   6.0269481
-#>  2:   8.000   1.7859216
-#>  3: 472.000 123.9386938
-#>  4: 335.000  68.5628685
-#>  5:   4.930   0.5346787
-#>  6:   5.424   0.9784574
-#>  7:  22.900   1.7869432
-#>  8:   1.000   0.5040161
-#>  9:   1.000   0.4989909
-#> 10:   5.000   0.7378041
-#> 11:   8.000   1.6152000
+#>               var      type     n n_distinct  n_na                    most_frequent        min
+#>            <char>    <char> <int>      <int> <int>                           <char>      <num>
+#> 1:       order_id   integer     8          8     0                                           1
+#> 2:  customer_tier character     8          5     1 gold (2), silver (2), bronze (2)          4
+#> 3:         amount   numeric     8          8     1                                          -5
+#> 4:           paid   logical     8          3     1              TRUE (5), FALSE (2)          4
+#> 5: payment_method character     8          4     1               card (3), none (3)          4
+#> 6:     order_time   POSIXct     8          8     1                                  1735722000
+#>            mean       median          max           sd
+#>           <num>        <num>        <num>        <num>
+#> 1: 4.500000e+00          4.5 8.000000e+00 2.449490e+00
+#> 2: 5.571429e+00          6.0 7.000000e+00 1.133893e+00
+#> 3: 9.440571e+01         80.0 3.202500e+02 1.104161e+02
+#> 4: 4.285714e+00          4.0 5.000000e+00 4.879500e-01
+#> 5: 4.000000e+00          4.0 4.000000e+00 0.000000e+00
+#> 6: 1.736029e+09 1736005500.0 1.736334e+09 2.318349e+05
 
 # check if the data matches our rules
 res <- check_data(data, rules)
 res
-#>             name                expr allow_na negate tests pass fail warn error              time
-#> 1: Rule for: mpg mpg > 10 & mpg < 30    FALSE  FALSE    32   28    4            0.0010831356 secs
-#> 2: Rule for: cyl    cyl %in% c(4, 8)    FALSE  FALSE    32   25    7            0.0033519268 secs
-#> 3:  Rule for: vs     vs %in% c(0, 1)     TRUE  FALSE    32   32    0            0.0005369186 secs
+#>    check_type                            name                                             expr
+#>        <char>                          <char>                                           <char>
+#> 1:   row_rule           amount in valid range                    amount >= 0 & amount <= 10000
+#> 2:   row_rule             known customer tier customer_tier %in% c("bronze", "silver", "gold")
+#> 3:   row_rule paid orders need payment method                 !paid | payment_method != "none"
+#>    allow_na negate tests  pass  fail   warn  error             time
+#>      <lgcl> <lgcl> <int> <int> <int> <char> <char>       <difftime>
+#> 1:    FALSE  FALSE     8     6     2               0.001636744 secs
+#> 2:    FALSE  FALSE     8     6     2               0.003160954 secs
+#> 3:    FALSE  FALSE     8     6     2               0.000418663 secs
 ```
 
-As we can see, our dataset `mtcars` does not conform to all of our
-rules. We have four fails (fail=rule is not met) for the first rule
-`mpg > 10 & mpg < 30` (there are `mpg` values up to 33.9) and seven
-fails for the second rule `cyl %in% c(4, 8)` (there are `cyl` values of
-6), while the third rule `vs %in% c(0, 1)` is always met.
+As we can see, this demo dataset does not conform to all of our rules.
+There is one negative amount, one unknown customer tier, and one row
+where `paid == TRUE` but `payment_method == "none"`. The dataset also
+contains missing values and a datetime column (`order_time`) to mirror
+common real data inputs.
 
 To see which values do not meet our expectations, use the
 `filter_fails()` function
 
 ``` r
-filter_fails(res, mtcars, per_rule = TRUE)
-#> $`mpg > 10 & mpg < 30`
-#>     mpg cyl disp  hp drat    wt  qsec vs am gear carb
-#> 1: 32.4   4 78.7  66 4.08 2.200 19.47  1  1    4    1
-#> 2: 30.4   4 75.7  52 4.93 1.615 18.52  1  1    4    2
-#> 3: 33.9   4 71.1  65 4.22 1.835 19.90  1  1    4    1
-#> 4: 30.4   4 95.1 113 3.77 1.513 16.90  1  1    5    2
+filter_fails(res, data, per_rule = TRUE)
+#> $`amount >= 0 & amount <= 10000`
+#>    order_id customer_tier amount   paid payment_method          order_time
+#>       <int>        <char>  <num> <lgcl>         <char>              <POSc>
+#> 1:        3        bronze     -5  FALSE           none 2025-01-03 12:15:00
+#> 2:        6        silver     NA     NA           card 2025-01-06 08:10:00
 #> 
-#> $`cyl %in% c(4, 8)`
-#>     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
-#> 1: 21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
-#> 2: 21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
-#> 3: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
-#> 4: 18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
-#> 5: 19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
-#> 6: 17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
-#> 7: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
+#> $`customer_tier %in% c("bronze", "silver", "gold")`
+#>    order_id customer_tier amount   paid payment_method          order_time
+#>       <int>        <char>  <num> <lgcl>         <char>              <POSc>
+#> 1:        5          <NA>  45.10  FALSE           none                <NA>
+#> 2:        8       unknown  99.99   TRUE           none 2025-01-08 11:05:00
+#> 
+#> $`!paid | payment_method != "none"`
+#>    order_id customer_tier amount   paid payment_method          order_time
+#>       <int>        <char>  <num> <lgcl>         <char>              <POSc>
+#> 1:        6        silver     NA     NA           card 2025-01-06 08:10:00
+#> 2:        7        bronze   0.00   TRUE           <NA> 2025-01-07 17:20:00
+#> 3:        8       unknown  99.99   TRUE           none 2025-01-08 11:05:00
 ```
 
 We can also visualize the results using the `plot_res()` function.
@@ -156,37 +156,44 @@ We can also visualize the results using the `plot_res()` function.
 plot_res(res)
 ```
 
-<img src="man/figures/README-plotres-1.png" width="100%" />
+<img src="man/figures/README-plotres-1.png" alt="" width="100%" />
 
-Note that you can also save and load a ruleset to and from a `yaml` file
+Note that you can also save and load a ruleset to and from a structured
+`yaml` file
 
 ``` r
-write_rules(rules, "example_rules.yaml")
-r2 <- read_rules("example_rules.yaml")
+write_rules(rules, "example_rules_v1.yaml", format = "v1")
+r2 <- read_rules("example_rules_v1.yaml")
 identical(rules, r2)
-#> [1] TRUE
+#> [1] FALSE
 ```
 
-The resulting `example_rules.yaml` looks like this
+The resulting `example_rules_v1.yaml` looks like this
 
 ``` yaml
-- name: 'Rule for: mpg'
-  expr: mpg > 10 & mpg < 30
+meta: ~
+data-columns: []
+data-rules:
+- name: amount in valid range
+  expr: amount >= 0 & amount <= 10000
   allow_na: no
   negate: no
   index: 1
-- name: 'Rule for: cyl'
-  expr: cyl %in% c(4, 8)
+- name: known customer tier
+  expr: customer_tier %in% c("bronze", "silver", "gold")
   allow_na: no
   negate: no
   index: 2
-- name: 'Rule for: vs'
-  expr: vs %in% c(0, 1)
-  allow_na: yes
+- name: paid orders need payment method
+  expr: '!paid | payment_method != "none"'
+  allow_na: no
   negate: no
   index: 3
 ```
 
+`dataverifyr` uses structured v1 YAML (`meta` + `data-columns` +
+`data-rules`) in the main workflow.
+
 One helpful use case is to use this functionality to assert that your
 data has the right values in a custom read function like so:
 
@@ -204,10 +211,390 @@ data <- read_custom("correct_data.csv", rules)
 
 # an error is thrown when warnings or errors are found
 data <- read_custom("wrong_data.csv", rules)
-#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) : 
+#> Error in check_data(data, rules, stop_on_fail = TRUE, stop_on_error = TRUE, stop_on_warn = TRUE) :
 #>   In dataset 'wrong_data.csv' found 2 rule fails, 1 warnings, 1 errors
 ```
 
+## Row-Based Checks vs Column-Based Checks
+
+`dataverifyr` supports two complementary layers of validation:
+
+1.  **Row-based checks** with `rule()`  
+    These answer questions like: “Is each value in a valid range?” or
+    “Does each row satisfy a logical condition?”.
+2.  **Column-based checks** with `data_columns` in `ruleset()`  
+    These answer questions like: “Does this column exist?”, “Is it
+    required?”, and “Does it have the expected type?”.
+
+The important idea is:
+
+- `rule()` is about **values inside rows**.
+- `data_column()` is about the **declared structure (schema)** of the
+  dataset.
+
+### Row-Based Rules (value checks)
+
+``` r
+library(dataverifyr)
+
+row_rules <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier", allow_na = TRUE),
+  rule(!paid | payment_method != "none", name = "paid orders require payment method", allow_na = TRUE)
+)
+
+x <- sample_data
+
+check_data(x, row_rules)
+#>    check_type                               name                                             expr
+#>        <char>                             <char>                                           <char>
+#> 1:   row_rule                 amount is positive                                       amount > 0
+#> 2:   row_rule                known customer tier customer_tier %in% c("bronze", "silver", "gold")
+#> 3:   row_rule paid orders require payment method                 !paid | payment_method != "none"
+#>    allow_na negate tests  pass  fail   warn  error              time
+#>      <lgcl> <lgcl> <int> <int> <int> <char> <char>        <difftime>
+#> 1:     TRUE  FALSE     8     6     2               0.0005500317 secs
+#> 2:     TRUE  FALSE     8     7     1               0.0004150867 secs
+#> 3:     TRUE  FALSE     8     7     1               0.0004053116 secs
+```
+
+The result tells you, for each rule, how many rows passed/failed and
+whether warnings/errors occurred during evaluation.
+
+### Column-Based Rules (schema checks)
+
+Column checks are attached to the `ruleset()` via `data_columns`.
+
+``` r
+schema_rules <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE, description = "Primary key"),
+    data_column("customer_tier", type = "str", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE),
+    data_column("paid", type = "logical", optional = FALSE),
+    data_column("payment_method", type = "str", optional = FALSE),
+    data_column("order_time", optional = TRUE)
+  )
+)
+
+x_ok <- sample_data
+
+check_data(x_ok, schema_rules)
+#>     check_type                                           name
+#>         <char>                                         <char>
+#>  1:     schema               Schema: column 'order_id' exists
+#>  2:     schema       Schema: column 'order_id' has type 'int'
+#>  3:     schema          Schema: column 'customer_tier' exists
+#>  4:     schema  Schema: column 'customer_tier' has type 'str'
+#>  5:     schema                 Schema: column 'amount' exists
+#>  6:     schema      Schema: column 'amount' has type 'double'
+#>  7:     schema                   Schema: column 'paid' exists
+#>  8:     schema       Schema: column 'paid' has type 'logical'
+#>  9:     schema         Schema: column 'payment_method' exists
+#> 10:     schema Schema: column 'payment_method' has type 'str'
+#> 11:     schema             Schema: column 'order_time' exists
+#> 12:   row_rule                             amount is positive
+#>                                       expr allow_na negate tests  pass  fail   warn  error
+#>                                     <char>   <lgcl> <lgcl> <int> <int> <int> <char> <char>
+#>  1:              column_exists('order_id')    FALSE  FALSE     1     1     0              
+#>  2:       column_type('order_id') == 'int'    FALSE  FALSE     1     1     0              
+#>  3:         column_exists('customer_tier')    FALSE  FALSE     1     1     0              
+#>  4:  column_type('customer_tier') == 'str'    FALSE  FALSE     1     1     0              
+#>  5:                column_exists('amount')    FALSE  FALSE     1     1     0              
+#>  6:      column_type('amount') == 'double'    FALSE  FALSE     1     1     0              
+#>  7:                  column_exists('paid')    FALSE  FALSE     1     1     0              
+#>  8:       column_type('paid') == 'logical'    FALSE  FALSE     1     1     0              
+#>  9:        column_exists('payment_method')    FALSE  FALSE     1     1     0              
+#> 10: column_type('payment_method') == 'str'    FALSE  FALSE     1     1     0              
+#> 11:            column_exists('order_time')    FALSE  FALSE     1     1     0              
+#> 12:                             amount > 0     TRUE  FALSE     8     6     2              
+#>                  time
+#>            <difftime>
+#>  1: 0.0000000000 secs
+#>  2: 0.0000000000 secs
+#>  3: 0.0000000000 secs
+#>  4: 0.0000000000 secs
+#>  5: 0.0000000000 secs
+#>  6: 0.0000000000 secs
+#>  7: 0.0000000000 secs
+#>  8: 0.0000000000 secs
+#>  9: 0.0000000000 secs
+#> 10: 0.0000000000 secs
+#> 11: 0.0000000000 secs
+#> 12: 0.0003836155 secs
+```
+
+In this setup:
+
+- `order_id`, `customer_tier`, `amount`, `paid`, and `payment_method`
+  must exist (`optional = FALSE`)
+- `order_time` is optional in the schema declaration
+- row rules still run as usual (`amount > 0`)
+
+### Handling extra columns
+
+If your input has columns not declared in `data_columns`, use
+`extra_columns`:
+
+``` r
+x_extra <- sample_data
+x_extra$unexpected_col <- c("a", "b", "c", "d", "e", "f", "g", "h")
+
+# default: ignore undeclared columns
+check_data(x_extra, schema_rules, extra_columns = "ignore")
+#>     check_type                                           name
+#>         <char>                                         <char>
+#>  1:     schema               Schema: column 'order_id' exists
+#>  2:     schema       Schema: column 'order_id' has type 'int'
+#>  3:     schema          Schema: column 'customer_tier' exists
+#>  4:     schema  Schema: column 'customer_tier' has type 'str'
+#>  5:     schema                 Schema: column 'amount' exists
+#>  6:     schema      Schema: column 'amount' has type 'double'
+#>  7:     schema                   Schema: column 'paid' exists
+#>  8:     schema       Schema: column 'paid' has type 'logical'
+#>  9:     schema         Schema: column 'payment_method' exists
+#> 10:     schema Schema: column 'payment_method' has type 'str'
+#> 11:     schema             Schema: column 'order_time' exists
+#> 12:   row_rule                             amount is positive
+#>                                       expr allow_na negate tests  pass  fail   warn  error
+#>                                     <char>   <lgcl> <lgcl> <int> <int> <int> <char> <char>
+#>  1:              column_exists('order_id')    FALSE  FALSE     1     1     0              
+#>  2:       column_type('order_id') == 'int'    FALSE  FALSE     1     1     0              
+#>  3:         column_exists('customer_tier')    FALSE  FALSE     1     1     0              
+#>  4:  column_type('customer_tier') == 'str'    FALSE  FALSE     1     1     0              
+#>  5:                column_exists('amount')    FALSE  FALSE     1     1     0              
+#>  6:      column_type('amount') == 'double'    FALSE  FALSE     1     1     0              
+#>  7:                  column_exists('paid')    FALSE  FALSE     1     1     0              
+#>  8:       column_type('paid') == 'logical'    FALSE  FALSE     1     1     0              
+#>  9:        column_exists('payment_method')    FALSE  FALSE     1     1     0              
+#> 10: column_type('payment_method') == 'str'    FALSE  FALSE     1     1     0              
+#> 11:            column_exists('order_time')    FALSE  FALSE     1     1     0              
+#> 12:                             amount > 0     TRUE  FALSE     8     6     2              
+#>                 time
+#>           <difftime>
+#>  1: 0.000000000 secs
+#>  2: 0.000000000 secs
+#>  3: 0.000000000 secs
+#>  4: 0.000000000 secs
+#>  5: 0.000000000 secs
+#>  6: 0.000000000 secs
+#>  7: 0.000000000 secs
+#>  8: 0.000000000 secs
+#>  9: 0.000000000 secs
+#> 10: 0.000000000 secs
+#> 11: 0.000000000 secs
+#> 12: 0.000389576 secs
+
+# warn when undeclared columns are present
+try(check_data(x_extra, schema_rules, extra_columns = "warn"))
+#> Warning in validate_rules_against_schema(x, rules, extra_columns = extra_columns): Found extra
+#> columns not declared in `data_columns`: unexpected_col
+#>     check_type                                           name
+#>         <char>                                         <char>
+#>  1:     schema               Schema: column 'order_id' exists
+#>  2:     schema       Schema: column 'order_id' has type 'int'
+#>  3:     schema          Schema: column 'customer_tier' exists
+#>  4:     schema  Schema: column 'customer_tier' has type 'str'
+#>  5:     schema                 Schema: column 'amount' exists
+#>  6:     schema      Schema: column 'amount' has type 'double'
+#>  7:     schema                   Schema: column 'paid' exists
+#>  8:     schema       Schema: column 'paid' has type 'logical'
+#>  9:     schema         Schema: column 'payment_method' exists
+#> 10:     schema Schema: column 'payment_method' has type 'str'
+#> 11:     schema             Schema: column 'order_time' exists
+#> 12:   row_rule                             amount is positive
+#>                                       expr allow_na negate tests  pass  fail   warn  error
+#>                                     <char>   <lgcl> <lgcl> <int> <int> <int> <char> <char>
+#>  1:              column_exists('order_id')    FALSE  FALSE     1     1     0              
+#>  2:       column_type('order_id') == 'int'    FALSE  FALSE     1     1     0              
+#>  3:         column_exists('customer_tier')    FALSE  FALSE     1     1     0              
+#>  4:  column_type('customer_tier') == 'str'    FALSE  FALSE     1     1     0              
+#>  5:                column_exists('amount')    FALSE  FALSE     1     1     0              
+#>  6:      column_type('amount') == 'double'    FALSE  FALSE     1     1     0              
+#>  7:                  column_exists('paid')    FALSE  FALSE     1     1     0              
+#>  8:       column_type('paid') == 'logical'    FALSE  FALSE     1     1     0              
+#>  9:        column_exists('payment_method')    FALSE  FALSE     1     1     0              
+#> 10: column_type('payment_method') == 'str'    FALSE  FALSE     1     1     0              
+#> 11:            column_exists('order_time')    FALSE  FALSE     1     1     0              
+#> 12:                             amount > 0     TRUE  FALSE     8     6     2              
+#>                  time
+#>            <difftime>
+#>  1: 0.0000000000 secs
+#>  2: 0.0000000000 secs
+#>  3: 0.0000000000 secs
+#>  4: 0.0000000000 secs
+#>  5: 0.0000000000 secs
+#>  6: 0.0000000000 secs
+#>  7: 0.0000000000 secs
+#>  8: 0.0000000000 secs
+#>  9: 0.0000000000 secs
+#> 10: 0.0000000000 secs
+#> 11: 0.0000000000 secs
+#> 12: 0.0003600121 secs
+
+# fail immediately when undeclared columns are present
+try(check_data(x_extra, schema_rules, extra_columns = "fail"))
+#> Error in validate_rules_against_schema(x, rules, extra_columns = extra_columns) : 
+#>   Found extra columns not declared in `data_columns`: unexpected_col
+```
+
+### Missing required columns
+
+``` r
+x_missing <- sample_data[, setdiff(names(sample_data), "payment_method")]
+
+try(check_data(x_missing, schema_rules))
+#>     check_type                                          name                                  expr
+#>         <char>                                        <char>                                <char>
+#>  1:     schema              Schema: column 'order_id' exists             column_exists('order_id')
+#>  2:     schema      Schema: column 'order_id' has type 'int'      column_type('order_id') == 'int'
+#>  3:     schema         Schema: column 'customer_tier' exists        column_exists('customer_tier')
+#>  4:     schema Schema: column 'customer_tier' has type 'str' column_type('customer_tier') == 'str'
+#>  5:     schema                Schema: column 'amount' exists               column_exists('amount')
+#>  6:     schema     Schema: column 'amount' has type 'double'     column_type('amount') == 'double'
+#>  7:     schema                  Schema: column 'paid' exists                 column_exists('paid')
+#>  8:     schema      Schema: column 'paid' has type 'logical'      column_type('paid') == 'logical'
+#>  9:     schema        Schema: column 'payment_method' exists       column_exists('payment_method')
+#> 10:     schema            Schema: column 'order_time' exists           column_exists('order_time')
+#> 11:   row_rule                            amount is positive                            amount > 0
+#>     allow_na negate tests  pass  fail   warn                                        error
+#>       <lgcl> <lgcl> <int> <int> <int> <char>                                       <char>
+#>  1:    FALSE  FALSE     1     1     0                                                    
+#>  2:    FALSE  FALSE     1     1     0                                                    
+#>  3:    FALSE  FALSE     1     1     0                                                    
+#>  4:    FALSE  FALSE     1     1     0                                                    
+#>  5:    FALSE  FALSE     1     1     0                                                    
+#>  6:    FALSE  FALSE     1     1     0                                                    
+#>  7:    FALSE  FALSE     1     1     0                                                    
+#>  8:    FALSE  FALSE     1     1     0                                                    
+#>  9:    FALSE  FALSE     1     0     1        Required column 'payment_method' is missing.
+#> 10:    FALSE  FALSE     1     1     0                                                    
+#> 11:     TRUE  FALSE     8     6     2                                                    
+#>                  time
+#>            <difftime>
+#>  1: 0.0000000000 secs
+#>  2: 0.0000000000 secs
+#>  3: 0.0000000000 secs
+#>  4: 0.0000000000 secs
+#>  5: 0.0000000000 secs
+#>  6: 0.0000000000 secs
+#>  7: 0.0000000000 secs
+#>  8: 0.0000000000 secs
+#>  9: 0.0000000000 secs
+#> 10: 0.0000000000 secs
+#> 11: 0.0006048679 secs
+```
+
+### Relational Rules (cross-dataset checks)
+
+You can also validate relationships between datasets, for example
+ensuring foreign keys in one dataset exist in a lookup table.
+
+``` r
+flights <- data.frame(carrier = c("AA", "BB", NA_character_))
+carriers <- data.frame(carrier_id = c("AA"))
+
+rel_rules <- ruleset(
+  reference_rule(
+    local_col = "carrier",
+    ref_dataset = "carriers",
+    ref_col = "carrier_id",
+    name = "carrier exists in carriers",
+    allow_na = TRUE
+  ),
+  data_name = "flights"
+)
+
+check_data(
+  list(
+    flights = flights,
+    carriers = carriers
+  ),
+  rel_rules
+)
+#>        check_type                       name                             expr allow_na negate tests
+#>            <char>                     <char>                           <char>   <lgcl> <lgcl> <int>
+#> 1: reference_rule carrier exists in carriers carrier %in% carriers$carrier_id     TRUE  FALSE     3
+#>     pass  fail   warn  error              time
+#>    <int> <int> <char> <char>        <difftime>
+#> 1:     2     1               5.149841e-05 secs
+```
+
+This returns a `reference_rule` row in `check_type`, so relational
+checks are visible in the same output table as schema and row rules.
+
+### Structured YAML (`v1`) for schema + rules
+
+`dataverifyr` supports a structured YAML format that separates metadata,
+schema, and rules.
+
+``` r
+schema_rules_v1 <- ruleset(
+  rule(amount > 0, name = "amount is positive", allow_na = TRUE),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE),
+    data_column("customer_tier", type = "str", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE),
+    data_column("paid", type = "logical", optional = FALSE),
+    data_column("payment_method", type = "str", optional = FALSE),
+    data_column("order_time", optional = TRUE)
+  ),
+  meta = dataverifyr:::rule_meta(
+    title = "Order Validation",
+    version = "1.0",
+    description = "Checks for order exports"
+  )
+)
+
+write_rules(schema_rules_v1, "example_rules_v1.yaml", format = "v1")
+
+rules_back <- read_rules("example_rules_v1.yaml")
+rules_back
+#> <Verification Ruleset with 1 elements>
+#>   [1] 'amount is positive' matching `amount > 0` (allow_na: TRUE)
+```
+
+Structured v1 example:
+
+``` yaml
+meta:
+  title: Order Validation
+  version: '1.0'
+  description: Checks for order exports
+data-columns:
+- col: order_id
+  type: int
+  optional: no
+  description: .na.character
+- col: customer_tier
+  type: str
+  optional: no
+  description: .na.character
+- col: amount
+  type: double
+  optional: no
+  description: .na.character
+- col: paid
+  type: logical
+  optional: no
+  description: .na.character
+- col: payment_method
+  type: str
+  optional: no
+  description: .na.character
+- col: order_time
+  type: .na.character
+  optional: yes
+  description: .na.character
+data-rules:
+- name: amount is positive
+  expr: amount > 0
+  allow_na: yes
+  negate: no
+  index: 1
+```
+
 ## Backends
 
 At the moment the following backends are supported. Note that they are
@@ -217,39 +604,64 @@ the package will automatically choose `RSQLite`/`DBI`/`dbplyr` for the
 task. To see which backend `dataverifyr` would use for a task, you can
 use `detect_backend(data)`.
 
+Important: many backend packages are optional (`Suggests`) and may not
+be installed in all environments (for example CI runners, documentation
+builders, or minimal local setups). In particular, examples requiring
+`arrow`, `duckdb`, `DBI`, or `dplyr` may be shown but not executed
+unless those packages are available.
+
 <table>
+
 <thead class="header">
+
 <th style="text-align:left;">
+
 Backend / Library
 </th>
+
 <th style="text-align:center;">
+
 Status
 </th>
+
 <th style="text-align:left;">
+
 Data Type
 </th>
+
 <th style="text-align:left;">
+
 Example Code
 </th>
+
 <th style="text-align:left;">
+
 Comment
 </th>
+
 </thead>
+
 <tbody>
+
 <tr class="odd">
+
 <td style="text-align:left;">
 
 `base-R`
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `data.frame`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -258,27 +670,35 @@ check_data(data, rs)
 ```
 
 </td>
+
 <td style="text-align:left;">
 
 When `data.table` or `dplyr` are available, they are used for faster
 speeds.
 
 </td>
+
 </tr>
+
 <tr class="even">
+
 <td style="text-align:left;">
 
 [`dplyr`](https://dplyr.tidyverse.org/)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `tibble`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -288,23 +708,32 @@ check_data(data, rs)
 ```
 
 </td>
+
 <td style="text-align:left;">
+
 </td>
+
 </tr>
+
 <tr class="odd">
+
 <td style="text-align:left;">
 
 [`data.table`](https://r-datatable.com)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `data.table`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -314,23 +743,32 @@ check_data(data, rs)
 ```
 
 </td>
+
 <td style="text-align:left;">
+
 </td>
+
 </tr>
+
 <tr class="even">
+
 <td style="text-align:left;">
 
 [`arrow`](https://arrow.apache.org/docs/r/)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `Table`, `ArrowTabular`, `ArrowObject`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -345,23 +783,32 @@ check_data(data, rs)
 ```
 
 </td>
+
 <td style="text-align:left;">
+
 </td>
+
 </tr>
+
 <tr class="odd">
+
 <td style="text-align:left;">
 
 [`arrow`](https://arrow.apache.org/docs/r/)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `FileSystemDataset`, `Dataset`, `ArrowObject`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -371,11 +818,16 @@ check_data(data, rs)
 ```
 
 </td>
+
 <td style="text-align:left;">
+
 Especially handy for large datasets
 </td>
+
 </tr>
+
 <tr class="even">
+
 <td style="text-align:left;">
 
 [`RSQLite`](https://rsqlite.r-dbi.org/),
@@ -383,14 +835,18 @@ Especially handy for large datasets
 [`dbplyr`](https://dbplyr.tidyverse.org/)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `tbl_SQLiteConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -404,6 +860,7 @@ dbDisconnect(con)
 ```
 
 </td>
+
 <td style="text-align:left;">
 
 Note that missing values are converted to `0` when using sqlite by
@@ -411,8 +868,11 @@ default ([c.f. this SO
 answer](https://stackoverflow.com/a/57746647/3048453))
 
 </td>
+
 </tr>
+
 <tr class="odd">
+
 <td style="text-align:left;">
 
 [`duckdb`](https://duckdb.org/docs/api/r.html),
@@ -420,14 +880,18 @@ answer](https://stackoverflow.com/a/57746647/3048453))
 [`dbplyr`](https://dbplyr.tidyverse.org/)
 
 </td>
+
 <td style="text-align:center;">
+
 ✔️
 </td>
+
 <td style="text-align:left;">
 
 `tbl_duckdb_connection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -441,10 +905,15 @@ dbDisconnect(con, shutdown = TRUE)
 ```
 
 </td>
+
 <td style="text-align:left;">
+
 </td>
+
 </tr>
+
 <tr class="even">
+
 <td style="text-align:left;">
 
 [`RPostgres`](https://rpostgres.r-dbi.org/),
@@ -452,14 +921,18 @@ dbDisconnect(con, shutdown = TRUE)
 [`dbplyr`](https://dbplyr.tidyverse.org/)
 
 </td>
+
 <td style="text-align:center;">
+
 ❓
 </td>
+
 <td style="text-align:left;">
 
 `tbl_PqConnection`, `tbl_dbi`, `tbl_sql`, `tbl_lazy`, `tbl`
 
 </td>
+
 <td style="text-align:left;">
 
 ``` r
@@ -476,14 +949,18 @@ dbDisconnect(con)
 ```
 
 </td>
+
 <td style="text-align:left;">
 
 Not tested, but should work out-of-the-box using
 [`DBI`](https://dbi.r-dbi.org/)
 
 </td>
+
 </tr>
+
 </tbody>
+
 </table>
 
 Note that the `rs` object in the example code above refers to a
@@ -509,40 +986,39 @@ below the next block
 ``` r
 library(arrow)
 url <- "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2018-01.parquet"
-file <- "yellow_tripdata_2018-01.parquet"
-if (!file.exists(file)) download.file(url, file, method = "curl")
-file.size(file) / 1e6 # in MB
+pq_file <- "yellow_tripdata_2018-01.parquet"
+if (!file.exists(pq_file)) download.file(url, file, method = "curl")
+file.size(pq_file) / 1e6 # in MB
 #> [1] 123.6685
 
-# quick check of the filesize
-d <- open_dataset(file)
-describe(d)
-#> # A tibble: 19 × 10
-#>    var                  type  n_dis…¹    n_na most_…²     min      mean   median       max        sd
-#>    <chr>                <chr>   <int>   <int> <chr>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
-#>  1 VendorID             inte…       2       0 2 (491…    1      1.5610    2           2     0.49508 
-#>  2 tpep_pickup_datetime POSI… 2311532       0 2018-0…   NA     NA        NA          NA    NA       
-#>  3 tpep_dropoff_dateti… POSI… 2315089       0 2018-0…   NA     NA        NA          NA    NA       
-#>  4 passenger_count      inte…      10       0 1 (624…    0      1.6068    1           9     1.2330  
-#>  5 trip_distance        nume…    4397       0 0.8 (2…    0      2.8040    1.5503 189484.    3.2532  
-#>  6 RatecodeID           inte…       7       0 1 (853…    1      1.0395    1          99     0.31301 
-#>  7 store_and_fwd_flag   char…       2       0 N (872…    1      1         1           1     0       
-#>  8 PULocationID         inte…     259       0 237 (3…    1    164.46    161.86      265    66.520   
-#>  9 DOLocationID         inte…     261       0 236 (3…    1    162.73    162.10      265    75.411   
-#> 10 payment_type         inte…       4       0 1 (610…    1      1.3106    1           4     0.45814 
-#> 11 fare_amount          nume…    1714       0 6 (473… -450     12.244     8.8357   8016     9.9255  
-#> 12 extra                nume…      42       0 0 (474…  -44.69   0.32469   0          60     0.068786
-#> 13 mta_tax              nume…      15       0 0.5 (8…   -0.5    0.49751   0.5        45.49  0.043389
-#> 14 tip_amount           nume…    3397       0 0 (289…  -88.8    1.8188    1.3968    441.71  2.2823  
-#> 15 tolls_amount         nume…     967       0 0 (833…  -15      0.30262   0         950.7   1.1501  
-#> 16 improvement_surchar… nume…       4       0 0.3 (8…   -0.3    0.29963   0.3         1     0.018442
-#> 17 total_amount         nume…   11514       0 7.3 (2… -450.3   15.491    11.321    8016.8  11.984   
-#> 18 congestion_surcharge nume…       2 8760675 NA (87…    2.5    2.5       2.5         2.5  NA       
-#> 19 airport_fee          nume…       2 8760675 NA (87…    0      0         0           0    NA       
-#> # … with abbreviated variable names ¹​n_distinct, ²​most_frequent
+# quick check of the filesize and the structure of the file
+d <- open_dataset(pq_file)
+describe(d, fast = TRUE)
+#> # A tibble: 19 × 11
+#>    var                   type            n n_distinct    n_na most_frequent        min      mean median     max       sd
+#>    <chr>                 <chr>       <int>      <int>   <int> <chr>              <dbl>     <dbl>  <dbl>   <dbl>    <dbl>
+#>  1 VendorID              integer   8760687         NA       0 <NA>             1   e+0   1.56e+0     NA  2   e0  4.96e-1
+#>  2 tpep_pickup_datetime  POSIXct   8760687         NA       0 <NA>             9.79e+8   1.52e+9     NA  1.53e9  1.06e+6
+#>  3 tpep_dropoff_datetime POSIXct   8760687         NA       0 <NA>             9.79e+8   1.52e+9     NA  1.53e9  1.06e+6
+#>  4 passenger_count       integer   8760687         NA       0 <NA>             0         1.61e+0     NA  9   e0  1.26e+0
+#>  5 trip_distance         numeric   8760687         NA       0 <NA>             0         2.80e+0     NA  1.89e5  6.41e+1
+#>  6 RatecodeID            integer   8760687         NA       0 <NA>             1   e+0   1.04e+0     NA  9.9 e1  4.45e-1
+#>  7 store_and_fwd_flag    character 8760687         NA 8760687 <NA>            NA       NaN           NA NA      NA
+#>  8 PULocationID          integer   8760687         NA       0 <NA>             1   e+0   1.64e+2     NA  2.65e2  6.64e+1
+#>  9 DOLocationID          integer   8760687         NA       0 <NA>             1   e+0   1.63e+2     NA  2.65e2  7.03e+1
+#> 10 payment_type          integer   8760687         NA       0 <NA>             1   e+0   1.31e+0     NA  4   e0  4.82e-1
+#> 11 fare_amount           numeric   8760687         NA       0 <NA>            -4.5 e+2   1.22e+1     NA  8.02e3  1.17e+1
+#> 12 extra                 numeric   8760687         NA       0 <NA>            -4.47e+1   3.25e-1     NA  6   e1  4.50e-1
+#> 13 mta_tax               numeric   8760687         NA       0 <NA>            -5   e-1   4.98e-1     NA  4.55e1  4.33e-2
+#> 14 tip_amount            numeric   8760687         NA       0 <NA>            -8.88e+1   1.82e+0     NA  4.42e2  2.49e+0
+#> 15 tolls_amount          numeric   8760687         NA       0 <NA>            -1.5 e+1   3.03e-1     NA  9.51e2  1.74e+0
+#> 16 improvement_surcharge numeric   8760687         NA       0 <NA>            -3   e-1   3.00e-1     NA  1   e0  1.44e-2
+#> 17 total_amount          numeric   8760687         NA       0 <NA>            -4.50e+2   1.55e+1     NA  8.02e3  1.42e+1
+#> 18 congestion_surcharge  numeric   8760687         NA 8760675 <NA>             2.5 e+0   2.5 e+0     NA  2.5 e0  0
+#> 19 airport_fee           numeric   8760687         NA 8760675 <NA>             0         0           NA  0       0
 
 # write the dataset to disk
-write_dataset(d, "nyc-taxi-data")
+if (!dir.exists("nyc-taxi-data")) write_dataset(d, "nyc-taxi-data")
 ```
 
 ### 2 Create Rules in `yaml`
@@ -553,10 +1029,25 @@ saw earlier, we can create the rules in R using the `rule()` and
 option to separate the code from the rules by writing the rules in a
 separate yaml file and reading them into R.
 
-First we display the hand-written contents of the `nyc_data_rules.yaml`
-file.
+First we create and write the rules to a `nyc_data_rules.yaml` file,
+note for larger rulesets, you would most likely write the rules directly
+in the yml file.
+
+``` r
+rs <- ruleset(
+  rule(passenger_count >= 0 & passenger_count <= 10),
+  rule(trip_distance >= 0 & trip_distance <= 1000),
+  rule(payment_type %in% c(0, 1, 2, 3, 4))
+)
+write_rules(rs, "nyc_data_rules.yaml")
+```
+
+Which looks like this in the yaml file:
 
 ``` yaml
+meta: ~
+data-columns: []
+data-rules:
 - name: 'Rule for: passenger_count'
   expr: passenger_count >= 0 & passenger_count <= 10
   allow_na: no
@@ -591,24 +1082,25 @@ Now we can check if the data follows our rules or if we have unexpected
 data points:
 
 ``` r
-# open the dataset 
-ds <- open_dataset("nyc-taxi-data/")
+library(arrow)
+
+# open the dataset
+ds <- open_dataset(pq_file)
 
 # perform the data validation check
 res <- check_data(ds, rules)
 res
-#> # A tibble: 3 × 10
-#>   name                      expr              allow…¹ negate   tests    pass  fail warn  error time 
-#>   <chr>                     <chr>             <lgl>   <lgl>    <int>   <int> <int> <chr> <chr> <drt>
-#> 1 Rule for: passenger_count passenger_count … FALSE   FALSE  8760687 8760687     0 ""    ""    0.42…
-#> 2 Rule for: trip_distance   trip_distance >=… FALSE   FALSE  8760687 8760686     1 ""    ""    0.51…
-#> 3 Rule for: payment_type    payment_type %in… FALSE   FALSE  8760687 8760687     0 ""    ""    0.45…
-#> # … with abbreviated variable name ¹​allow_na
+#> # A tibble: 3 × 11
+#>   check_type name                      expr                                         allow_na negate   tests    pass  fail warn  error time
+#>   <chr>      <chr>                     <chr>                                        <lgl>    <lgl>    <int>   <int> <int> <chr> <chr> <drtn>
+#> 1 row_rule   Rule for: passenger_count passenger_count >= 0 & passenger_count <= 10 FALSE    FALSE  8760687 8760687     0 ""    ""    0.4102955 secs
+#> 2 row_rule   Rule for: trip_distance   trip_distance >= 0 & trip_distance <= 1000   FALSE    FALSE  8760687 8760686     1 ""    ""    0.3988464 secs
+#> 3 row_rule   Rule for: payment_type    payment_type %in% c(0, 1, 2, 3, 4)           FALSE    FALSE  8760687 8760687     0 ""    ""    0.3578835 secs
 
 plot_res(res)
 ```
 
-<img src="man/figures/README-taxi3-1.png" width="100%" />
+<img src="man/figures/README-taxi3-1.png" alt="" width="100%" />
 
 Using the power of `arrow`, we were able to scan 8+mln observations for
 three rules in about 1.5 seconds (YMMV). As we can see from the results,
@@ -618,13 +1110,13 @@ matches
 
 ``` r
 res |>
-  filter_fails(ds) |> 
+  filter_fails(ds) |>
   # only select a couple of variables for brevity
   dplyr::select(tpep_pickup_datetime, tpep_dropoff_datetime, trip_distance)
 #> # A tibble: 1 × 3
 #>   tpep_pickup_datetime tpep_dropoff_datetime trip_distance
 #>   <dttm>               <dttm>                        <dbl>
-#> 1 2018-01-30 12:41:02  2018-01-30 12:42:09         189484.
+#> 1 2018-01-30 11:41:02  2018-01-30 11:42:09         189484.
 ```
 
 As we can see, this is probably a data error (a trip distance of 190k
@@ -641,55 +1133,61 @@ library(dplyr)
 
 # connect to a duckdb database
 con <- dbConnect(duckdb::duckdb("duckdb-database.duckdb"))
-# for demo purposes write the data once
-dbWriteTable(con, "mtcars", mtcars)
+# for demo purposes write sample_data once
+dbWriteTable(con, "orders", sample_data)
 
 # create a tbl connection, which can be used in the checks
-tbl <- tbl(con, "mtcars")
+tbl <- tbl(con, "orders")
 
 # create rules
 rules <- ruleset(
-  rule(mpg > 10 & mpg < 30),
-  rule(cyl %in% c(4, 8)),
-  rule(vs %in% c(0, 1), allow_na = TRUE)
+  rule(amount >= 0 & amount <= 10000, name = "amount in valid range"),
+  rule(customer_tier %in% c("bronze", "silver", "gold"), name = "known customer tier"),
+  rule(!paid | payment_method != "none", name = "paid orders need payment method")
 )
 
 # check rules
 res <- check_data(tbl, rules)
 res
-#> # A tibble: 3 × 10
-#>   name          expr                allow_na negate tests  pass  fail warn  error time          
-#>   <chr>         <chr>               <lgl>    <lgl>  <dbl> <dbl> <dbl> <chr> <chr> <drtn>        
-#> 1 Rule for: mpg mpg > 10 & mpg < 30 FALSE    FALSE     32    28     4 ""    ""    4.4900761 secs
-#> 2 Rule for: cyl cyl %in% c(4, 8)    FALSE    FALSE     32    25     7 ""    ""    0.1926301 secs
-#> 3 Rule for: vs  vs %in% c(0, 1)     TRUE     FALSE     32    32     0 ""    ""    0.2003391 secs
 
 filter_fails(res, tbl, per_rule = TRUE)
-#> $`mpg > 10 & mpg < 30`
-#> # A tibble: 4 × 11
-#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
-#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1  32.4     4  78.7    66  4.08 2.2   19.47     1     1     4     1
-#> 2  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
-#> 3  33.9     4  71.1    65  4.22 1.835 19.9      1     1     4     1
-#> 4  30.4     4  95.1   113  3.77 1.513 16.9      1     1     5     2
-#> 
-#> $`cyl %in% c(4, 8)`
-#> # A tibble: 7 × 11
-#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
-#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1  21       6 160     110  3.9  2.62  16.46     0     1     4     4
-#> 2  21       6 160     110  3.9  2.875 17.02     0     1     4     4
-#> 3  21.4     6 258     110  3.08 3.215 19.44     1     0     3     1
-#> 4  18.1     6 225     105  2.76 3.46  20.22     1     0     3     1
-#> 5  19.2     6 167.6   123  3.92 3.44  18.3      1     0     4     4
-#> 6  17.8     6 167.6   123  3.92 3.44  18.9      1     0     4     4
-#> 7  19.7     6 145     175  3.62 2.77  15.5      0     1     5     6
 
 # lastly disconnect from the database again
 dbDisconnect(con, shutdown = TRUE)
 ```
 
+## Pre Package Version 1.0 YAML (Compatibility)
+
+`dataverifyr` still supports the pre package version 1.0 flat-list YAML
+format for compatibility with existing rule files.
+
+``` r
+write_rules(rules, "example_rules_pre_v1.yaml", format = "pre_v1")
+rules_pre_v1 <- read_rules("example_rules_pre_v1.yaml")
+identical(rules, rules_pre_v1)
+#> [1] FALSE
+```
+
+Pre package version 1.0 YAML example:
+
+``` yaml
+- name: 'Rule for: passenger_count'
+  expr: passenger_count >= 0 & passenger_count <= 10
+  allow_na: no
+  negate: no
+  index: 1
+- name: 'Rule for: trip_distance'
+  expr: trip_distance >= 0 & trip_distance <= 1000
+  allow_na: no
+  negate: no
+  index: 2
+- name: 'Rule for: payment_type'
+  expr: payment_type %in% c(0, 1, 2, 3, 4)
+  allow_na: no
+  negate: no
+  index: 3
+```
+
 # Alternative Data Validation R Libraries
 
 If this library is not what you are looking for, the following might be
diff --git a/cran-comments.md b/cran-comments.md
index ded2011..0354354 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -1 +1,3 @@
-Fix broken URL link
+Implement fix for upcoming duckdb release (1.5.2) that would break tests.
+Add more functionality for describing a dataset and column-level checks.
+See News.md for a full set of changes.
diff --git a/man/check_data.Rd b/man/check_data.Rd
index f58cf97..25fb670 100644
--- a/man/check_data.Rd
+++ b/man/check_data.Rd
@@ -10,12 +10,15 @@ check_data(
   xname = deparse(substitute(x)),
   stop_on_fail = FALSE,
   stop_on_warn = FALSE,
-  stop_on_error = FALSE
+  stop_on_error = FALSE,
+  stop_on_schema_fail = FALSE,
+  extra_columns = c("ignore", "warn", "fail")
 )
 }
 \arguments{
 \item{x}{a dataset, either a \code{\link{data.frame}}, \code{\link[dplyr:reexports]{dplyr::tibble}}, \code{\link[data.table:data.table]{data.table::data.table}},
-\code{\link[arrow:Table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection)}
+\code{\link[arrow:table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection).
+Can also be a named list of datasets when using reference rules.}
 
 \item{rules}{a list of \code{\link{rule}}s}
 
@@ -26,6 +29,12 @@ check_data(
 \item{stop_on_warn}{when a warning is found in the code execution, throw an error with stop}
 
 \item{stop_on_error}{when an error is found in the code execution, throw an error with stop}
+
+\item{stop_on_schema_fail}{when any schema checks fail, throw an error with stop}
+
+\item{extra_columns}{how to treat columns in \code{x} that are not declared in
+optional \code{data_columns} attached to a ruleset. One of \code{"ignore"} (default),
+\code{"warn"}, or \code{"fail"}.}
 }
 \value{
 a data.frame-like object with one row for each rule and its results
@@ -42,6 +51,28 @@ rs <- ruleset(
 rs
 
 check_data(mtcars, rs)
+
+# schema + relation checks in one output
+orders <- data.frame(order_id = 1:3, customer_id = c(10, 99, NA), amount = c(10, -5, 20))
+customers <- data.frame(customer_id = c(10, 11))
+
+rs2 <- ruleset(
+  rule(amount >= 0, name = "amount non-negative"),
+  reference_rule(
+    local_col = "customer_id",
+    ref_dataset = "customers",
+    ref_col = "customer_id",
+    allow_na = TRUE
+  ),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE),
+    data_column("customer_id", type = "double", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE)
+  ),
+  data_name = "orders"
+)
+
+check_data(list(orders = orders, customers = customers), rs2)
 }
 \seealso{
 \code{\link[=detect_backend]{detect_backend()}}
diff --git a/man/data_column.Rd b/man/data_column.Rd
new file mode 100644
index 0000000..6ef7d62
--- /dev/null
+++ b/man/data_column.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_column.R
+\name{data_column}
+\alias{data_column}
+\title{Define a Column Specification for Schema Checks}
+\usage{
+data_column(
+  col,
+  type = NA_character_,
+  optional = FALSE,
+  description = NA_character_
+)
+}
+\arguments{
+\item{col}{column name.}
+
+\item{type}{optional declared type (for example \code{"int"}, \code{"double"},
+\code{"str"}, \code{"logical"}). Use \code{NA_character_} for no type declaration.}
+
+\item{optional}{logical; if \code{FALSE}, the column is required.}
+
+\item{description}{optional free-text description.}
+}
+\value{
+A \code{data_column} object (list) that can be passed in
+\code{ruleset(..., data_columns = list(...))}.
+}
+\description{
+Creates a single column declaration used in \code{ruleset(..., data_columns = ...)}.
+Column declarations are schema checks (column existence, optionality, and
+declared type), whereas \code{\link[=rule]{rule()}} is for row-wise value checks.
+}
+\examples{
+rs <- ruleset(
+  rule(price >= 0),
+  data_columns = list(
+    data_column("price", type = "double", optional = FALSE),
+    data_column("note", type = "str", optional = TRUE)
+  )
+)
+rs
+
+# combined with row rules and strict schema stopping
+order_rules <- ruleset(
+  rule(price >= 0, allow_na = FALSE),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE),
+    data_column("price", type = "double", optional = FALSE),
+    data_column("note", type = "str", optional = TRUE)
+  )
+)
+
+check_data(
+  data.frame(order_id = 1:3, price = c(10, 20, 30), note = c("ok", NA, "ok")),
+  order_rules,
+  stop_on_schema_fail = TRUE
+)
+}
diff --git a/man/describe.Rd b/man/describe.Rd
index 68ba49f..01f2381 100644
--- a/man/describe.Rd
+++ b/man/describe.Rd
@@ -4,11 +4,22 @@
 \alias{describe}
 \title{Describes a dataset}
 \usage{
-describe(x)
+describe(x, skip_ones = TRUE, digits = 4, top_n = 3, fast = FALSE)
 }
 \arguments{
 \item{x}{a dataset, either a \code{\link{data.frame}}, \code{\link[dplyr:reexports]{dplyr::tibble}}, \code{\link[data.table:data.table]{data.table::data.table}},
-\code{\link[arrow:Table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection)}
+\code{\link[arrow:table]{arrow::arrow_table}}, \code{\link[arrow:open_dataset]{arrow::open_dataset}}, or \code{\link[dplyr:tbl]{dplyr::tbl}} (SQL connection)}
+
+\item{skip_ones}{logical, whether values that occur exactly once should be omitted
+from \code{most_frequent}}
+
+\item{digits}{integer, number of digits to round numeric values in \code{most_frequent}}
+
+\item{top_n}{integer, number of most frequent values to include in \code{most_frequent};
+set to \code{0} to skip the \code{most_frequent} computation}
+
+\item{fast}{logical, when \code{TRUE} skip expensive fields (\code{n_distinct}, \code{median})
+by returning \code{NA} for them}
 }
 \value{
 a \code{data.frame}, \code{dplyr::tibble}, or \code{data.table::data.table} containing
@@ -19,11 +30,17 @@ Note that the current version is in the beta stadium at best, that means the
 R-native formats (data.frame, dplyr/tibble, or data.table) are a lot faster
 than arrow or SQL-based datasets.
 }
+\details{
+Numeric values in \code{most_frequent} are rounded to \code{digits} (default: 4).
+If a variable has at most 1 distinct value, \code{most_frequent} is left empty.
+By default, values with count 1 are omitted from \code{most_frequent}.
+}
 \examples{
 describe(mtcars)
 }
 \seealso{
-Similar to \href{https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html}{skimr::skim()},
+Similar to
+\href{https://cran.r-project.org/web/packages/skimr/vignettes/skimr.html}{skimr::skim()},
 \href{https://cran.r-project.org/web/packages/summarytools/vignettes/introduction.html#data-frame-summaries-dfsummary}{summarytools::dfSummary()},
 and \href{https://jthomasmock.github.io/gtExtras/reference/gt_plt_summary.html}{gtExtras::gt_plt_summary()}
 }
diff --git a/man/detect_backend.Rd b/man/detect_backend.Rd
index 42d61d6..13e1b04 100644
--- a/man/detect_backend.Rd
+++ b/man/detect_backend.Rd
@@ -15,7 +15,8 @@ One of \code{base-r}, \code{data.table}, \code{dplyr}, \code{collectibles} (for
 }
 \description{
 The detection will be made based on the class of the object as well as the packages installed.
-For example, if a \code{data.frame} is used, it will look if \code{data.table} or \code{dplyr} are installed on the system, as they provide more speed.
+For example, if a \code{data.frame} is used, it will look if \code{data.table} or \code{dplyr} are installed
+on the system, as they provide more speed.
 Note the main functions will revert the
 }
 \examples{
diff --git a/man/figures/README-plotres-1.png b/man/figures/README-plotres-1.png
index dfb15ec..9bb2e4f 100644
Binary files a/man/figures/README-plotres-1.png and b/man/figures/README-plotres-1.png differ
diff --git a/man/figures/README-taxi3-1.png b/man/figures/README-taxi3-1.png
index 3db4b71..a731ad8 100644
Binary files a/man/figures/README-taxi3-1.png and b/man/figures/README-taxi3-1.png differ
diff --git a/man/reference_rule.Rd b/man/reference_rule.Rd
new file mode 100644
index 0000000..70a7a46
--- /dev/null
+++ b/man/reference_rule.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_column.R
+\name{reference_rule}
+\alias{reference_rule}
+\title{Define a Relational Reference Rule}
+\usage{
+reference_rule(
+  local_col,
+  ref_dataset,
+  ref_col,
+  name = NA,
+  allow_na = FALSE,
+  negate = FALSE,
+  ...
+)
+}
+\arguments{
+\item{local_col}{column name in the primary dataset.}
+
+\item{ref_dataset}{name of the referenced dataset in the \code{x} list.}
+
+\item{ref_col}{column name in the referenced dataset.}
+
+\item{name}{optional display name for the rule.}
+
+\item{allow_na}{logical; if \code{TRUE}, missing values in \code{local_col} are treated
+as passing.}
+
+\item{negate}{logical; if \code{TRUE}, inverts the rule (values must \emph{not} be in
+the referenced column).}
+
+\item{...}{additional fields attached to the rule object.}
+}
+\value{
+A \code{reference_rule} object that can be included in \code{ruleset()}.
+}
+\description{
+Creates a rule that checks whether values in a local column exist in a
+column of a referenced dataset. Use with \code{check_data()} by supplying \code{x} as
+a named list of datasets and setting \code{data_name} in \code{ruleset()} (or by
+ordering the list so the first entry is the primary dataset).
+}
+\examples{
+flights <- data.frame(carrier = c("AA", "BB", NA_character_))
+carriers <- data.frame(carrier_id = c("AA"))
+
+rs <- ruleset(
+  reference_rule(
+    local_col = "carrier",
+    ref_dataset = "carriers",
+    ref_col = "carrier_id",
+    allow_na = TRUE
+  ),
+  data_name = "flights"
+)
+
+check_data(list(flights = flights, carriers = carriers), rs)
+
+# negated relation: value must NOT exist in blacklist
+blacklist <- data.frame(carrier_id = c("XX", "YY"))
+rs_neg <- ruleset(
+  reference_rule(
+    local_col = "carrier",
+    ref_dataset = "blacklist",
+    ref_col = "carrier_id",
+    negate = TRUE,
+    allow_na = TRUE
+  ),
+  data_name = "flights"
+)
+
+check_data(list(flights = flights, blacklist = blacklist), rs_neg)
+}
diff --git a/man/ruleset.Rd b/man/ruleset.Rd
index 80e8fb7..d73e65e 100644
--- a/man/ruleset.Rd
+++ b/man/ruleset.Rd
@@ -5,13 +5,21 @@
 \alias{print.ruleset}
 \title{Creates a set of rules}
 \usage{
-ruleset(...)
+ruleset(..., data_columns = NULL, meta = NULL, data_name = NULL)
 
 \method{print}{ruleset}(x, n = 3, ...)
 }
 \arguments{
 \item{...}{a list of rules}
 
+\item{data_columns}{optional list of schema declarations created with
+internal \code{data_column()} helper.}
+
+\item{meta}{optional metadata list for v1 YAML workflows.}
+
+\item{data_name}{optional name of the primary dataset when \code{check_data()}
+receives a named list of datasets.}
+
 \item{x}{a ruleset to print}
 
 \item{n}{a maximum number of rules to print}
@@ -38,4 +46,26 @@ rs <- ruleset(
   rule(is.numeric(disp))
 )
 rs
+
+# combine row, schema, and relational checks
+orders <- data.frame(order_id = 1:4, customer_id = c(10, 11, 99, NA), amount = c(10, 20, -5, 30))
+customers <- data.frame(customer_id = c(10, 11, 12))
+
+rs2 <- ruleset(
+  rule(amount >= 0, name = "amount must be non-negative"),
+  reference_rule(
+    local_col = "customer_id",
+    ref_dataset = "customers",
+    ref_col = "customer_id",
+    allow_na = TRUE
+  ),
+  data_columns = list(
+    data_column("order_id", type = "int", optional = FALSE),
+    data_column("customer_id", type = "int", optional = FALSE),
+    data_column("amount", type = "double", optional = FALSE)
+  ),
+  data_name = "orders"
+)
+
+check_data(list(orders = orders, customers = customers), rs2)
 }
diff --git a/man/sample_data.Rd b/man/sample_data.Rd
new file mode 100644
index 0000000..d7adc46
--- /dev/null
+++ b/man/sample_data.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sample_data.R
+\docType{data}
+\name{sample_data}
+\alias{sample_data}
+\title{Sample Orders Dataset for Examples and Tests}
+\format{
+A data frame with 8 rows and 6 variables:
+\describe{
+\item{order_id}{Integer order identifier.}
+\item{customer_tier}{Character tier (\code{"bronze"}, \code{"silver"}, \code{"gold"}, etc),
+includes one \code{NA}.}
+\item{amount}{Numeric order amount, includes one negative value and one \code{NA}.}
+\item{paid}{Logical payment flag, includes one \code{NA}.}
+\item{payment_method}{Character payment method, includes one \code{NA}.}
+\item{order_time}{\code{POSIXct} order timestamp in UTC, includes one \code{NA}.}
+}
+}
+\usage{
+sample_data
+}
+\description{
+A small, human-readable dataset with mixed column types, missing values, and
+one datetime column. It is designed for documentation examples and unit tests.
+}
+\examples{
+sample_data
+}
+\keyword{datasets}
diff --git a/man/write_rules.Rd b/man/write_rules.Rd
index ea4d934..5a1166c 100644
--- a/man/write_rules.Rd
+++ b/man/write_rules.Rd
@@ -5,7 +5,7 @@
 \alias{read_rules}
 \title{Read and write rules to a yaml file}
 \usage{
-write_rules(x, file)
+write_rules(x, file, format = c("v1", "pre_v1"))
 
 read_rules(file)
 }
@@ -13,6 +13,10 @@ read_rules(file)
 \item{x}{a list of rules}
 
 \item{file}{a filename}
+
+\item{format}{output format. \code{"v1"} writes structured YAML with \code{meta},
+\code{data-columns}, and \code{data-rules}. \code{"pre_v1"} keeps the pre package version 1.0
+flat-list structure.}
 }
 \value{
 the filename invisibly
diff --git a/tests/testthat/test-check_data.R b/tests/testthat/test-check_data.R
index 2cbe867..1c2dd95 100644
--- a/tests/testthat/test-check_data.R
+++ b/tests/testthat/test-check_data.R
@@ -13,11 +13,13 @@ rules <- ruleset(
   rule(does_not_exist %in% c("a", "b", "c"), "r5") # creates a stop
 )
 
+
 test_that("base-r check_ works", {
   res <- check_(data, rules, backend = "base-r")
 
   expect_equal(class(res), "data.frame")
   exp <- data.frame(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -40,6 +42,7 @@ test_that("dplyr check_ works", {
 
   expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
   exp <- dplyr::tibble(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -62,6 +65,7 @@ test_that("data.table check_ works", {
 
   expect_equal(class(res), c("data.table", "data.frame"))
   exp <- data.table::data.table(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -89,6 +93,7 @@ test_that("arrow::arrow_table check_ works", {
 
   expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
   exp <- dplyr::tibble(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -97,7 +102,8 @@ test_that("arrow::arrow_table check_ works", {
     pass = c(32, 32, 27, 0, 0),
     fail = c(0, 0, 5, 32, 32),
     warn = c("", "", "", "", ""),
-    error = c("", "", "", "Invalid: Failed to parse string: 'asd' as a scalar of type double", "object 'does_not_exist' not found")
+    error = c("", "", "", "Invalid: Failed to parse string: 'asd' as a scalar of type double",
+              "object 'does_not_exist' not found")
   )
   expect_equal(res |> dplyr::select(-time), exp)
 })
@@ -115,6 +121,7 @@ test_that("arrow::open_dataset check_ works", {
 
   expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
   exp <- dplyr::tibble(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -123,7 +130,8 @@ test_that("arrow::open_dataset check_ works", {
     pass = c(32, 32, 27, 0, 0),
     fail = c(0, 0, 5, 32, 32),
     warn = c("", "", "", "", ""),
-    error = c("", "", "", "Invalid: Failed to parse string: 'asd' as a scalar of type double", "object 'does_not_exist' not found")
+    error = c("", "", "", "Invalid: Failed to parse string: 'asd' as a scalar of type double",
+              "object 'does_not_exist' not found")
   )
   expect_equal(res |> dplyr::select(-time), exp)
 })
@@ -145,6 +153,7 @@ test_that("sqlite (RSQLite) check_ works", {
 
   expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
   exp <- dplyr::tibble(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
@@ -164,6 +173,7 @@ test_that("sqlite (RSQLite) check_ works", {
 
 
 test_that("duckdb check_ works", {
+  skip_if_not_installed("duckdb", "1.5.1.9002")
   skip_if_not(requireNamespace("DBI", quietly = TRUE) |
                 requireNamespace("dbplyr", quietly = TRUE) |
                 requireNamespace("duckdb", quietly = TRUE),
@@ -179,6 +189,7 @@ test_that("duckdb check_ works", {
 
   expect_equal(class(res), c("tbl_df", "tbl", "data.frame"))
   exp <- dplyr::tibble(
+    check_type = rep("row_rule", 5),
     name = c("r1", "r2", "r3", "r4", "r5"),
     expr = vapply(rules, function(r) r$expr, character(1)),
     allow_na = c(FALSE, FALSE, TRUE, FALSE, FALSE),
diff --git a/tests/testthat/test-describe.R b/tests/testthat/test-describe.R
index fb6f382..d27292c 100644
--- a/tests/testthat/test-describe.R
+++ b/tests/testthat/test-describe.R
@@ -1,4 +1,3 @@
-
 # create a sample dataset to describe
 set.seed(41)
 data <- data.frame(
@@ -15,119 +14,356 @@ data <- data.frame(
   # factor
   g = factor(sample(letters, 20, replace = TRUE)),
   # posixct
-  h = as.POSIXct(sample.int(365*60*60*24, 10, replace = TRUE), origin = "2020-01-01")
-)
-for (n in names(data))
-  data[sample.int(nrow(data), 3), n] <- NA
-
-# the expected values
-exp <- data.frame(
-  var = c("a", "b", "c", "d", "e", "f", "g", "h"),
-  type = c("numeric", "numeric", "integer", "integer", "character",
-           "character", "factor", "POSIXct"),
-  n = c(20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L),
-  n_distinct = c(18L, 18L, 18L, 8L, 14L, 15L, 13L, 11L),
-  n_na = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L),
-  most_frequent = c("NA (3), 1 (1), 2 (1)",
-                    "NA (3), -0.794368337075183 (1), 0.197257539137862 (1)",
-                    "NA (3), 1 (1), 2 (1)", "16 (4), NA (3), 13 (3)",
-                    "u (3), NA (3), r (2)", "NA (3), ggg (3), x (2)",
-                    "g (3), NA (3), b (2)",
-                    "NA (3), 2020-11-16 01:27:37 (2), 2020-08-10 23:06:51 (2)"),
-  min = c(1, -1.57960700723589, 1, 7, 1, 1, 1, 1578213277),
-  mean = c(9.82352941176471, 0.395917134682771, 9.70588235294118,
-           13.4705882352941, 1, 2.11764705882353, 1, 1597372702.64706),
-  median = c(10, 0.493667470676325, 10, 13, 1, 2, 1, 1601657240),
-  max = c(20, 2.27440247098253, 20, 18, 1, 3, 1, 1606208141),
-  sd = c(5.92911559717854, 1.08100890056568, 5.7961701351232,
-         3.6420743927538, 0, 0.857492925712544, 0, 9824874.35812236)
+  h = as.POSIXct(
+    sample.int(365 * 60 * 60 * 24, 10, replace = TRUE),
+    origin = "2020-01-01",
+    tz = "UTC"
+  )
 )
 
+for (n in names(data)) data[sample.int(nrow(data), 3), n] <- NA
+
+normalize_mf <- function(x) {
+  vapply(strsplit(x, ", ", fixed = TRUE), function(parts) {
+    paste(sort(parts), collapse = ", ")
+  }, character(1))
+}
+
+
 test_that("describe data.frame", {
   local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
   pkgs <- NULL
+
   expect_equal(detect_backend(data), "base-r")
-  d <- describe(data)
+  d <- describe(data, skip_ones = FALSE)
   expect_equal(class(d), "data.frame")
-  expect_equal(d, exp)
+  exp <- d
 
-  skip_if_not(requireNamespace("dplyr", quietly = TRUE) |
-                requireNamespace("data.table", quietly = TRUE),
-              "dplyr and data.table must be installed to run these tests")
+  skip_if_not(
+    requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("data.table", quietly = TRUE),
+    "dplyr and data.table must be installed to run these tests"
+  )
 
   # use data.table
   pkgs <- "data.table"
   expect_equal(detect_backend(data), "data.table")
-  d <- describe(data)
+  d <- describe(data, skip_ones = FALSE)
   expect_equal(class(d), c("data.table", "data.frame"))
-  expect_equal(as.data.frame(d), exp)
+  expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
+               exp[setdiff(names(exp), "most_frequent")])
+  expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))
 
   # use dplyr
   pkgs <- "dplyr"
   expect_equal(detect_backend(data), "dplyr")
-  d <- describe(data)
+  d <- describe(data, skip_ones = FALSE)
   expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
-  expect_equal(as.data.frame(d), exp)
-  # error here as dplyr backend sorts the data slightly differently...
+  expect_equal(as.data.frame(d)[setdiff(names(d), "most_frequent")],
+               exp[setdiff(names(exp), "most_frequent")])
+  expect_equal(normalize_mf(as.data.frame(d)$most_frequent), normalize_mf(exp$most_frequent))
 })
 
-test_that("sqlite", {
-  skip_if_not(requireNamespace("DBI", quietly = TRUE) |
-                requireNamespace("dplyr", quietly = TRUE) |
-                requireNamespace("dbplyr", quietly = TRUE) |
-                requireNamespace("RSQLite", quietly = TRUE),
-              "DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality")
+
+test_that("describe sqlite", {
+  skip_if_not(
+    requireNamespace("DBI", quietly = TRUE) &&
+      requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("RSQLite", quietly = TRUE),
+    "DBI, dplyr, dbplyr, and RSQLite must be installed to test the functionality"
+  )
 
   con <- DBI::dbConnect(RSQLite::SQLite())
   DBI::dbWriteTable(con, "data", data)
   x <- dplyr::tbl(con, "data")
   expect_equal(detect_backend(x), "collectibles")
 
-  d <- describe(x)
-  # this takes waaaaay too long....
+  d <- describe(x, skip_ones = FALSE)
 
   expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
-  expect_equal(as.data.frame(d), exp)
-  # error here: different order and missing last row due to posixct
+  expect_equal(nrow(d), ncol(data))
+  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
 
   DBI::dbDisconnect(con)
 })
 
-test_that("duckdb", {
-  skip_if_not(requireNamespace("DBI", quietly = TRUE) |
-                requireNamespace("dplyr", quietly = TRUE) |
-                requireNamespace("dbplyr", quietly = TRUE) |
-                requireNamespace("duckdb", quietly = TRUE),
-              "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality")
+
+test_that("describe duckdb", {
+  skip_if_not_installed("duckdb", "1.5.1.9002")
+  skip_if_not(
+    requireNamespace("DBI", quietly = TRUE) &&
+      requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("duckdb", quietly = TRUE),
+    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
+  )
 
   con <- DBI::dbConnect(duckdb::duckdb())
   DBI::dbWriteTable(con, "data", data)
   x <- dplyr::tbl(con, "data")
   expect_equal(detect_backend(x), "collectibles")
 
-  d <- describe(x)
-  # this takes waaaaay too long....
+  d <- describe(x, skip_ones = FALSE)
 
   expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
-  expect_equal(as.data.frame(d), exp)
-  # error here: different order and missing last row due to posixct
+  expect_equal(nrow(d), ncol(data))
+  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
   DBI::dbDisconnect(con)
 
 })
-test_that("arrow", {
-  skip_if_not(requireNamespace("dbplyr", quietly = TRUE) |
-                requireNamespace("arrow", quietly = TRUE),
-              "dplyr and arrow must be installed to test the functionality")
+
+
+test_that("describe duckdb supports skip_ones/digits without full-vector semantics changes", {
+  skip_if_not_installed("duckdb", "1.5.1.9002")
+  skip_if_not(
+    requireNamespace("DBI", quietly = TRUE) &&
+      requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("duckdb", quietly = TRUE),
+    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
+  )
+
+  x <- data.frame(
+    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
+    chr = c("a", "a", "b", "c", NA_character_)
+  )
+
+  con <- DBI::dbConnect(duckdb::duckdb())
+  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
+  DBI::dbWriteTable(con, "data_mf", x, overwrite = TRUE)
+  tbl <- dplyr::tbl(con, "data_mf")
+
+  d <- describe(tbl, skip_ones = FALSE, digits = 2)
+  expect_equal(normalize_mf(d$most_frequent[d$var == "num"]), "1.23 (2), 9.88 (2), NA (1)")
+  expect_match(d$most_frequent[d$var == "chr"], "^a \\(2\\)")
+
+  d2 <- describe(tbl, skip_ones = TRUE, digits = 2)
+  expect_equal(normalize_mf(d2$most_frequent[d2$var == "num"]), "1.23 (2), 9.88 (2)")
+  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
+})
+
+
+test_that("describe arrow", {
+  skip_if_not(
+    requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("arrow", quietly = TRUE),
+    "dplyr and arrow must be installed to test the functionality"
+  )
 
   tmp <- tempfile()
   arrow::write_parquet(data, tmp)
   x <- arrow::open_dataset(tmp)
   expect_equal(detect_backend(x), "collectibles")
 
+  d <- describe(x, skip_ones = FALSE)
+
+  expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
+  expect_equal(nrow(d), ncol(data))
+  expect_true(all(c("var", "type", "n", "n_distinct", "n_na", "most_frequent") %in% names(d)))
+})
+
+test_that("describe_collectibles_stats works on arrow numeric columns in fast mode", {
+  skip_if_not(
+    requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("arrow", quietly = TRUE),
+    "dplyr, dbplyr, and arrow must be installed to test the functionality"
+  )
+
+  x <- data.frame(num = c(1, 2, NA_real_, 4))
+  tmp <- tempfile()
+  arrow::write_parquet(x, tmp)
+  ds <- arrow::open_dataset(tmp)
+
+  stats <- dataverifyr:::describe_collectibles_stats(ds, "num", is_num = TRUE, fast = TRUE)
+
+  expect_equal(stats$n, 4L)
+  expect_equal(stats$n_na, 1L)
+  expect_equal(stats$min, 1)
+  expect_equal(stats$max, 4)
+  expect_equal(round(stats$mean, 6), round(mean(x$num, na.rm = TRUE), 6))
+  expect_equal(round(stats$sd, 6), round(stats::sd(x$num, na.rm = TRUE), 6))
+})
+
+test_that("describe arrow fast mode handles POSIXct columns", {
+  skip_if_not(
+    requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("arrow", quietly = TRUE),
+    "dplyr, dbplyr, and arrow must be installed to test the functionality"
+  )
+
+  x <- data.frame(
+    ts = as.POSIXct(c("2020-01-01 00:00:00", "2020-01-01 01:00:00", NA), tz = "UTC"),
+    val = c(1, 2, 3)
+  )
+  tmp <- tempfile()
+  arrow::write_parquet(x, tmp)
+  ds <- arrow::open_dataset(tmp)
+
+  d <- describe(ds, fast = TRUE, skip_ones = FALSE)
+  ts_row <- d[d$var == "ts", , drop = FALSE]
+
+  expect_true(nrow(ts_row) == 1)
+  expect_false(is.na(ts_row$mean))
+  expect_false(is.na(ts_row$sd))
+})
+
+
+test_that("describe most_frequent rounds numeric values and suppresses <=1 distinct", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    one_value = rep(3.1415926535, 5),
+    all_na = rep(NA_real_, 5),
+    mixed = c(1.23456789, 1.23456789, 9.87654321, NA_real_, 9.87654321)
+  )
+
+  d <- suppressWarnings(describe(x, skip_ones = FALSE))
+
+  expect_equal(d$most_frequent[d$var == "one_value"], "")
+  expect_equal(d$most_frequent[d$var == "all_na"], "")
+  expect_equal(
+    d$most_frequent[d$var == "mixed"],
+    "1.2346 (2), 9.8765 (2), NA (1)"
+  )
+})
+
+
+test_that("describe skips one-count values in most_frequent by default", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    mostly_unique = c(1, 2, 3, 4, 5),
+    repeated = c(1, 1, 2, 3, NA_real_)
+  )
+
   d <- describe(x)
-  # this takes waaaaay too long....
+  expect_equal(d$most_frequent[d$var == "mostly_unique"], "")
+  expect_equal(d$most_frequent[d$var == "repeated"], "1 (2)")
+
+  d_keep_ones <- describe(x, skip_ones = FALSE)
+  expect_equal(d_keep_ones$most_frequent[d_keep_ones$var == "repeated"], "1 (2), 2 (1), 3 (1)")
+})
+
+
+test_that("describe supports configurable digits for most_frequent numeric rounding", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    mixed = c(1.23456789, 1.23456789, 9.87654321, 9.87654321, NA_real_)
+  )
+
+  d <- describe(x, skip_ones = FALSE, digits = 2)
+  expect_equal(d$most_frequent[d$var == "mixed"], "1.23 (2), 9.88 (2), NA (1)")
+})
+
+test_that("describe supports configurable top_n for most_frequent", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    vals = c("a", "a", "b", "b", "c", "d")
+  )
+
+  d3 <- describe(x, skip_ones = FALSE, top_n = 3)
+  expect_equal(d3$most_frequent[d3$var == "vals"], "a (2), b (2), c (1)")
+
+  d2 <- describe(x, skip_ones = FALSE, top_n = 2)
+  expect_equal(d2$most_frequent[d2$var == "vals"], "a (2), b (2)")
+})
+
+test_that("describe supports top_n = 0 to skip most_frequent values", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    vals = c("a", "a", "b", "b", "c", "d")
+  )
+
+  d0 <- describe(x, skip_ones = FALSE, top_n = 0)
+  expect_equal(d0$most_frequent[d0$var == "vals"], "")
+})
+
+test_that("describe fast mode skips expensive fields", {
+  local_mocked_bindings(has_pkg = function(...) FALSE, .package = "dataverifyr")
+
+  x <- data.frame(
+    num = c(1, 1, 2, 3, NA_real_),
+    chr = c("a", "a", "b", "c", NA_character_)
+  )
+
+  d <- describe(x, fast = TRUE, skip_ones = FALSE, top_n = 2)
+  expect_true(all(is.na(d$n_distinct)))
+  expect_true(all(is.na(d$median)))
+  expect_true(all(is.na(d$most_frequent)))
+  expect_false(all(is.na(d$mean)))
+})
+
+test_that("describe duckdb fast mode skips expensive fields", {
+  skip_if_not_installed("duckdb", "1.5.1.9002")
+  skip_if_not(
+    requireNamespace("DBI", quietly = TRUE) &&
+      requireNamespace("dplyr", quietly = TRUE) &&
+      requireNamespace("dbplyr", quietly = TRUE) &&
+      requireNamespace("duckdb", quietly = TRUE),
+    "DBI, dplyr, dbplyr, and duckdb must be installed to test the functionality"
+  )
+
+  x <- data.frame(
+    num = c(1, 1, 2, 3, NA_real_),
+    chr = c("a", "a", "b", "c", NA_character_)
+  )
+
+  con <- DBI::dbConnect(duckdb::duckdb())
+  on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE)
+  DBI::dbWriteTable(con, "data_fast", x, overwrite = TRUE)
+  tbl <- dplyr::tbl(con, "data_fast")
+
+  d <- describe(tbl, fast = TRUE, skip_ones = FALSE, top_n = 2)
+  expect_true(all(is.na(d$n_distinct)))
+  expect_true(all(is.na(d$median)))
+  expect_true(all(is.na(d$most_frequent)))
+  expect_false(all(is.na(d$mean)))
+})
+
+test_that("describe dplyr backend supports skip_ones and digits", {
+  skip_if_not(requireNamespace("dplyr", quietly = TRUE),
+              "dplyr must be installed to run this test")
+  local_mocked_bindings(has_pkg = function(p) p == "dplyr", .package = "dataverifyr")
+
+  x <- data.frame(
+    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
+    chr = c("a", "a", "b", "c", NA_character_)
+  )
 
+  d <- describe(x, skip_ones = FALSE, digits = 2)
+  expect_equal(detect_backend(x), "dplyr")
   expect_equal(class(d), c("tbl_df", "tbl", "data.frame"))
-  expect_equal(as.data.frame(d), exp)
-  # error here: different order and missing last row due to posixct
+  expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
+  expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")
+
+  d2 <- describe(x, skip_ones = TRUE, digits = 2)
+  expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
+  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
+})
+
+test_that("describe data.table backend supports skip_ones and digits", {
+  skip_if_not(requireNamespace("data.table", quietly = TRUE),
+              "data.table must be installed to run this test")
+  local_mocked_bindings(has_pkg = function(p) p == "data.table", .package = "dataverifyr")
+
+  x <- data.frame(
+    num = c(1.234567, 1.234567, 9.876543, 9.876543, NA_real_),
+    chr = c("a", "a", "b", "c", NA_character_)
+  )
+
+  d <- describe(x, skip_ones = FALSE, digits = 2)
+  expect_equal(detect_backend(x), "data.table")
+  expect_equal(class(d), c("data.table", "data.frame"))
+  expect_equal(d$most_frequent[d$var == "num"], "1.23 (2), 9.88 (2), NA (1)")
+  expect_equal(d$most_frequent[d$var == "chr"], "a (2), b (1), c (1)")
+
+  d2 <- describe(x, skip_ones = TRUE, digits = 2)
+  expect_equal(d2$most_frequent[d2$var == "num"], "1.23 (2), 9.88 (2)")
+  expect_equal(d2$most_frequent[d2$var == "chr"], "a (2)")
 })
diff --git a/tests/testthat/test-detect_backend.R b/tests/testthat/test-detect_backend.R
index 67341af..c70b8e6 100644
--- a/tests/testthat/test-detect_backend.R
+++ b/tests/testthat/test-detect_backend.R
@@ -1,6 +1,3 @@
-library(testthat)
-library(dataverifyr)
-
 test_that("detect_backend works as expected", {
   data <- mtcars
 
@@ -51,17 +48,31 @@ test_that("detect_backend works as expected", {
   # test arrow dataset
   class(data) <- c("FileSystemDataset", "Dataset", "ArrowObject", "R6")
   pkgs <- NULL
-  expect_error(detect_backend(data), "The arrow package needs to be installed")
+  expect_error(
+    detect_backend(data),
+    "The arrow and dbplyr packages need to be installed in order to test an ArrowObject"
+  )
 
-  pkgs <- "arrow"
+  pkgs <- c("arrow", "dbplyr")
   expect_equal(detect_backend(data), "collectibles")
 
 
   # test DBI dataset
   class(data) <- c("tbl_SQLiteConnection", "tbl_dbi", "tbl_sql", "tbl_lazy", "tbl")
   pkgs <- NULL
-  expect_error(detect_backend(data), "The DBI package needs to be installed")
+  expect_error(
+    detect_backend(data),
+    "The DBI and dbplyr packages need to be installed in order to test a tbl_sql"
+  )
 
-  pkgs <- "DBI"
+  pkgs <- c("DBI", "dbplyr")
   expect_equal(detect_backend(data), "collectibles")
 })
+
+test_that("detect_backend falls back to dplyr for data.frame when data.table is unavailable", {
+  data <- mtcars
+  pkgs <- "dplyr"
+  local_mocked_bindings(has_pkg = function(p) p %in% pkgs, .package = "dataverifyr")
+
+  expect_equal(detect_backend(data), "dplyr")
+})
diff --git a/tests/testthat/test-features-v1.R b/tests/testthat/test-features-v1.R
new file mode 100644
index 0000000..cfe26c2
--- /dev/null
+++ b/tests/testthat/test-features-v1.R
@@ -0,0 +1,155 @@
+test_that("rule spec constructors validate inputs", {
+  expect_false(exists("col_rule", where = asNamespace("dataverifyr"), inherits = FALSE))
+  expect_true(exists("data_column", where = asNamespace("dataverifyr"), inherits = FALSE))
+  expect_true(exists("reference_rule", where = asNamespace("dataverifyr"), inherits = FALSE))
+
+  expect_error(data_column(col = 1), "character")
+  expect_error(data_column(col = "a", type = 1), "character")
+  expect_error(data_column(col = "a", optional = "yes"), "logical")
+
+  expect_error(
+    ruleset(
+      rule(a > 0),
+      data_columns = list(
+        data_column("a"),
+        data_column("a")
+      )
+    ),
+    "duplicate"
+  )
+})
+
+
+test_that("write_rules/read_rules support structured v1 yaml", {
+  rs <- ruleset(
+    rule(a > 0, name = "positive a"),
+    data_columns = list(
+      data_column("a", type = "int", optional = FALSE, description = "A column")
+    ),
+    meta = dataverifyr:::rule_meta(title = "demo", version = "1.0", description = "desc")
+  )
+
+  file <- tempfile(fileext = ".yml")
+  write_rules(rs, file, format = "v1")
+
+  yaml_lines <- readLines(file)
+  expect_true(any(grepl("^meta:$", yaml_lines)))
+  expect_true(any(grepl("^data-columns:$", yaml_lines)))
+  expect_true(any(grepl("^data-rules:$", yaml_lines)))
+
+  rs2 <- read_rules(file)
+  expect_s3_class(rs2, "ruleset")
+  expect_equal(rs2[[1]]$name, "positive a")
+  expect_equal(attr(rs2, "meta")$title, "demo")
+  expect_equal(attr(rs2, "data_columns")[[1]]$col, "a")
+})
+
+
+test_that("read_rules rejects malformed v1 yaml", {
+  file <- tempfile(fileext = ".yml")
+  writeLines(c(
+    "meta:",
+    "  title: bad",
+    "data-columns:",
+    "  - col: a"
+  ), file)
+
+  expect_error(read_rules(file), "data-rules")
+})
+
+
+test_that("check_data validates schema and supports extra_columns modes", {
+  rs <- ruleset(
+    rule(a > 0),
+    data_columns = list(
+      data_column("a", type = "int", optional = FALSE),
+      data_column("b", type = "int", optional = TRUE)
+    )
+  )
+
+  x <- data.frame(a = 1:3, b = 1:3, c = 9:11)
+
+  res_ignore <- check_data(x, rs, extra_columns = "ignore")
+  expect_true("check_type" %in% names(res_ignore))
+  expect_equal(names(res_ignore)[[1]], "check_type")
+  expect_true(all(res_ignore$fail == 0))
+  expect_true(any(res_ignore$check_type == "schema"))
+  expect_true(any(res_ignore$check_type == "row_rule"))
+
+  expect_warning(check_data(x, rs, extra_columns = "warn"), "extra")
+  expect_error(check_data(x, rs, extra_columns = "fail"), "extra")
+
+  res_missing <- check_data(data.frame(b = 1:2), rs)
+  expect_true(any(
+    res_missing$check_type == "schema" &
+      grepl("exists", res_missing$name) &
+      res_missing$fail == 1
+  ))
+  expect_error(
+    check_data(data.frame(b = 1:2), rs, stop_on_schema_fail = TRUE),
+    "schema fails"
+  )
+
+  rs_unknown <- ruleset(
+    rule(does_not_exist > 0),
+    data_columns = list(data_column("a"))
+  )
+  expect_error(check_data(data.frame(a = 1:3), rs_unknown), "Unknown symbols")
+
+  rs_type <- ruleset(
+    rule(a %in% c("x", "y")),
+    data_columns = list(data_column("a", type = "int"))
+  )
+  res_type <- check_data(data.frame(a = c("x", "y")), rs_type)
+  expect_true(any(
+    res_type$check_type == "schema" &
+      grepl("type", res_type$name) &
+      res_type$fail == 1
+  ))
+  expect_error(
+    check_data(data.frame(a = c("x", "y")), rs_type, stop_on_schema_fail = TRUE),
+    "schema fails"
+  )
+})
+
+test_that("check_data supports ruleset with only data_columns", {
+  rs <- ruleset(
+    data_columns = list(
+      data_column("a", type = "int", optional = FALSE),
+      data_column("b", type = "str", optional = TRUE)
+    )
+  )
+
+  res <- check_data(data.frame(a = 1:3), rs)
+  expect_true(nrow(res) > 0)
+  expect_true(all(res$check_type == "schema"))
+  expect_false(any(res$check_type == "row_rule"))
+  expect_true(all(res$fail == 0))
+})
+
+
+test_that("check_data supports cross-reference rules across datasets", {
+  flights <- data.frame(carrier = c("AA", "BB", NA_character_))
+  carriers <- data.frame(carrier_id = c("AA"))
+
+  rs <- ruleset(
+    reference_rule(
+      local_col = "carrier",
+      ref_dataset = "carriers",
+      ref_col = "carrier_id",
+      name = "carrier in carriers",
+      allow_na = TRUE
+    ),
+    data_name = "flights"
+  )
+
+  res <- check_data(list(flights = flights, carriers = carriers), rs)
+  expect_equal(res$name, "carrier in carriers")
+  expect_equal(res$pass, 2)
+  expect_equal(res$fail, 1)
+
+  expect_error(
+    check_data(list(flights = flights), rs),
+    "referenced dataset"
+  )
+})
diff --git a/tests/testthat/test-im-export.R b/tests/testthat/test-im-export.R
index 8a869de..aab2dd8 100644
--- a/tests/testthat/test-im-export.R
+++ b/tests/testthat/test-im-export.R
@@ -1,68 +1,114 @@
-test_that("Basic im and export works", {
-  rr <- ruleset(
-    rule(mpg > 10),
-    rule(cyl %in% c(4, 6, 8))
-  )
-  file <- tempfile(fileext = ".yml")
-  write_rules(rr, file)
-
-  expect_equal(readLines(file),
-               c("- name: 'Rule for: mpg'",
-                 "  expr: mpg > 10",
-                 "  allow_na: no",
-                 "  negate: no",
-                 "  index: 1",
-                 "- name: 'Rule for: cyl'",
-                 "  expr: cyl %in% c(4, 6, 8)",
-                 "  allow_na: no",
-                 "  negate: no",
-                 "  index: 2"))
-
-  rr2 <- read_rules(file)
-  expect_equal(rr, rr2)
-
-
-  # additional information is carried along as well
-  rr <- ruleset(
-    rule(mpg > 10, author = "me"),
-    rule(cyl %in% c(4, 6, 8), date = "2020-02-29")
-  )
-  file <- tempfile(fileext = ".yml")
-  write_rules(rr, file)
-
-  expect_equal(readLines(file),
-               c("- name: 'Rule for: mpg'",
-                 "  expr: mpg > 10",
-                 "  allow_na: no",
-                 "  negate: no",
-                 "  author: me",
-                 "  index: 1",
-                 "- name: 'Rule for: cyl'",
-                 "  expr: cyl %in% c(4, 6, 8)",
-                 "  allow_na: no",
-                 "  negate: no",
-                 "  date: '2020-02-29'",
-                 "  index: 2"))
-
-  rr2 <- read_rules(file)
-  expect_equal(rr, rr2)
-})
-
-
-
-test_that("Single rule im and export works", {
-  rr <- rule(mpg > 10)
-  file <- tempfile(fileext = ".yml")
-  write_rules(rr, file)
-
-  expect_equal(readLines(file),
-               c("- name: 'Rule for: mpg'",
-                 "  expr: mpg > 10",
-                 "  allow_na: no",
-                 "  negate: no",
-                 "  index: 1"))
-
-  rr2 <- read_rules(file)
-  rr2$index <- NULL
-  expect_equal(rr, rr2)
-})
+test_that("Basic im and export works", {
+  rr <- ruleset(
+    rule(mpg > 10),
+    rule(cyl %in% c(4, 6, 8))
+  )
+  file <- tempfile(fileext = ".yml")
+  write_rules(rr, file)
+
+  expect_equal(
+    readLines(file),
+    c(
+      "meta: ~",
+      "data-columns: []",
+      "data-rules:",
+      "- name: 'Rule for: mpg'",
+      "  expr: mpg > 10",
+      "  allow_na: no",
+      "  negate: no",
+      "  index: 1",
+      "- name: 'Rule for: cyl'",
+      "  expr: cyl %in% c(4, 6, 8)",
+      "  allow_na: no",
+      "  negate: no",
+      "  index: 2"
+    )
+  )
+
+  rr2 <- read_rules(file)
+  expect_s3_class(rr2, "ruleset")
+  attr(rr2, "data_columns") <- NULL
+  expect_equal(rr, rr2)
+
+  # additional information is carried along as well
+  rr <- ruleset(
+    rule(mpg > 10, author = "me"),
+    rule(cyl %in% c(4, 6, 8), date = "2020-02-29")
+  )
+  file <- tempfile(fileext = ".yml")
+  write_rules(rr, file)
+
+  expect_equal(
+    readLines(file),
+    c(
+      "meta: ~",
+      "data-columns: []",
+      "data-rules:",
+      "- name: 'Rule for: mpg'",
+      "  expr: mpg > 10",
+      "  allow_na: no",
+      "  negate: no",
+      "  author: me",
+      "  index: 1",
+      "- name: 'Rule for: cyl'",
+      "  expr: cyl %in% c(4, 6, 8)",
+      "  allow_na: no",
+      "  negate: no",
+      "  date: '2020-02-29'",
+      "  index: 2"
+    )
+  )
+
+  rr2 <- read_rules(file)
+  expect_s3_class(rr2, "ruleset")
+  attr(rr2, "data_columns") <- NULL
+  expect_equal(rr, rr2)
+})
+
+
+test_that("Single rule im and export works", {
+  rr <- rule(mpg > 10)
+  file <- tempfile(fileext = ".yml")
+  write_rules(rr, file)
+
+  expect_equal(
+    readLines(file),
+    c(
+      "meta: ~",
+      "data-columns: []",
+      "data-rules:",
+      "- name: 'Rule for: mpg'",
+      "  expr: mpg > 10",
+      "  allow_na: no",
+      "  negate: no",
+      "  index: 1"
+    )
+  )
+
+  rr2 <- read_rules(file)
+  expect_s3_class(rr2, "ruleset")
+  rr$index <- 1
+  expect_equal(rr, rr2[[1]])
+})
+
+
+test_that("Legacy pre_v1 format remains supported", {
+  rr <- ruleset(rule(mpg > 10))
+  file <- tempfile(fileext = ".yml")
+  write_rules(rr, file, format = "pre_v1")
+
+  expect_equal(
+    readLines(file),
+    c(
+      "- name: 'Rule for: mpg'",
+      "  expr: mpg > 10",
+      "  allow_na: no",
+      "  negate: no",
+      "  index: 1"
+    )
+  )
+
+  rr2 <- read_rules(file)
+  expect_s3_class(rr2, "rule")
+  expect_equal(rr[[1]], rr2)
+})
diff --git a/tests/testthat/test-rule.R b/tests/testthat/test-rule.R
index 58cb8b9..1ae50f9 100644
--- a/tests/testthat/test-rule.R
+++ b/tests/testthat/test-rule.R
@@ -1,35 +1,82 @@
-test_that("Basic rules work", {
-
-  # basic version of rule
-  expect_equal(unclass(rule(mpg > 10)),
-               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
-                    negate = FALSE))
-  # expression can be given as a string as well
-  expect_equal(unclass(rule("mpg > 10")),
-               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
-                    negate = FALSE))
-
-  # additional information is carried along as well
-  expect_equal(unclass(rule(mpg > 10, author = "me", date = Sys.Date())),
-               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
-                    negate = FALSE, author = "me", date = Sys.Date()))
-
-  # rules can span multiple lines and do not throw a warning!
-  r <- rule(mpg > 10 &
-              cyl %in% c(4, 6, 8) |
-              disp > 10)
-  expect_equal(r$expr, "mpg > 10 & cyl %in% c(4, 6, 8) | disp > 10")
-})
-
-
-test_that("get_symbols works as expected", {
-  f <- function(x) deparse(substitute(x))
-  get_symbols <- dataverifyr:::get_symbols
-
-  expect_equal(get_symbols(f(mpg > 10)), "mpg")
-
-  expect_equal(
-    get_symbols(f(mpg > 10 & mpg <= 123 | is.na(cyl) & as.numeric(wt) > qsec)),
-    c("mpg", "cyl", "wt", "qsec")
-  )
-})
+test_that("Basic rules work", {
+
+  # basic version of rule
+  expect_equal(unclass(rule(mpg > 10)),
+               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
+                    negate = FALSE))
+  # expression can be given as a string as well
+  expect_equal(unclass(rule("mpg > 10")),
+               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
+                    negate = FALSE))
+
+  # additional information is carried along as well
+  expect_equal(unclass(rule(mpg > 10, author = "me", date = Sys.Date())),
+               list(name = "Rule for: mpg", expr = "mpg > 10", allow_na = FALSE,
+                    negate = FALSE, author = "me", date = Sys.Date()))
+
+  # rules can span multiple lines and do not throw a warning!
+  r <- rule(mpg > 10 &
+              cyl %in% c(4, 6, 8) |
+              disp > 10)
+  expect_equal(r$expr, "mpg > 10 & cyl %in% c(4, 6, 8) | disp > 10")
+})
+
+
+test_that("get_symbols works as expected", {
+  f <- function(x) deparse(substitute(x))
+  get_symbols <- dataverifyr:::get_symbols
+
+  expect_equal(get_symbols(f(mpg > 10)), "mpg")
+
+  expect_equal(
+    get_symbols(f(mpg > 10 & mpg <= 123 | is.na(cyl) & as.numeric(wt) > qsec)),
+    c("mpg", "cyl", "wt", "qsec")
+  )
+})
+
+test_that("print.rule prints expected structure", {
+  r <- rule(
+    mpg > 10,
+    name = "mpg rule",
+    allow_na = TRUE,
+    negate = TRUE,
+    author = "qa"
+  )
+
+  out <- paste(capture.output(print(r)), collapse = "\n")
+
+  expect_match(out, "<Verification Rule>")
+  expect_match(out, "expr: 'mpg > 10'")
+  expect_match(out, "name: 'mpg rule'")
+  expect_match(out, "allow NA: TRUE")
+  expect_match(out, "negated:  TRUE")
+  expect_match(out, "author: 'qa'")
+  ret <- NULL
+  capture.output(ret <- withVisible(print(r)))
+  expect_false(ret$visible)
+  expect_identical(ret$value, r)
+})
+
+test_that("print.ruleset respects n and truncation message", {
+  rs <- ruleset(
+    rule(mpg > 10, name = "mpg"),
+    rule(cyl %in% c(4, 6, 8), name = "cyl", allow_na = TRUE),
+    rule(qsec > 14, name = "qsec", negate = TRUE),
+    rule(hp < 300, name = "hp")
+  )
+
+  out_short <- paste(capture.output(print(rs, n = 3)), collapse = "\n")
+  expect_match(out_short, "<Verification Ruleset with 4 elements>")
+  expect_match(out_short, "\\[1\\] 'mpg' matching `mpg > 10` \\(allow_na: FALSE\\)")
+  expect_match(out_short, "\\[2\\] 'cyl' matching `cyl %in% c\\(4, 6, 8\\)` \\(allow_na: TRUE\\)")
+  expect_match(out_short, "\\[3\\] 'qsec' matching `qsec > 14` \\(allow_na: FALSE, negated\\)")
+  expect_match(out_short, "\\.\\.\\. \\+1 more\\. Use print\\(ruleset, n = 10\\) to print more\\.")
+
+  out_long <- paste(capture.output(print(rs, n = 10)), collapse = "\n")
+  expect_match(out_long, "\\[4\\] 'hp' matching `hp < 300` \\(allow_na: FALSE\\)")
+  expect_no_match(out_long, "\\.\\.\\. \\+")
+  ret <- NULL
+  capture.output(ret <- withVisible(print(rs)))
+  expect_false(ret$visible)
+  expect_identical(ret$value, rs)
+})
diff --git a/tests/testthat/test-ruleset_construction.R b/tests/testthat/test-ruleset_construction.R
index 89126e6..d492e90 100644
--- a/tests/testthat/test-ruleset_construction.R
+++ b/tests/testthat/test-ruleset_construction.R
@@ -1,78 +1,78 @@
 test_that("Rules can be added together", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      expect_equal(
-        rule_1 + rule_2,
-        ruleset(rule_1, rule_2)
-      )
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  expect_equal(
+    rule_1 + rule_2,
+    ruleset(rule_1, rule_2)
+  )
 })
 
 test_that("Rules can be added to rulesets", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      rule_3 <- rule(name == "henry")
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  rule_3 <- rule(name == "henry")
 
-      expect_equal(
-        rule_1 + ruleset(rule_2, rule_3),
-        ruleset(rule_1, rule_2, rule_3)
-      )
+  expect_equal(
+    rule_1 + ruleset(rule_2, rule_3),
+    ruleset(rule_1, rule_2, rule_3)
+  )
 })
 
 test_that("rulesets can be added to rules", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      rule_3 <- rule(name == "henry")
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  rule_3 <- rule(name == "henry")
 
-      expect_equal(
-        ruleset(rule_1, rule_2) + rule_3,
-        ruleset(rule_1, rule_2, rule_3)
-      )
+  expect_equal(
+    ruleset(rule_1, rule_2) + rule_3,
+    ruleset(rule_1, rule_2, rule_3)
+  )
 })
 
 test_that("rulesets can be added to rules", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      rule_3 <- rule(name == "henry")
-      rule_4 <- rule(sex == "female")
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  rule_3 <- rule(name == "henry")
+  rule_4 <- rule(sex == "female")
 
-      expect_equal(
-        ruleset(rule_1, rule_2) + ruleset(rule_3, rule_4),
-        ruleset(rule_1, rule_2, rule_3, rule_4)
-      )
+  expect_equal(
+    ruleset(rule_1, rule_2) + ruleset(rule_3, rule_4),
+    ruleset(rule_1, rule_2, rule_3, rule_4)
+  )
 })
 
 test_that("duplicates are removed when adding rules and rulesets", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      expect_equal(
-        length(ruleset(rule_1, rule_2) + ruleset(rule_1, rule_2)),
-        2
-      )
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  expect_equal(
+    length(ruleset(rule_1, rule_2) + ruleset(rule_1, rule_2)),
+    2
+  )
 
-      expect_equal(
-        length(rule_1 + rule_2 + ruleset(rule_1, rule_2) + ruleset(rule_1, rule_2)),
-        2
-      )
+  expect_equal(
+    length(rule_1 + rule_2 + ruleset(rule_1, rule_2) + ruleset(rule_1, rule_2)),
+    2
+  )
 })
 
 test_that("bind_rules works", {
-      rule_1 <- rule(mpg > 10)
-      rule_2 <- rule(hp > 10)
-      rule_3 <- rule(name == "henry")
-      rule_4 <- rule(sex == "female")
+  rule_1 <- rule(mpg > 10)
+  rule_2 <- rule(hp > 10)
+  rule_3 <- rule(name == "henry")
+  rule_4 <- rule(sex == "female")
 
-      expect_equal(
-        bind_rules(list(rule_1, rule_2, ruleset(rule_3, rule_4))),
-        ruleset(rule_1, rule_2, rule_3, rule_4)
-      )
+  expect_equal(
+    bind_rules(list(rule_1, rule_2, ruleset(rule_3, rule_4))),
+    ruleset(rule_1, rule_2, rule_3, rule_4)
+  )
 
-      expect_equal(
-        bind_rules(list(ruleset(rule_1, rule_2), ruleset(rule_3, rule_4))),
-        ruleset(rule_1, rule_2, rule_3, rule_4)
-      )
+  expect_equal(
+    bind_rules(list(ruleset(rule_1, rule_2), ruleset(rule_3, rule_4))),
+    ruleset(rule_1, rule_2, rule_3, rule_4)
+  )
 
-      expect_equal(
-        bind_rules(list(rule_1, ruleset(rule_2), ruleset(rule_3, rule_4))),
-        ruleset(rule_1, rule_2, rule_3, rule_4)
-      )
+  expect_equal(
+    bind_rules(list(rule_1, ruleset(rule_2), ruleset(rule_3, rule_4))),
+    ruleset(rule_1, rule_2, rule_3, rule_4)
+  )
 })
diff --git a/tests/testthat/test-sample_data.R b/tests/testthat/test-sample_data.R
new file mode 100644
index 0000000..ba37812
--- /dev/null
+++ b/tests/testthat/test-sample_data.R
@@ -0,0 +1,66 @@
+test_that("sample_data is available and has documented structure", {
+  expect_true(exists("sample_data", where = asNamespace("dataverifyr"), inherits = FALSE))
+
+  x <- get("sample_data", envir = asNamespace("dataverifyr"))
+  expect_s3_class(x, "data.frame")
+
+  expect_equal(
+    names(x),
+    c("order_id", "customer_tier", "amount", "paid", "payment_method", "order_time")
+  )
+
+  expect_true(is.integer(x$order_id))
+  expect_true(is.character(x$customer_tier))
+  expect_true(is.numeric(x$amount))
+  expect_true(is.logical(x$paid))
+  expect_true(is.character(x$payment_method))
+  expect_s3_class(x$order_time, "POSIXct")
+
+  expect_true(any(is.na(x$customer_tier)))
+  expect_true(any(is.na(x$amount)))
+  expect_true(any(is.na(x$paid)))
+  expect_true(any(is.na(x$payment_method)))
+  expect_true(any(is.na(x$order_time)))
+})
+
+
+test_that("sample_data can be used for row and column checks", {
+  rs <- ruleset(
+    rule(amount >= 0, name = "amount must be non-negative", allow_na = TRUE),
+    rule(!paid | payment_method != "none", name = "paid orders require payment", allow_na = TRUE),
+    data_columns = list(
+      data_column("order_id", type = "int", optional = FALSE),
+      data_column("customer_tier", type = "str", optional = FALSE),
+      data_column("amount", type = "double", optional = FALSE),
+      data_column("paid", type = "logical", optional = FALSE),
+      data_column("payment_method", type = "str", optional = FALSE),
+      data_column("order_time", type = "str", optional = TRUE)
+    )
+  )
+
+  x <- get("sample_data", envir = asNamespace("dataverifyr"))
+
+  res_type <- check_data(x, rs)
+  expect_true(any(
+    res_type$check_type == "schema" &
+      grepl("type", res_type$name) &
+      res_type$fail == 1
+  ))
+  expect_error(check_data(x, rs, stop_on_schema_fail = TRUE), "schema fails")
+
+  rs_ok <- ruleset(
+    rule(amount >= 0, name = "amount must be non-negative", allow_na = TRUE),
+    data_columns = list(
+      data_column("order_id", type = "int", optional = FALSE),
+      data_column("customer_tier", type = "str", optional = FALSE),
+      data_column("amount", type = "double", optional = FALSE),
+      data_column("paid", type = "logical", optional = FALSE),
+      data_column("payment_method", type = "str", optional = FALSE),
+      data_column("order_time", optional = TRUE)
+    )
+  )
+
+  res <- check_data(x, rs_ok, extra_columns = "ignore")
+  expect_s3_class(res, "data.frame")
+  expect_true(all(c("name", "pass", "fail") %in% names(res)))
+})