GH-35806: [R] Improve error message for null type inference with sparse CSV data (#49338)

thisisnic · jonkeane · web-flow · commit 2a526c1e623d · 2026-03-27T14:59:23.000Z
### Rationale for this change When reading a CSV with sparse data (many missing values followed by actual values), Arrow can infer a column type as `null` based on the first block of data. When non-null values appear later, the error message incorrectly suggests using `skip = 1` for header rows, which is misleading. ### What changes are included in this PR? Adds a specific check for "conversion error to null" that provides a helpful message explaining the cause (type inference from sparse data) and the solution (change the block size to use for inference). ### Are these changes tested? Yes, added a test in `test-dataset-csv.R`. ### Are there any user-facing changes? Yes, improved error message when CSV type inference fails due to sparse data. --- This PR was authored by Claude (Opus 4.5) and reviewed by @ thisisnic. 🤖 Generated with [Claude Code](https://claude.ai/code) * GitHub Issue: #35806 Lead-authored-by: Nic Crane <thisisnic@gmail.com> Co-authored-by: Jonathan Keane <jkeane@gmail.com> Signed-off-by: Nic Crane <thisisnic@gmail.com>
diff --git a/r/R/util.R b/r/R/util.R
@@ -196,6 +196,20 @@ repeat_value_as_array <- function(object, n) {
 }
 
 handle_csv_read_error <- function(msg, call, schema) {
+  # Dataset collection passes empty schema() when no explicit
+  # CSV schema from the original call is available in this error path.
+  if (grepl("conversion error to null", msg) && is_empty_schema(schema)) {
+    msg <- c(
+      msg,
+      i = paste(
+        "If you have not specified the schema, this error may be due to the column type being",
+        "inferred as `null` because the first block of data contained only missing values.",
+        "See `?csv_read_options` for how to set a larger value or specify a schema if you know the correct types."
+      )
+    )
+    abort(msg, call = call)
+  }
+
   if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
     msg <- c(
       msg,
@@ -290,3 +304,7 @@ col_type_from_compact <- function(x, y) {
     abort(paste0("Unsupported compact specification: '", x, "' for column '", y, "'"))
   )
 }
+
+is_empty_schema <- function(x) { # TRUE only for a Schema with zero fields; FALSE for NULL or any non-Schema
+  inherits(x, "Schema") && x$num_fields == 0L # not `x == schema()`: that presumably errors when `x` is NULL
+}
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
     tibble(x = 1.2, y = "c")
   )
 })
+
+test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
+  # 100 rows where `y` is missing, followed by a single row where it is set
+  sparse_csv <- tempfile()
+  on.exit(unlink(sparse_csv))
+  writeLines(c("x,y", paste0(1:100, ","), "101,foo"), sparse_csv)
+
+  # Small block size so that, per GH-35806, type inference presumably only
+  # sees leading rows in which `y` is missing
+  small_blocks <- csv_read_options(block_size = 100L)
+
+  expect_error(
+    collect(
+      open_dataset(sparse_csv, format = "csv", read_options = small_blocks)
+    ),
+    "column type being inferred as"
+  )
+})