From 9abfbbe8721f67f41770c469617fe29c2175ca91 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 23 Mar 2026 20:56:25 -0400 Subject: [PATCH 1/6] Local Zarr store wrapper for use in tests. --- Cargo.lock | 4 + Cargo.toml | 4 + data/zarr_store.zarr/meta/bbox/c/0 | Bin 101 -> 0 bytes data/zarr_store.zarr/meta/bbox/zarr.json | 38 ---- data/zarr_store.zarr/meta/collection/c/0 | Bin 39 -> 0 bytes .../zarr_store.zarr/meta/collection/zarr.json | 38 ---- data/zarr_store.zarr/meta/date/c/0 | Bin 33 -> 0 bytes data/zarr_store.zarr/meta/date/zarr.json | 46 ----- data/zarr_store.zarr/meta/zarr.json | 5 - data/zarr_store.zarr/zarr.json | 5 - src/schema.rs | 58 +++--- src/table_provider.rs | 70 ++++--- src/testing/iter_group.rs | 15 +- src/testing/load_into_arrow.rs | 181 +++++++++-------- src/testing/load_zarrs.rs | 190 ++++++++---------- src/testing/mod.rs | 1 + src/testing/utils.rs | 171 ++++++++++++++++ 17 files changed, 441 insertions(+), 385 deletions(-) delete mode 100644 data/zarr_store.zarr/meta/bbox/c/0 delete mode 100644 data/zarr_store.zarr/meta/bbox/zarr.json delete mode 100644 data/zarr_store.zarr/meta/collection/c/0 delete mode 100644 data/zarr_store.zarr/meta/collection/zarr.json delete mode 100644 data/zarr_store.zarr/meta/date/c/0 delete mode 100644 data/zarr_store.zarr/meta/date/zarr.json delete mode 100644 data/zarr_store.zarr/meta/zarr.json delete mode 100644 data/zarr_store.zarr/zarr.json create mode 100644 src/testing/utils.rs diff --git a/Cargo.lock b/Cargo.lock index dcad309..5b5e628 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6140,8 +6140,10 @@ dependencies = [ "arrow-array", "arrow-schema", "async-trait", + "chrono", "datafusion", "futures", + "geo", "geoarrow-array", "geoarrow-schema", "geodatafusion", @@ -6149,6 +6151,8 @@ dependencies = [ "object_store 0.12.5", "thiserror 2.0.18", "tokio", + "walkdir", + "wkb", "zarrs", "zarrs_filesystem", "zarrs_icechunk", diff --git a/Cargo.toml b/Cargo.toml index f1df2e3..69d4ce0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,5 +24,9 @@ zarrs_object_store = "0.5.0" zarrs_storage = { version = "0.4.0", features = ["async"] } [dev-dependencies] +chrono = "0.4.44" +walkdir = "2.5.0" +geo = "0.31" geoarrow-array = "0.7.0" tokio = { version = "1.48", features = ["macros", "rt-multi-thread"] } +wkb = "0.9" diff --git a/data/zarr_store.zarr/meta/bbox/c/0 b/data/zarr_store.zarr/meta/bbox/c/0 deleted file mode 100644 index 989c27c2d6474e5ebc499a0adbf184f1002b0064..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 101 zcmdPcs{c1Zo#8GMg9{Tg0|P@W5Ho@ptPCm+3@Qg091a|CFaZHO5I7*spul~C!H3~p y1NQ>PC(M$}4h_~1*k&-?VH9^@KEW = collection_array - .retrieve_array_subset_elements(&collection_subset) - .unwrap(); - - // Load date array (datetime64[ms]) - let date_array = Array::open(store.clone(), "/meta/date").unwrap(); - let date_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); - let date_data: Vec = date_array - .retrieve_array_subset_elements(&date_subset) - .unwrap(); - - // Load bbox array (binary data representing WKB geometries) - let bbox_array = Array::open(store.clone(), "/meta/bbox").unwrap(); - let bbox_subset = ArraySubset::new_with_shape(bbox_array.shape().to_vec()); - let bbox_data: Vec> = bbox_array - .retrieve_array_subset_elements(&bbox_subset) +#[tokio::test] +async fn test_load_zarrs_into_arrow_record_batch() { + let wrapper = get_local_zarr_store("data/arrow_batch.zarr").await; + let path = wrapper.get_store_path(); + + { + let store = Arc::new(FilesystemStore::new(path).unwrap()); + + // Load collection array (strings) + let collection_array = Array::open(store.clone(), "/meta/collection").unwrap(); + let collection_subset = ArraySubset::new_with_shape(collection_array.shape().to_vec()); + let collection_data: Vec = collection_array + .retrieve_array_subset_elements(&collection_subset) + .unwrap(); + + // Load date array (datetime64[ms]) + let date_array = Array::open(store.clone(), "/meta/date").unwrap(); + let date_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); + let date_data: Vec = date_array + .retrieve_array_subset_elements(&date_subset) + .unwrap(); + + // Load bbox array (binary data representing WKB geometries) + let bbox_array = Array::open(store.clone(), "/meta/bbox").unwrap(); + let bbox_subset = ArraySubset::new_with_shape(bbox_array.shape().to_vec()); + let bbox_data: Vec> = bbox_array + .retrieve_array_subset_elements(&bbox_subset) + .unwrap(); + + // Create Arrow arrays from the loaded data + let collection_arrow: ArrayRef = Arc::new(StringArray::from(collection_data.clone())); + let date_arrow: ArrayRef = Arc::new(TimestampMillisecondArray::from(date_data.clone())); + let wkb_crs = Crs::from_authority_code("EPSG:4326".to_string()); + let wkb_metadata = Arc::new(geoarrow_schema::Metadata::new(wkb_crs, None)); + let bbox_refs: Vec<&[u8]> = bbox_data.iter().map(|v| v.as_slice()).collect(); + let wkb_arrow = WkbArray::new(bbox_refs.into(), wkb_metadata); + + // Define the schema + let schema = Arc::new(Schema::new(vec![ + Field::new("collection", DataType::Utf8, false), + Field::new( + "date", + DataType::Timestamp(TimeUnit::Millisecond, None), + false, + ), + wkb_arrow.data_type().to_field("bbox", false), + ])); + + // Create the RecordBatch + let record_batch = RecordBatch::try_new( + schema.clone(), + vec![collection_arrow, date_arrow, wkb_arrow.into_array_ref()], + ) .unwrap(); - // Create Arrow arrays from the loaded data - let collection_arrow: ArrayRef = Arc::new(StringArray::from(collection_data.clone())); - let date_arrow: ArrayRef = Arc::new(TimestampMillisecondArray::from(date_data.clone())); - let wkb_crs = Crs::from_authority_code("EPSG:4326".to_string()); - let wkb_metadata = Arc::new(geoarrow_schema::Metadata::new(wkb_crs, None)); - let bbox_refs: Vec<&[u8]> = bbox_data.iter().map(|v| v.as_slice()).collect(); - let wkb_arrow = WkbArray::new(bbox_refs.into(), wkb_metadata); - - // Define the schema - let schema = Arc::new(Schema::new(vec![ - Field::new("collection", DataType::Utf8, false), - Field::new( - "date", - DataType::Timestamp(TimeUnit::Millisecond, None), - false, - ), - wkb_arrow.data_type().to_field("bbox", false), - ])); - - // Create the RecordBatch - let record_batch = RecordBatch::try_new( - schema.clone(), - vec![collection_arrow, date_arrow, wkb_arrow.into_array_ref()], - ) - .unwrap(); - - // Print the record batch - println!("Created Arrow RecordBatch from Zarr arrays:"); - println!(" Schema: {:?}", record_batch.schema()); - println!(" Num rows: {}", record_batch.num_rows()); - println!(" Num columns: {}", record_batch.num_columns()); - println!("\nData:"); - for i in 0..record_batch.num_rows() { - println!( - " Row {}: collection='{}', date={}, bbox len={}", - i, - collection_data[i], - date_data[i], - bbox_data[i].len() - ); + // Print the record batch + println!("Created Arrow RecordBatch from Zarr arrays:"); + println!(" Schema: {:?}", record_batch.schema()); + println!(" Num rows: {}", record_batch.num_rows()); + println!(" Num columns: {}", record_batch.num_columns()); + println!("\nData:"); + for i in 0..record_batch.num_rows() { + println!( + " Row {}: collection='{}', date={}, bbox len={}", + i, + collection_data[i], + date_data[i], + bbox_data[i].len() + ); + } + + // Assertions + assert_eq!(record_batch.num_rows(), 3); + assert_eq!(record_batch.num_columns(), 3); + assert_eq!(record_batch.schema().fields().len(), 3); + + // Verify the data + let collection_col = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(collection_col.value(0), "collection_a"); + assert_eq!(collection_col.value(1), "collection_b"); + assert_eq!(collection_col.value(2), "collection_c"); + + let date_col = record_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(date_col.value(0), 1672531200000); + assert_eq!(date_col.value(1), 1672617600000); + assert_eq!(date_col.value(2), 1672704000000); } - - // Assertions - assert_eq!(record_batch.num_rows(), 3); - assert_eq!(record_batch.num_columns(), 3); - assert_eq!(record_batch.schema().fields().len(), 3); - - // Verify the data - let collection_col = record_batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(collection_col.value(0), "collection_a"); - assert_eq!(collection_col.value(1), "collection_b"); - assert_eq!(collection_col.value(2), "collection_c"); - - let date_col = record_batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(date_col.value(0), 1672531200000); - assert_eq!(date_col.value(1), 1672617600000); - assert_eq!(date_col.value(2), 1672704000000); } #[tokio::test] diff --git a/src/testing/load_zarrs.rs b/src/testing/load_zarrs.rs index a0b2153..0f75bed 100644 --- a/src/testing/load_zarrs.rs +++ b/src/testing/load_zarrs.rs @@ -1,3 +1,4 @@ +use crate::testing::utils::get_local_zarr_store; use icechunk::{Repository, RepositoryConfig, repository::VersionInfo}; use std::collections::HashMap; use std::path::Path; @@ -7,38 +8,42 @@ use zarrs::array_subset::ArraySubset; use zarrs_filesystem::FilesystemStore; use zarrs_icechunk::AsyncIcechunkStore; -#[test] -fn test_load_collection_array() { - // Create a filesystem store pointing to the zarr store - let store = Arc::new(FilesystemStore::new("data/zarr_store.zarr").unwrap()); - - // Open the collection array from the /meta/collection path - let collection_array = Array::open(store, "/meta/collection").unwrap(); - - // Print array metadata - println!("Array shape: {:?}", collection_array.shape()); - println!("Data type: {:?}", collection_array.data_type()); - - // Create array subset for the entire array (shape is [3]) - let array_subset = ArraySubset::new_with_shape(collection_array.shape().to_vec()); - - // Read the entire array as strings - let data: Vec = collection_array - .retrieve_array_subset_elements(&array_subset) - .unwrap(); - - println!("Collection array contents:"); - for (i, item) in data.iter().enumerate() { - println!(" [{}]: {}", i, item); +#[tokio::test] +async fn test_load_collection_array() { + let wrapper = get_local_zarr_store("data/collection.zarr").await; + let path = wrapper.get_store_path(); + + { + let store = Arc::new(FilesystemStore::new(path).unwrap()); + + // Open the collection array from the /meta/collection path + let collection_array = Array::open(store, "/meta/collection").unwrap(); + + // Print array metadata + println!("Array shape: {:?}", collection_array.shape()); + println!("Data type: {:?}", collection_array.data_type()); + + // Create array subset for the entire array (shape is [3]) + let array_subset = ArraySubset::new_with_shape(collection_array.shape().to_vec()); + + // Read the entire array as strings + let data: Vec = collection_array + .retrieve_array_subset_elements(&array_subset) + .unwrap(); + + println!("Collection array contents:"); + for (i, item) in data.iter().enumerate() { + println!(" [{}]: {}", i, item); + } + + // Basic assertions + assert!(!data.is_empty(), "Collection array should not be empty"); + assert_eq!( + collection_array.shape(), + &[3], + "Collection array should have 3 elements" + ); } - - // Basic assertions - assert!(!data.is_empty(), "Collection array should not be empty"); - assert_eq!( - collection_array.shape(), - &[3], - "Collection array should have 3 elements" - ); } #[tokio::test] @@ -84,38 +89,42 @@ async fn test_load_collection_array_icechunk() { ); } -#[test] -fn test_load_date_array() { - // Create a filesystem store pointing to the zarr store - let store = Arc::new(FilesystemStore::new("data/zarr_store.zarr").unwrap()); - - // Open the date array from the /meta/date path - let date_array = Array::open(store, "/meta/date").unwrap(); - - // Print array metadata - println!("Array shape: {:?}", date_array.shape()); - println!("Data type: {:?}", date_array.data_type()); - - // Create array subset for the entire array (shape is [3]) - let array_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); - - // Read the entire array as i64 milliseconds (datetime64[ms]) - let data: Vec = date_array - .retrieve_array_subset_elements(&array_subset) - .unwrap(); - - println!("Date array contents (milliseconds since epoch):"); - for (i, ms) in data.iter().enumerate() { - println!(" [{}]: {} ms", i, ms); +#[tokio::test] +async fn test_load_date_array() { + let wrapper = get_local_zarr_store("data/date.zarr").await; + let path = wrapper.get_store_path(); + + { + let store = Arc::new(FilesystemStore::new(path).unwrap()); + + // Open the date array from the /meta/date path + let date_array = Array::open(store, "/meta/date").unwrap(); + + // Print array metadata + println!("Array shape: {:?}", date_array.shape()); + println!("Data type: {:?}", date_array.data_type()); + + // Create array subset for the entire array (shape is [3]) + let array_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); + + // Read the entire array as i64 milliseconds (datetime64[ms]) + let data: Vec = date_array + .retrieve_array_subset_elements(&array_subset) + .unwrap(); + + println!("Date array contents (milliseconds since epoch):"); + for (i, ms) in data.iter().enumerate() { + println!(" [{}]: {} ms", i, ms); + } + + // Basic assertions + assert!(!data.is_empty(), "Date array should not be empty"); + assert_eq!( + date_array.shape(), + &[3], + "Date array should have 3 elements" + ); } - - // Basic assertions - assert!(!data.is_empty(), "Date array should not be empty"); - assert_eq!( - date_array.shape(), - &[3], - "Date array should have 3 elements" - ); } #[tokio::test] @@ -161,48 +170,17 @@ async fn test_load_date_array_icechunk() { ); } -#[test] -fn test_load_bbox_array() { - // Create a filesystem store pointing to the zarr store - let store = Arc::new(FilesystemStore::new("data/zarr_store.zarr").unwrap()); - - let bbox_array = Array::open(store.clone(), "/meta/bbox").unwrap(); - - println!("HII"); - dbg!(bbox_array.data_type()); - - // let array_subset = ArraySubset::new_with_shape(bbox_array.shape().to_vec()); - // let data: Vec> = bbox_array - // .retrieve_array_subset_elements(&array_subset) - // .unwrap(); - // dbg!(data); - - // // Note: The bbox array uses "variable_length_bytes" data type which is not yet - // // fully supported in zarrs 0.21.2. This test demonstrates reading the raw chunk data. - - // println!("Reading bbox array chunk data from meta/bbox/c/0"); - - // // Read the raw chunk data directly from storage - // let chunk_key = StoreKey::new("meta/bbox/c/0").unwrap(); - // let chunk_data = store.get(&chunk_key).unwrap().unwrap(); - // let chunk_bytes: Vec = chunk_data.to_vec(); - - // println!("Bbox chunk data:"); - // println!(" Chunk key: {}", chunk_key); - // println!(" Raw bytes length: {} bytes", chunk_bytes.len()); - // println!( - // " First 64 bytes (or all if less): {:?}", - // &chunk_bytes[..chunk_bytes.len().min(64)] - // ); - - // // Basic assertions - // assert!( - // !chunk_bytes.is_empty(), - // "Bbox chunk data should not be empty" - // ); - - // // The chunk contains compressed data (zstd) for 3 variable-length byte arrays - // println!("\nNote: This is the compressed chunk data. To properly decode:"); - // println!(" 1. Decompress with zstd"); - // println!(" 2. Decode vlen-bytes format (offsets + data)"); +#[tokio::test] +async fn test_load_bbox_array() { + let wrapper = get_local_zarr_store("data/bbox.zarr").await; + let path = wrapper.get_store_path(); + + { + let store = Arc::new(FilesystemStore::new(path).unwrap()); + + let bbox_array = Array::open(store.clone(), "/meta/bbox").unwrap(); + + println!("HII"); + dbg!(bbox_array.data_type()); + } } diff --git a/src/testing/mod.rs b/src/testing/mod.rs index 097f020..8c6f96d 100644 --- a/src/testing/mod.rs +++ b/src/testing/mod.rs @@ -2,3 +2,4 @@ pub mod datafusion; pub mod iter_group; pub mod load_into_arrow; pub mod load_zarrs; +pub mod utils; diff --git a/src/testing/utils.rs b/src/testing/utils.rs new file mode 100644 index 0000000..26b87a3 --- /dev/null +++ b/src/testing/utils.rs @@ -0,0 +1,171 @@ +use futures::executor::block_on; +use object_store::local::LocalFileSystem; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; +use zarrs::array::{ArrayBuilder, DataType, FillValue}; +use zarrs::array_subset::ArraySubset; +use zarrs::metadata_ext::data_type::NumpyTimeUnit; +use zarrs_object_store::AsyncObjectStore; +use zarrs_storage::{ + AsyncReadableWritableListableStorageTraits, AsyncWritableStorageTraits, StorePrefix, +}; + +pub(crate) struct LocalZarrStoreWrapper { + store: Arc>, + path: PathBuf, +} + +// Note that this wrapper should use unique store names to avoid collisons with +// running concurrent binary test execution. +#[cfg(test)] +impl LocalZarrStoreWrapper { + pub(crate) fn new(store_name: String) -> Self { + if store_name.is_empty() { + panic!("name for test zarr store cannot be empty!") + } + + let p = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(store_name); + + // Clean up any existing directory first (from failed tests or parallel execution) + if p.exists() { + let _ = fs::remove_dir_all(&p); + } + + fs::create_dir(p.clone()).unwrap(); + let store = AsyncObjectStore::new(LocalFileSystem::new_with_prefix(p.clone()).unwrap()); + Self { + store: Arc::new(store), + path: p, + } + } + + pub(crate) fn get_store(&self) -> Arc> { + self.store.clone() + } + + pub(crate) fn get_store_path(&self) -> String { + self.path.as_os_str().to_str().unwrap().into() + } +} + +// Include drop to remove store when it goes out of test scope +impl Drop for LocalZarrStoreWrapper { + fn drop(&mut self) { + // First, clear all data through the store interface + let prefix = StorePrefix::new("").unwrap(); + let _ = block_on(self.store.erase_prefix(&prefix)); + + // Then recursively remove the directory and all its contents + if self.path.exists() { + let _ = fs::remove_dir_all(&self.path); + } + } +} + +/// Creates three arrays in /meta group: +/// - date: datetime64[ms] array with dates [2023-01-01, 2023-01-02, 2023-01-03] +/// - collection: variable length UTF8 array with ["collection_a", "collection_b", "collection_c"] +/// - bbox: variable length bytes array with WKB-encoded boxes +#[cfg(test)] +pub(crate) async fn generate_test_data_arrays( + store: Arc, +) -> Result<(), Box> { + use chrono::NaiveDate; + + // Zarrs requires explicit group creation to + // avoid implicit group issues. + let group = zarrs::group::GroupBuilder::new().build(store.clone(), "/")?; + group.async_store_metadata().await?; + + let group = zarrs::group::GroupBuilder::new().build(store.clone(), "/meta")?; + group.async_store_metadata().await?; + + let dates = vec![ + NaiveDate::from_ymd_opt(2023, 1, 1).unwrap(), + NaiveDate::from_ymd_opt(2023, 1, 2).unwrap(), + NaiveDate::from_ymd_opt(2023, 1, 3).unwrap(), + ]; + + // Convert to milliseconds since Unix epoch + let date_data: Vec = dates + .iter() + .map(|d| d.and_hms_opt(0, 0, 0).unwrap().and_utc().timestamp_millis()) + .collect(); + + let date_array = ArrayBuilder::new( + vec![3], + vec![3], + DataType::NumpyDateTime64 { + unit: NumpyTimeUnit::Millisecond, + scale_factor: 1.try_into().unwrap(), + }, + FillValue::from(0i64), + ) + .build(store.clone(), "/meta/date")?; + + date_array.async_store_metadata().await?; + date_array + .async_store_array_subset_elements(&ArraySubset::new_with_shape(vec![3]), &date_data) + .await?; + + let collection_data = vec!["collection_a", "collection_b", "collection_c"]; + + let collection_array = + ArrayBuilder::new(vec![3], vec![3], DataType::String, FillValue::from("")) + .build(store.clone(), "/meta/collection")?; + + collection_array.async_store_metadata().await?; + collection_array + .async_store_array_subset_elements(&ArraySubset::new_with_shape(vec![3]), &collection_data) + .await?; + + // Create bbox array - variable length bytes (WKB format) + // Boxes: [-10,-10,10,10], [-20,-20,20,20], [-30,-30,30,30] + use geo::Polygon; + use geo::Rect; + use wkb::writer::{WriteOptions, write_polygon}; + + let boxes = vec![ + Rect::new( + geo::coord! { x: -10.0, y: -10.0 }, + geo::coord! { x: 10.0, y: 10.0 }, + ), + Rect::new( + geo::coord! { x: -20.0, y: -20.0 }, + geo::coord! { x: 20.0, y: 20.0 }, + ), + Rect::new( + geo::coord! { x: -30.0, y: -30.0 }, + geo::coord! { x: 30.0, y: 30.0 }, + ), + ]; + + let write_options = WriteOptions::default(); + let mut bbox_data: Vec> = Vec::new(); + for rect in boxes { + let polygon: Polygon = rect.into(); + let mut buffer = Vec::new(); + write_polygon(&mut buffer, &polygon, &write_options)?; + bbox_data.push(buffer); + } + + let bbox_array = ArrayBuilder::new(vec![3], vec![3], DataType::Bytes, FillValue::from(vec![])) + .build(store.clone(), "/meta/bbox")?; + + bbox_array.async_store_metadata().await?; + bbox_array + .async_store_array_subset_elements(&ArraySubset::new_with_shape(vec![3]), &bbox_data) + .await?; + + Ok(()) +} + +pub(crate) async fn get_local_zarr_store(dir_name: &str) -> LocalZarrStoreWrapper { + let wrapper = LocalZarrStoreWrapper::new(dir_name.into()); + let store = wrapper.get_store(); + generate_test_data_arrays(store) + .await + .expect("Failed to generate test data arrays"); + wrapper +} From 987ff86c4b080f719c3207f4eb66963e8a4a9128 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 24 Mar 2026 18:38:59 -0400 Subject: [PATCH 2/6] Add per test icechunk fixtures and remove pre-generated store. --- Cargo.lock | 1 - Cargo.toml | 1 - data/icechunk/manifests/KC53SCXBBZQR3HR12Z7G | Bin 167 -> 0 bytes data/icechunk/manifests/M0HCK3P6N722F5WSDJZG | Bin 160 -> 0 bytes data/icechunk/manifests/S9CF7DYNZ4GHPJ1JNTSG | Bin 234 -> 0 bytes data/icechunk/refs/branch.main/ref.json | 1 - data/icechunk/snapshots/1CECHNKREP0F1RSTCMT0 | Bin 175 -> 0 bytes data/icechunk/snapshots/ZKWW9E6BW6YFNNDKESR0 | Bin 922 -> 0 bytes .../transactions/ZKWW9E6BW6YFNNDKESR0 | Bin 232 -> 0 bytes src/testing/datafusion.rs | 19 ++-- src/testing/iter_group.rs | 6 +- src/testing/load_into_arrow.rs | 5 +- src/testing/load_zarrs.rs | 57 ++++++------ src/testing/utils.rs | 84 ++++++++++++++++-- 14 files changed, 125 insertions(+), 49 deletions(-) delete mode 100644 data/icechunk/manifests/KC53SCXBBZQR3HR12Z7G delete mode 100644 data/icechunk/manifests/M0HCK3P6N722F5WSDJZG delete mode 100644 data/icechunk/manifests/S9CF7DYNZ4GHPJ1JNTSG delete mode 100644 data/icechunk/refs/branch.main/ref.json delete mode 100644 data/icechunk/snapshots/1CECHNKREP0F1RSTCMT0 delete mode 100644 data/icechunk/snapshots/ZKWW9E6BW6YFNNDKESR0 delete mode 100644 data/icechunk/transactions/ZKWW9E6BW6YFNNDKESR0 diff --git a/Cargo.lock b/Cargo.lock index 5b5e628..965bce9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6151,7 +6151,6 @@ dependencies = [ "object_store 0.12.5", "thiserror 2.0.18", "tokio", - "walkdir", "wkb", "zarrs", "zarrs_filesystem", diff --git a/Cargo.toml b/Cargo.toml index 69d4ce0..4573ea2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,6 @@ zarrs_storage = { version = "0.4.0", features = ["async"] } [dev-dependencies] chrono = "0.4.44" -walkdir = "2.5.0" geo = "0.31" geoarrow-array = "0.7.0" tokio = { version = "1.48", features = ["macros", "rt-multi-thread"] } diff --git a/data/icechunk/manifests/KC53SCXBBZQR3HR12Z7G b/data/icechunk/manifests/KC53SCXBBZQR3HR12Z7G deleted file mode 100644 index eabe6dcc12fac1812d48fcd023e97fa8c64e583f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 167 zcmeZtcKtAad6%tf??_V288 zb5B&3ZM^z1BU)tRMT>%IT-n*SesH0 diff --git a/data/icechunk/manifests/S9CF7DYNZ4GHPJ1JNTSG b/data/icechunk/manifests/S9CF7DYNZ4GHPJ1JNTSG deleted file mode 100644 index 1eb9b69122be0185b2be774bc9b64fb0721ecf29..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 234 zcmeZtcKtAad6%fx0Y!_ZxpkGsMVD@ z!LuJ*UHq33==6@$B;IZD$M6ET8JE`Q2uYu~%W`$0NZIr#>#!8grANGWrgHti?wU8n zM0({y=~Np5u4JPe6}b&u?77bhc^p3)QQ1qU1ZBP78lSt` z_UFUj+;tvvmsx$2n9`zSXMafkKWk-;RfNuqSpt$Xj<0=c|Nr}OE-pcbYl<)R+&E0? dc9zO)4)L(y__3CS;p3?kmbG$TpDdf2m;ml~UGe|` diff --git a/data/icechunk/refs/branch.main/ref.json b/data/icechunk/refs/branch.main/ref.json deleted file mode 100644 index 1f3bbd7..0000000 --- a/data/icechunk/refs/branch.main/ref.json +++ /dev/null @@ -1 +0,0 @@ -{"snapshot":"ZKWW9E6BW6YFNNDKESR0"} \ No newline at end of file diff --git a/data/icechunk/snapshots/1CECHNKREP0F1RSTCMT0 b/data/icechunk/snapshots/1CECHNKREP0F1RSTCMT0 deleted file mode 100644 index b5a102383ab1129d68d176a61c3ca7d60bfb8a72..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 175 zcmeZtcKtAad6%h^Oj?W>ghj=b6*Bd<8unzjC$Tv}v%T|v6ZuPJWR-{tJ& zh-rIdw7l>N!#5-L4`$P!bO)pyWOz97$CInKqf0F%q_~(^M1*cVygV&%V%Lw&gm zRT}^%ZJSBJ7I8Y%yAQhrx`7!ANCT(OEK)!F7B80kB}$0@`p>rIhk{?a8y=?C8 z7%=_R;^&vK9OHrB)=Cw2lGYW$7&Fv~W%X6kZm#xkNdAq<|IxqVphkIega!%tZY~eRH)RAN|j1A zpy`fU^VXSnuNcx3t+>i;nXg$x3%W)TB)`#mGz-(|?#A_+E{j1CyFA!vzP_DqZiidY z9mQn+YUtTpzmj&BG1?u?;%L(DkrZ>qrb`S*@}nlYVqjxpwz+}W*GJvjI*1pXdK`iNy&2NtDB31Zbd$0uvF^c9_o`z-3PQ=%vG) z~M<5vAZ=M<`uPx3w-Oxu>(6ZTGD0 zj75~~xU?!QdgT?5 wpb6C~^b3gN>iqlRrV{;?4ExXs=u?BFmJupIHZZBXZ{3=I`H4q9#wz%zs(5$0&j0`b diff --git a/data/icechunk/transactions/ZKWW9E6BW6YFNNDKESR0 b/data/icechunk/transactions/ZKWW9E6BW6YFNNDKESR0 deleted file mode 100644 index 364f8f0ceeabc58f826195bc92d7433be67da50f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 232 zcmeZtcKtAad6%45Bz$eRvhx z1g$Jg9Y3G7{vP9)wCM1Wr;A-4DPH>YY4!61e&^1v-uicsl7gJWa~2+k>?hW icechunk::session::Session { - let storage = icechunk::new_local_filesystem_storage(Path::new("data/icechunk")) +#[tokio::test] +async fn test_datafusion() { + let ctx = SessionContext::new(); + + let wrapper = get_local_icechunk_store("data/ice_df").await; + let path = wrapper.get_store_path(); + let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await .unwrap(); let config = RepositoryConfig::default(); @@ -18,15 +24,8 @@ async fn create_icechunk_table_provider() -> icechunk::session::Session { .await .unwrap(); let version_info = VersionInfo::BranchTipRef("main".to_string()); - repo.readonly_session(&version_info).await.unwrap() -} - -#[tokio::test] -async fn test_datafusion() { - let ctx = SessionContext::new(); + let icechunk_session = repo.readonly_session(&version_info).await.unwrap(); - // Add icechunk session - let icechunk_session = create_icechunk_table_provider().await; let table_provider = Arc::new( ZarrTableProvider::new_icechunk(icechunk_session, Handle::current(), "/meta") .await diff --git a/src/testing/iter_group.rs b/src/testing/iter_group.rs index b71a61b..63eab42 100644 --- a/src/testing/iter_group.rs +++ b/src/testing/iter_group.rs @@ -1,3 +1,4 @@ +use crate::testing::utils::get_local_icechunk_store; use crate::testing::utils::get_local_zarr_store; use icechunk::{Repository, RepositoryConfig, repository::VersionInfo}; use std::collections::HashMap; @@ -21,9 +22,12 @@ async fn test_load_group() { #[tokio::test] async fn test_load_group_icechunk() { - let storage = icechunk::new_local_filesystem_storage(Path::new("data/icechunk")) + let wrapper = get_local_icechunk_store("data/ice_group").await; + let path = wrapper.get_store_path(); + let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await .unwrap(); + let config = RepositoryConfig::default(); let repo = Repository::open(Some(config), storage, HashMap::new()) .await diff --git a/src/testing/load_into_arrow.rs b/src/testing/load_into_arrow.rs index 2131f1c..f90c6a8 100644 --- a/src/testing/load_into_arrow.rs +++ b/src/testing/load_into_arrow.rs @@ -1,3 +1,4 @@ +use crate::testing::utils::get_local_icechunk_store; use crate::testing::utils::get_local_zarr_store; use arrow_array::{ArrayRef, RecordBatch, StringArray, TimestampMillisecondArray}; use arrow_schema::{DataType, Field, Schema, TimeUnit}; @@ -112,7 +113,9 @@ async fn test_load_zarrs_into_arrow_record_batch() { #[tokio::test] async fn test_load_zarrs_into_arrow_record_batch_icechunk() { - let storage = icechunk::new_local_filesystem_storage(Path::new("data/icechunk")) + let wrapper = get_local_icechunk_store("data/ice_arrow").await; + let path = wrapper.get_store_path(); + let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await .unwrap(); let config = RepositoryConfig::default(); diff --git a/src/testing/load_zarrs.rs b/src/testing/load_zarrs.rs index 0f75bed..f4cffd7 100644 --- a/src/testing/load_zarrs.rs +++ b/src/testing/load_zarrs.rs @@ -1,3 +1,4 @@ +use crate::testing::utils::get_local_icechunk_store; use crate::testing::utils::get_local_zarr_store; use icechunk::{Repository, RepositoryConfig, repository::VersionInfo}; use std::collections::HashMap; @@ -48,7 +49,9 @@ async fn test_load_collection_array() { #[tokio::test] async fn test_load_collection_array_icechunk() { - let storage = icechunk::new_local_filesystem_storage(Path::new("data/icechunk")) + let wrapper = get_local_icechunk_store("data/ice_array").await; + let path = wrapper.get_store_path(); + let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await .unwrap(); let config = RepositoryConfig::default(); @@ -94,42 +97,42 @@ async fn test_load_date_array() { let wrapper = get_local_zarr_store("data/date.zarr").await; let path = wrapper.get_store_path(); - { - let store = Arc::new(FilesystemStore::new(path).unwrap()); + let store = Arc::new(FilesystemStore::new(path).unwrap()); - // Open the date array from the /meta/date path - let date_array = Array::open(store, "/meta/date").unwrap(); - - // Print array metadata - println!("Array shape: {:?}", date_array.shape()); - println!("Data type: {:?}", date_array.data_type()); + // Open the date array from the /meta/date path + let date_array = Array::open(store, "/meta/date").unwrap(); - // Create array subset for the entire array (shape is [3]) - let array_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); + // Print array metadata + println!("Array shape: {:?}", date_array.shape()); + println!("Data type: {:?}", date_array.data_type()); - // Read the entire array as i64 milliseconds (datetime64[ms]) - let data: Vec = date_array - .retrieve_array_subset_elements(&array_subset) - .unwrap(); + // Create array subset for the entire array (shape is [3]) + let array_subset = ArraySubset::new_with_shape(date_array.shape().to_vec()); - println!("Date array contents (milliseconds since epoch):"); - for (i, ms) in data.iter().enumerate() { - println!(" [{}]: {} ms", i, ms); - } + // Read the entire array as i64 milliseconds (datetime64[ms]) + let data: Vec = date_array + .retrieve_array_subset_elements(&array_subset) + .unwrap(); - // Basic assertions - assert!(!data.is_empty(), "Date array should not be empty"); - assert_eq!( - date_array.shape(), - &[3], - "Date array should have 3 elements" - ); + println!("Date array contents (milliseconds since epoch):"); + for (i, ms) in data.iter().enumerate() { + println!(" [{}]: {} ms", i, ms); } + + // Basic assertions + assert!(!data.is_empty(), "Date array should not be empty"); + assert_eq!( + date_array.shape(), + &[3], + "Date array should have 3 elements" + ); } #[tokio::test] async fn test_load_date_array_icechunk() { - let storage = icechunk::new_local_filesystem_storage(Path::new("data/icechunk")) + let wrapper = get_local_icechunk_store("data/ice_date_array").await; + let path = wrapper.get_store_path(); + let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await .unwrap(); let config = RepositoryConfig::default(); diff --git a/src/testing/utils.rs b/src/testing/utils.rs index 26b87a3..055d956 100644 --- a/src/testing/utils.rs +++ b/src/testing/utils.rs @@ -1,16 +1,32 @@ use futures::executor::block_on; +use icechunk::{ObjectStorage, Repository}; use object_store::local::LocalFileSystem; +use std::collections::HashMap; use std::fs; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::Arc; use zarrs::array::{ArrayBuilder, DataType, FillValue}; use zarrs::array_subset::ArraySubset; use zarrs::metadata_ext::data_type::NumpyTimeUnit; +use zarrs_icechunk::AsyncIcechunkStore; use zarrs_object_store::AsyncObjectStore; use zarrs_storage::{ AsyncReadableWritableListableStorageTraits, AsyncWritableStorageTraits, StorePrefix, }; +/// Helper function to cleanup a store and remove its directory +#[cfg(test)] +fn cleanup_store_and_directory(store: &dyn AsyncWritableStorageTraits, path: &Path) { + // First, clear all data through the store interface + let prefix = StorePrefix::new("").unwrap(); + let _ = block_on(store.erase_prefix(&prefix)); + + // Then recursively remove the directory and all its contents + if path.exists() { + let _ = fs::remove_dir_all(path); + } +} + pub(crate) struct LocalZarrStoreWrapper { store: Arc>, path: PathBuf, @@ -52,15 +68,53 @@ impl LocalZarrStoreWrapper { // Include drop to remove store when it goes out of test scope impl Drop for LocalZarrStoreWrapper { fn drop(&mut self) { - // First, clear all data through the store interface - let prefix = StorePrefix::new("").unwrap(); - let _ = block_on(self.store.erase_prefix(&prefix)); + cleanup_store_and_directory(self.store.as_ref(), &self.path); + } +} - // Then recursively remove the directory and all its contents - if self.path.exists() { - let _ = fs::remove_dir_all(&self.path); +pub(crate) struct LocalIcechunkStoreWrapper { + store: Arc, + path: PathBuf, +} + +// Note that this wrapper should use unique store names to avoid collisons with +// running concurrent binary test execution. +#[cfg(test)] +impl LocalIcechunkStoreWrapper { + pub(crate) async fn new(store_name: String) -> Self { + if store_name.is_empty() { + panic!("name for test icechunk repo cannot be empty!") + } + let p = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(store_name); + fs::create_dir(p.clone()).unwrap(); + let repo = Repository::create( + None, + Arc::new(ObjectStorage::new_local_filesystem(&p).await.unwrap()), + HashMap::new(), + ) + .await + .unwrap(); + let session = repo.writable_session("main").await.unwrap(); + Self { + store: Arc::new(AsyncIcechunkStore::new(session)), + path: p, } } + + pub(crate) fn get_store(&self) -> Arc { + self.store.clone() + } + + pub(crate) fn get_store_path(&self) -> String { + self.path.to_str().unwrap().into() + } +} + +// Include drop to remove store when it goes out of test scope +impl Drop for LocalIcechunkStoreWrapper { + fn drop(&mut self) { + cleanup_store_and_directory(self.store.as_ref(), &self.path); + } } /// Creates three arrays in /meta group: @@ -169,3 +223,19 @@ pub(crate) async fn get_local_zarr_store(dir_name: &str) -> LocalZarrStoreWrappe .expect("Failed to generate test data arrays"); wrapper } + +pub(crate) async fn get_local_icechunk_store(dir_name: &str) -> LocalIcechunkStoreWrapper { + let wrapper = LocalIcechunkStoreWrapper::new(dir_name.into()).await; + let store = wrapper.get_store(); + generate_test_data_arrays(store.clone()) + .await + .expect("Failed to generate test data arrays"); + let _ = store + .session() + .write() + .await + .commit("test data", None) + .await + .unwrap(); + wrapper +} From abefc9351f6e82cb309efe59c9737acc79cf81ba Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 25 Mar 2026 10:56:04 -0400 Subject: [PATCH 3/6] Move Python test zarr generation to pytest fixtures. --- python/pyproject.toml | 21 +++++ .../python/zarr_datafusion_search/__init__.py | 3 +- python/tests/conftest.py | 51 +++++++++++ python/tests/fixtures.py | 89 +++++++++++++++++++ python/tests/test_datafusion.py | 31 +++---- scripts/generate_data.py | 37 -------- scripts/generate_icechunk.py | 41 --------- 7 files changed, 178 insertions(+), 95 deletions(-) create mode 100644 python/tests/conftest.py create mode 100644 python/tests/fixtures.py delete mode 100644 scripts/generate_data.py delete mode 100644 scripts/generate_icechunk.py diff --git a/python/pyproject.toml b/python/pyproject.toml index 0e81389..b1fb1f1 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -31,3 +31,24 @@ dev-dependencies = [ "pytest", "shapely", ] + + +[tool.ruff] +line-length = 88 +target-version = "py311" + +[tool.ruff.lint] +select = ["E", "F", "I"] +per-file-ignores = {} + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + +[tool.pytest.ini_options] +# Directories to search for tests +testpaths = [ + "tests", +] diff --git a/python/python/zarr_datafusion_search/__init__.py b/python/python/zarr_datafusion_search/__init__.py index df4465a..e542958 100644 --- a/python/python/zarr_datafusion_search/__init__.py +++ b/python/python/zarr_datafusion_search/__init__.py @@ -1,7 +1,6 @@ from __future__ import annotations -from ._rust import ZarrTable -from ._rust import ___version +from ._rust import ZarrTable, ___version __version__: str = ___version() diff --git a/python/tests/conftest.py b/python/tests/conftest.py new file mode 100644 index 0000000..86793d7 --- /dev/null +++ b/python/tests/conftest.py @@ -0,0 +1,51 @@ +"""Pytest fixtures for zarr-datafusion tests.""" +import pytest + +from .fixtures import create_test_icechunk_data, create_test_zarr_data + + +@pytest.fixture +def zarr_store(tmp_path): + """Create a temporary zarr store with test data. + + Yields the path to the zarr store, which is automatically cleaned up after the test. + """ + store_path = tmp_path / "zarr_store.zarr" + create_test_zarr_data(store_path) + yield store_path + + +@pytest.fixture +def icechunk_store(tmp_path): + """Create a temporary icechunk store with test data. + + Yields the path to the icechunk store, which is automatically cleaned up + after the test. + """ + store_path = tmp_path / "icechunk" + create_test_icechunk_data(store_path) + yield store_path + + +@pytest.fixture(scope="session") +def session_zarr_store(tmp_path_factory): + """Create a session-scoped zarr store with test data. + + This fixture is created once per test session and shared across all tests. + Useful for read-only tests that don't modify the data. + """ + store_path = tmp_path_factory.mktemp("data") / "zarr_store.zarr" + create_test_zarr_data(store_path) + yield store_path + + +@pytest.fixture(scope="session") +def session_icechunk_store(tmp_path_factory): + """Create a session-scoped icechunk store with test data. + + This fixture is created once per test session and shared across all tests. + Useful for read-only tests that don't modify the data. + """ + store_path = tmp_path_factory.mktemp("data") / "icechunk" + create_test_icechunk_data(store_path) + yield store_path diff --git a/python/tests/fixtures.py b/python/tests/fixtures.py new file mode 100644 index 0000000..2952697 --- /dev/null +++ b/python/tests/fixtures.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import icechunk +import numpy as np +import shapely +import zarr +from zarr.core.group import Group +from zarr.dtype import VariableLengthBytes, VariableLengthUTF8 + + +def generate_test_data(root: Group) -> None: + """Create test zarr data + + Adds to a zarr store: + - a /meta group which contains + - date: datetime64[ms] array with dates [2023-01-01, 2023-01-02, + 2023-01-03] + - collection: variable length UTF8 array with ["collection_a", + "collection_b", "collection_c"] + - bbox: variable length bytes (WKB) array with boxes at (-10,-10,10,10), + (-20,-20,20,20), (-30,-30,30,30) + + Args: + root: The root Group in the target Zarr or Icechunk store. + """ + meta = root.create_group("meta") + + # Create date array + date_data = np.array( + ["2023-01-01", "2023-01-02", "2023-01-03"], dtype="datetime64[ms]" + ) + meta.create_array("date", data=date_data) + + # Create collection array + collection_data = ["collection_a", "collection_b", "collection_c"] + collection_array = meta.create_array( + "collection", + shape=(len(collection_data),), + dtype=VariableLengthUTF8(), + ) + collection_array[:] = collection_data + + # Create bbox array with WKB-encoded geometries + bbox_data = shapely.to_wkb( + [ + shapely.box(-10.0, -10.0, 10.0, 10.0), + shapely.box(-20.0, -20.0, 20.0, 20.0), + shapely.box(-30.0, -30.0, 30.0, 30.0), + ] + ) + + bbox_array = meta.create_array( + "bbox", + shape=(len(bbox_data),), + dtype=VariableLengthBytes(), + ) + bbox_array[:] = bbox_data + + +def create_test_zarr_data(store_path: Path | str) -> None: + """Create test store data at the specified path. + + Args: + store_path: Path where the Zarr store will be created. Can be string + or Path object. + + """ + store_path = Path(store_path) if isinstance(store_path, str) else store_path + root = zarr.open_group(store_path, mode="w", zarr_format=3) + generate_test_data(root=root) + + +def create_test_icechunk_data(store_path: Path | str) -> None: + """Create test icechunk store at the specified path. + + Args: + store_path: Path where the Icechunk repository will be created. + Can be string or Path object. + + """ + store_path = Path(store_path) if isinstance(store_path, str) else store_path + + storage = icechunk.local_filesystem_storage(str(store_path)) + repo = icechunk.Repository.create(storage) + session = repo.writable_session("main") + + root = zarr.open_group(session.store, mode="w", zarr_format=3) + generate_test_data(root=root) + session.commit("Initial test data") diff --git a/python/tests/test_datafusion.py b/python/tests/test_datafusion.py index ce6cc37..fb3fdc1 100644 --- a/python/tests/test_datafusion.py +++ b/python/tests/test_datafusion.py @@ -1,5 +1,3 @@ -from pathlib import Path - import icechunk import pytest from datafusion import SessionContext @@ -7,13 +5,11 @@ from obstore.store import LocalStore from zarr_datafusion_search import ZarrTable -ROOT_DIR = Path(__file__).parent.parent.parent - -def test_zarr_scan(): +def test_zarr_scan(session_zarr_store): + """Test basic zarr scanning with DataFusion.""" ctx = SessionContext() - zarr_path = ROOT_DIR / "data" / "zarr_store.zarr" - zarr_table = ZarrTable(str(zarr_path), "/meta") + zarr_table = ZarrTable(str(session_zarr_store), "/meta") ctx.register_table("zarr_data", zarr_table) @@ -21,23 +17,27 @@ def test_zarr_scan(): df = ctx.sql(sql) df.show() -def test_spatial_functions_registered(): + +def test_spatial_functions_registered(session_zarr_store): + """Test that spatial functions work with zarr data.""" ctx = SessionContext() register_all(ctx) - zarr_path = ROOT_DIR / "data" / "zarr_store.zarr" - zarr_table = ZarrTable(str(zarr_path), "/meta") + zarr_table = ZarrTable(str(session_zarr_store), "/meta") ctx.register_table("zarr_data", zarr_table) sql = ( "SELECT collection FROM zarr_data " - "WHERE ST_Intersects(bbox, ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'))" + "WHERE ST_Intersects(bbox, " + "ST_GeomFromText('POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))'))" ) df = ctx.sql(sql) df.show() + @pytest.mark.asyncio -async def test_zarr_scan_from_obstore(): - store = LocalStore(ROOT_DIR / "data" / "zarr_store.zarr") +async def test_zarr_scan_from_obstore(session_zarr_store): + """Test zarr scanning from object store.""" + store = LocalStore(session_zarr_store) zarr_table = await ZarrTable.from_obstore(store, "/meta") ctx = SessionContext() @@ -51,8 +51,9 @@ async def test_zarr_scan_from_obstore(): @pytest.mark.asyncio -async def test_zarr_scan_from_icechunk(): - storage = icechunk.local_filesystem_storage(ROOT_DIR / "data" / "icechunk") +async def test_zarr_scan_from_icechunk(session_icechunk_store): + """Test zarr scanning from icechunk.""" + storage = icechunk.local_filesystem_storage(session_icechunk_store) repo = icechunk.Repository.open(storage) session = repo.readonly_session("main") diff --git a/scripts/generate_data.py b/scripts/generate_data.py deleted file mode 100644 index 3ce1a3c..0000000 --- a/scripts/generate_data.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -import shapely -import zarr -from zarr.dtype import VariableLengthBytes, VariableLengthUTF8 - - -# Root of the Zarr store -root = zarr.open_group("data/zarr_store.zarr", mode="w", zarr_format=3) - -meta = root.create_group("meta") - -date_data = np.array(["2023-01-01", "2023-01-02", "2023-01-03"], dtype="datetime64[ms]") -meta.create_array("date", data=date_data) - -collection_data = ["collection_a", "collection_b", "collection_c"] -collection_array = meta.create_array( - "collection", - shape=(len(collection_data),), - dtype=VariableLengthUTF8(), -) -collection_array[:] = collection_data - - -bbox_data = shapely.to_wkb( - [ - shapely.box(-10.0, -10.0, 10.0, 10.0), - shapely.box(-20.0, -20.0, 20.0, 20.0), - shapely.box(-30.0, -30.0, 30.0, 30.0), - ] -) - -bbox_array = meta.create_array( - "bbox", - shape=(len(bbox_data),), - dtype=VariableLengthBytes(), -) -bbox_array[:] = bbox_data diff --git a/scripts/generate_icechunk.py b/scripts/generate_icechunk.py deleted file mode 100644 index 5400df8..0000000 --- a/scripts/generate_icechunk.py +++ /dev/null @@ -1,41 +0,0 @@ -import icechunk -import zarr -import numpy as np -import shapely - -from zarr.dtype import VariableLengthBytes, VariableLengthUTF8 - -location = "data/icechunk" -storage = icechunk.local_filesystem_storage(location) -repo = icechunk.Repository.create(storage) -session = repo.writable_session("main") - -root = zarr.open_group(session.store, mode="w", zarr_format=3) -meta = root.create_group("meta") - -date_data = np.array(["2023-01-01", "2023-01-02", "2023-01-03"], dtype="datetime64[ms]") -meta.create_array("date", data=date_data) - -meta.create_array( - "collection", - shape=(3,), - dtype=VariableLengthUTF8(), -) -meta["collection"][...] = ["collection_a", "collection_b", "collection_c"] - -bbox_data = shapely.to_wkb( - [ - shapely.box(-10.0, -10.0, 10.0, 10.0), - shapely.box(-20.0, -20.0, 20.0, 20.0), - shapely.box(-30.0, -30.0, 30.0, 30.0), - ] -) - -meta.create_array( - "bbox", - shape=len(bbox_data), - dtype=VariableLengthBytes(), -) -meta["bbox"][...] = bbox_data - -session.commit("First") From a70f03caae968ba06e1dea908fa2eb4f87b1c073 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 25 Mar 2026 11:09:50 -0400 Subject: [PATCH 4/6] Fix clippy warnings. --- src/testing/utils.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/testing/utils.rs b/src/testing/utils.rs index 055d956..13e4807 100644 --- a/src/testing/utils.rs +++ b/src/testing/utils.rs @@ -135,11 +135,9 @@ pub(crate) async fn generate_test_data_arrays( let group = zarrs::group::GroupBuilder::new().build(store.clone(), "/meta")?; group.async_store_metadata().await?; - let dates = vec![ - NaiveDate::from_ymd_opt(2023, 1, 1).unwrap(), + let dates = [NaiveDate::from_ymd_opt(2023, 1, 1).unwrap(), NaiveDate::from_ymd_opt(2023, 1, 2).unwrap(), - NaiveDate::from_ymd_opt(2023, 1, 3).unwrap(), - ]; + NaiveDate::from_ymd_opt(2023, 1, 3).unwrap()]; // Convert to milliseconds since Unix epoch let date_data: Vec = dates From 88fe5e02ecfa27e460eb90dc4bce49d57cc050d4 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 25 Mar 2026 11:32:26 -0400 Subject: [PATCH 5/6] Fix ensuing cargo fmt error. --- src/testing/utils.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/testing/utils.rs b/src/testing/utils.rs index 13e4807..409bc41 100644 --- a/src/testing/utils.rs +++ b/src/testing/utils.rs @@ -135,9 +135,11 @@ pub(crate) async fn generate_test_data_arrays( let group = zarrs::group::GroupBuilder::new().build(store.clone(), "/meta")?; group.async_store_metadata().await?; - let dates = [NaiveDate::from_ymd_opt(2023, 1, 1).unwrap(), + let dates = [ + NaiveDate::from_ymd_opt(2023, 1, 1).unwrap(), NaiveDate::from_ymd_opt(2023, 1, 2).unwrap(), - NaiveDate::from_ymd_opt(2023, 1, 3).unwrap()]; + NaiveDate::from_ymd_opt(2023, 1, 3).unwrap(), + ]; // Convert to milliseconds since Unix epoch let date_data: Vec = dates From 6093cc04d2933928fd0d27edd70533b8a939a2d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 25 Mar 2026 12:11:03 -0400 Subject: [PATCH 6/6] Use TempDir for Zarr and Icechunk test stores. --- Cargo.lock | 1 + Cargo.toml | 1 + src/schema.rs | 2 +- src/table_provider.rs | 10 ++-- src/testing/datafusion.rs | 2 +- src/testing/iter_group.rs | 4 +- src/testing/load_into_arrow.rs | 4 +- src/testing/load_zarrs.rs | 10 ++-- src/testing/utils.rs | 85 +++++++++------------------------- 9 files changed, 39 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 965bce9..c411d1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6149,6 +6149,7 @@ dependencies = [ "geodatafusion", "icechunk", "object_store 0.12.5", + "tempfile", "thiserror 2.0.18", "tokio", "wkb", diff --git a/Cargo.toml b/Cargo.toml index 4573ea2..59e7cfe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,5 +27,6 @@ zarrs_storage = { version = "0.4.0", features = ["async"] } chrono = "0.4.44" geo = "0.31" geoarrow-array = "0.7.0" +tempfile = "3" tokio = { version = "1.48", features = ["macros", "rt-multi-thread"] } wkb = "0.9" diff --git a/src/schema.rs b/src/schema.rs index 9edb2b0..2197abb 100644 --- a/src/schema.rs +++ b/src/schema.rs @@ -136,7 +136,7 @@ mod tests { #[tokio::test] async fn test_schema_from_zarr_group() { - let wrapper = get_local_zarr_store("data/schema.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); { diff --git a/src/table_provider.rs b/src/table_provider.rs index dd7e7c1..42d01b8 100644 --- a/src/table_provider.rs +++ b/src/table_provider.rs @@ -465,7 +465,7 @@ mod tests { #[tokio::test] async fn test_basic_table_provider() { - let wrapper = get_local_zarr_store("data/basic_table_provider.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let provider = ZarrTableProvider::new_filesystem(path, "/meta").unwrap(); @@ -489,7 +489,7 @@ mod tests { #[tokio::test] #[ignore = "Projection support"] async fn test_table_provider_with_sql() { - let wrapper = get_local_zarr_store("data/provider_sql.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let provider = ZarrTableProvider::new_filesystem(path, "/meta").unwrap(); @@ -527,7 +527,7 @@ mod tests { async fn test_st_intersects_selects_matching_record() { use arrow_array::Array; - let wrapper = get_local_zarr_store("data/intersects_matching.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let ctx = SessionContext::new(); @@ -581,7 +581,7 @@ mod tests { /// Test that ST_Intersects correctly returns no records when query geometry doesn't intersect. #[tokio::test] async fn test_st_intersects_no_match() { - let wrapper = get_local_zarr_store("data/intersects_no_path.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let ctx = SessionContext::new(); @@ -613,7 +613,7 @@ mod tests { async fn test_st_intersects_multiple_matches() { use arrow_array::Array; - let wrapper = get_local_zarr_store("data/intersects_multiple_match.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let ctx = SessionContext::new(); diff --git a/src/testing/datafusion.rs b/src/testing/datafusion.rs index ab07f8b..938357e 100644 --- a/src/testing/datafusion.rs +++ b/src/testing/datafusion.rs @@ -14,7 +14,7 @@ use crate::testing::utils::get_local_icechunk_store; async fn test_datafusion() { let ctx = SessionContext::new(); - let wrapper = get_local_icechunk_store("data/ice_df").await; + let wrapper = get_local_icechunk_store().await; let path = wrapper.get_store_path(); let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await diff --git a/src/testing/iter_group.rs b/src/testing/iter_group.rs index 63eab42..16fcf1a 100644 --- a/src/testing/iter_group.rs +++ b/src/testing/iter_group.rs @@ -10,7 +10,7 @@ use zarrs_icechunk::AsyncIcechunkStore; #[tokio::test] async fn test_load_group() { - let wrapper = get_local_zarr_store("data/load_group.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); { @@ -22,7 +22,7 @@ async fn test_load_group() { #[tokio::test] async fn test_load_group_icechunk() { - let wrapper = get_local_icechunk_store("data/ice_group").await; + let wrapper = get_local_icechunk_store().await; let path = wrapper.get_store_path(); let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await diff --git a/src/testing/load_into_arrow.rs b/src/testing/load_into_arrow.rs index f90c6a8..c8e0a69 100644 --- a/src/testing/load_into_arrow.rs +++ b/src/testing/load_into_arrow.rs @@ -16,7 +16,7 @@ use zarrs_icechunk::AsyncIcechunkStore; #[tokio::test] async fn test_load_zarrs_into_arrow_record_batch() { - let wrapper = get_local_zarr_store("data/arrow_batch.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); { @@ -113,7 +113,7 @@ async fn test_load_zarrs_into_arrow_record_batch() { #[tokio::test] async fn test_load_zarrs_into_arrow_record_batch_icechunk() { - let wrapper = get_local_icechunk_store("data/ice_arrow").await; + let wrapper = get_local_icechunk_store().await; let path = wrapper.get_store_path(); let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await diff --git a/src/testing/load_zarrs.rs b/src/testing/load_zarrs.rs index f4cffd7..5309f14 100644 --- a/src/testing/load_zarrs.rs +++ b/src/testing/load_zarrs.rs @@ -11,7 +11,7 @@ use zarrs_icechunk::AsyncIcechunkStore; #[tokio::test] async fn test_load_collection_array() { - let wrapper = get_local_zarr_store("data/collection.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); { @@ -49,7 +49,7 @@ async fn test_load_collection_array() { #[tokio::test] async fn test_load_collection_array_icechunk() { - let wrapper = get_local_icechunk_store("data/ice_array").await; + let wrapper = get_local_icechunk_store().await; let path = wrapper.get_store_path(); let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await @@ -94,7 +94,7 @@ async fn test_load_collection_array_icechunk() { #[tokio::test] async fn test_load_date_array() { - let wrapper = get_local_zarr_store("data/date.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); let store = Arc::new(FilesystemStore::new(path).unwrap()); @@ -130,7 +130,7 @@ async fn test_load_date_array() { #[tokio::test] async fn test_load_date_array_icechunk() { - let wrapper = get_local_icechunk_store("data/ice_date_array").await; + let wrapper = get_local_icechunk_store().await; let path = wrapper.get_store_path(); let storage = icechunk::new_local_filesystem_storage(Path::new(&path)) .await @@ -175,7 +175,7 @@ async fn test_load_date_array_icechunk() { #[tokio::test] async fn test_load_bbox_array() { - let wrapper = get_local_zarr_store("data/bbox.zarr").await; + let wrapper = get_local_zarr_store().await; let path = wrapper.get_store_path(); { diff --git a/src/testing/utils.rs b/src/testing/utils.rs index 409bc41..b2122ea 100644 --- a/src/testing/utils.rs +++ b/src/testing/utils.rs @@ -1,58 +1,32 @@ -use futures::executor::block_on; use icechunk::{ObjectStorage, Repository}; use object_store::local::LocalFileSystem; use std::collections::HashMap; -use std::fs; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::sync::Arc; +use tempfile::TempDir; use zarrs::array::{ArrayBuilder, DataType, FillValue}; use zarrs::array_subset::ArraySubset; use zarrs::metadata_ext::data_type::NumpyTimeUnit; use zarrs_icechunk::AsyncIcechunkStore; use zarrs_object_store::AsyncObjectStore; -use zarrs_storage::{ - AsyncReadableWritableListableStorageTraits, AsyncWritableStorageTraits, StorePrefix, -}; - -/// Helper function to cleanup a store and remove its directory -#[cfg(test)] -fn cleanup_store_and_directory(store: &dyn AsyncWritableStorageTraits, path: &Path) { - // First, clear all data through the store interface - let prefix = StorePrefix::new("").unwrap(); - let _ = block_on(store.erase_prefix(&prefix)); - - // Then recursively remove the directory and all its contents - if path.exists() { - let _ = fs::remove_dir_all(path); - } -} +use zarrs_storage::AsyncReadableWritableListableStorageTraits; pub(crate) struct LocalZarrStoreWrapper { + _temp_dir: TempDir, store: Arc>, path: PathBuf, } -// Note that this wrapper should use unique store names to avoid collisons with -// running concurrent binary test execution. #[cfg(test)] impl LocalZarrStoreWrapper { - pub(crate) fn new(store_name: String) -> Self { - if store_name.is_empty() { - panic!("name for test zarr store cannot be empty!") - } - - let p = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(store_name); - - // Clean up any existing directory first (from failed tests or parallel execution) - if p.exists() { - let _ = fs::remove_dir_all(&p); - } - - fs::create_dir(p.clone()).unwrap(); - let store = AsyncObjectStore::new(LocalFileSystem::new_with_prefix(p.clone()).unwrap()); + pub(crate) fn new() -> Self { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().to_path_buf(); + let store = AsyncObjectStore::new(LocalFileSystem::new_with_prefix(&path).unwrap()); Self { + _temp_dir: temp_dir, store: Arc::new(store), - path: p, + path, } } @@ -65,39 +39,29 @@ impl LocalZarrStoreWrapper { } } -// Include drop to remove store when it goes out of test scope -impl Drop for LocalZarrStoreWrapper { - fn drop(&mut self) { - cleanup_store_and_directory(self.store.as_ref(), &self.path); - } -} - pub(crate) struct LocalIcechunkStoreWrapper { + _temp_dir: TempDir, store: Arc, path: PathBuf, } -// Note that this wrapper should use unique store names to avoid collisons with -// running concurrent binary test execution. #[cfg(test)] impl LocalIcechunkStoreWrapper { - pub(crate) async fn new(store_name: String) -> Self { - if store_name.is_empty() { - panic!("name for test icechunk repo cannot be empty!") - } - let p = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(store_name); - fs::create_dir(p.clone()).unwrap(); + pub(crate) async fn new() -> Self { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().to_path_buf(); let repo = Repository::create( None, - Arc::new(ObjectStorage::new_local_filesystem(&p).await.unwrap()), + Arc::new(ObjectStorage::new_local_filesystem(&path).await.unwrap()), HashMap::new(), ) .await .unwrap(); let session = repo.writable_session("main").await.unwrap(); Self { + _temp_dir: temp_dir, store: Arc::new(AsyncIcechunkStore::new(session)), - path: p, + path, } } @@ -110,13 +74,6 @@ impl LocalIcechunkStoreWrapper { } } -// Include drop to remove store when it goes out of test scope -impl Drop for LocalIcechunkStoreWrapper { - fn drop(&mut self) { - cleanup_store_and_directory(self.store.as_ref(), &self.path); - } -} - /// Creates three arrays in /meta group: /// - date: datetime64[ms] array with dates [2023-01-01, 2023-01-02, 2023-01-03] /// - collection: variable length UTF8 array with ["collection_a", "collection_b", "collection_c"] @@ -215,8 +172,8 @@ pub(crate) async fn generate_test_data_arrays( Ok(()) } -pub(crate) async fn get_local_zarr_store(dir_name: &str) -> LocalZarrStoreWrapper { - let wrapper = LocalZarrStoreWrapper::new(dir_name.into()); +pub(crate) async fn get_local_zarr_store() -> LocalZarrStoreWrapper { + let wrapper = LocalZarrStoreWrapper::new(); let store = wrapper.get_store(); generate_test_data_arrays(store) .await @@ -224,8 +181,8 @@ pub(crate) async fn get_local_zarr_store(dir_name: &str) -> LocalZarrStoreWrappe wrapper } -pub(crate) async fn get_local_icechunk_store(dir_name: &str) -> LocalIcechunkStoreWrapper { - let wrapper = LocalIcechunkStoreWrapper::new(dir_name.into()).await; +pub(crate) async fn get_local_icechunk_store() -> LocalIcechunkStoreWrapper { + let wrapper = LocalIcechunkStoreWrapper::new().await; let store = wrapper.get_store(); generate_test_data_arrays(store.clone()) .await