Skip to content

Commit 5506e69

Browse files
authored
Revert arrow upgrade and related changes (#50)
* Revert "Upgrade arrow/parquet to 56.0.0 (apache#16690)"
  This reverts commit fa1f8c1.
* Revert "refactor: use upstream inline_key_fast (apache#17044)"
  This reverts commit 71b92bc.
* Revert "fix: respect inexact flags in row group metadata (apache#16412)"
  This reverts commit afc90f7.
* Revert "Test grouping by FixedSizeList (apache#17415)"
  This reverts commit 03f39e5.
* Spelling (got reverted)
* Also allow Byt from tests
* Adjust sqllogictests
1 parent 0be6cd9 commit 5506e69

37 files changed

Lines changed: 635 additions & 970 deletions

File tree

Cargo.lock

Lines changed: 104 additions & 62 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [
9090
"runtime-rng",
9191
] }
9292
apache-avro = { version = "0.20", default-features = false }
93-
arrow = { version = "56.0.0", features = [
93+
arrow = { version = "55.2.0", features = [
9494
"prettyprint",
9595
"chrono-tz",
9696
] }
97-
arrow-buffer = { version = "56.0.0", default-features = false }
98-
arrow-flight = { version = "56.0.0", features = [
97+
arrow-buffer = { version = "55.2.0", default-features = false }
98+
arrow-flight = { version = "55.2.0", features = [
9999
"flight-sql-experimental",
100100
] }
101-
arrow-ipc = { version = "56.0.0", default-features = false, features = [
101+
arrow-ipc = { version = "55.2.0", default-features = false, features = [
102102
"lz4",
103103
] }
104-
arrow-ord = { version = "56.0.0", default-features = false }
105-
arrow-schema = { version = "56.0.0", default-features = false }
106-
async-trait = "0.1.89"
104+
arrow-ord = { version = "55.2.0", default-features = false }
105+
arrow-schema = { version = "55.2.0", default-features = false }
106+
async-trait = "0.1.88"
107107
bigdecimal = "0.4.8"
108108
bytes = "1.10"
109109
chrono = { version = "0.4.41", default-features = false }
@@ -157,7 +157,7 @@ itertools = "0.14"
157157
log = "^0.4"
158158
object_store = { version = "0.12.3", default-features = false }
159159
parking_lot = "0.12"
160-
parquet = { version = "56.0.0", default-features = false, features = [
160+
parquet = { version = "55.2.0", default-features = false, features = [
161161
"arrow",
162162
"async",
163163
"object_store",

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ serde_json = { workspace = true }
8181
tempfile = { workspace = true }
8282
test-utils = { path = "../test-utils" }
8383
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
84-
tonic = "0.13.1"
84+
tonic = "0.12.1"
8585
tracing = { version = "0.1" }
8686
tracing-subscriber = { version = "0.3" }
8787
url = { workspace = true }

datafusion-testing

Submodule datafusion-testing updated 84 files

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ log = { workspace = true }
7171
object_store = { workspace = true, optional = true }
7272
parquet = { workspace = true, optional = true, default-features = true }
7373
paste = "1.0.15"
74-
pyo3 = { version = "0.25", optional = true }
74+
pyo3 = { version = "0.24.2", optional = true }
7575
recursive = { workspace = true, optional = true }
7676
sqlparser = { workspace = true }
7777
tokio = { workspace = true }

datafusion/common/src/config.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -602,6 +602,13 @@ config_namespace! {
602602
/// default parquet writer setting
603603
pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())
604604

605+
/// (writing) Sets max statistics size for any column. If NULL, uses
606+
/// default parquet writer setting
607+
/// max_statistics_size is deprecated, currently it is not being used
608+
// TODO: remove once deprecated
609+
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
610+
pub max_statistics_size: Option<usize>, default = Some(4096)
611+
605612
/// (writing) Target maximum number of rows in each row group (defaults to 1M
606613
/// rows). Writing larger row groups requires more memory to write, but
607614
/// can get better compression and be faster to read.
@@ -615,7 +622,7 @@ config_namespace! {
615622

616623
/// (writing) Sets statistics truncate length. If NULL, uses
617624
/// default parquet writer setting
618-
pub statistics_truncate_length: Option<usize>, default = Some(64)
625+
pub statistics_truncate_length: Option<usize>, default = None
619626

620627
/// (writing) Sets best effort maximum number of rows in data page
621628
pub data_page_row_count_limit: usize, default = 20_000
@@ -2134,6 +2141,13 @@ config_namespace_with_hashmap! {
21342141
/// Sets bloom filter number of distinct values. If NULL, uses
21352142
/// default parquet options
21362143
pub bloom_filter_ndv: Option<u64>, default = None
2144+
2145+
/// Sets max statistics size for the column path. If NULL, uses
2146+
/// default parquet options
2147+
/// max_statistics_size is deprecated, currently it is not being used
2148+
// TODO: remove once deprecated
2149+
#[deprecated(since = "45.0.0", note = "Setting does not do anything")]
2150+
pub max_statistics_size: Option<usize>, default = None
21372151
}
21382152
}
21392153

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use parquet::{
3535
metadata::KeyValue,
3636
properties::{
3737
EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion,
38-
DEFAULT_STATISTICS_ENABLED,
38+
DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED,
3939
},
4040
},
4141
schema::types::ColumnPath,
@@ -160,6 +160,16 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
160160
builder =
161161
builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv);
162162
}
163+
164+
// max_statistics_size is deprecated, currently it is not being used
165+
// TODO: remove once deprecated
166+
#[allow(deprecated)]
167+
if let Some(max_statistics_size) = options.max_statistics_size {
168+
builder = {
169+
#[allow(deprecated)]
170+
builder.set_column_max_statistics_size(path, max_statistics_size)
171+
}
172+
}
163173
}
164174

165175
Ok(builder)
@@ -208,6 +218,7 @@ impl ParquetOptions {
208218
dictionary_enabled,
209219
dictionary_page_size_limit,
210220
statistics_enabled,
221+
max_statistics_size,
211222
max_row_group_size,
212223
created_by,
213224
column_index_truncate_length,
@@ -253,6 +264,13 @@ impl ParquetOptions {
253264
.set_data_page_row_count_limit(*data_page_row_count_limit)
254265
.set_bloom_filter_enabled(*bloom_filter_on_write);
255266

267+
builder = {
268+
#[allow(deprecated)]
269+
builder.set_max_statistics_size(
270+
max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE),
271+
)
272+
};
273+
256274
if let Some(bloom_filter_fpp) = bloom_filter_fpp {
257275
builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp);
258276
};
@@ -445,10 +463,12 @@ mod tests {
445463
fn column_options_with_non_defaults(
446464
src_col_defaults: &ParquetOptions,
447465
) -> ParquetColumnOptions {
466+
#[allow(deprecated)] // max_statistics_size
448467
ParquetColumnOptions {
449468
compression: Some("zstd(22)".into()),
450469
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
451470
statistics_enabled: Some("none".into()),
471+
max_statistics_size: Some(72),
452472
encoding: Some("RLE".into()),
453473
bloom_filter_enabled: Some(true),
454474
bloom_filter_fpp: Some(0.72),
@@ -473,6 +493,7 @@ mod tests {
473493
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)),
474494
dictionary_page_size_limit: 42,
475495
statistics_enabled: Some("chunk".into()),
496+
max_statistics_size: Some(42),
476497
max_row_group_size: 42,
477498
created_by: "wordy".into(),
478499
column_index_truncate_length: Some(42),
@@ -530,6 +551,7 @@ mod tests {
530551
),
531552
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
532553
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
554+
max_statistics_size: Some(props.max_statistics_size(&col)),
533555
}
534556
}
535557

@@ -586,6 +608,7 @@ mod tests {
586608
compression: default_col_props.compression,
587609
dictionary_enabled: default_col_props.dictionary_enabled,
588610
statistics_enabled: default_col_props.statistics_enabled,
611+
max_statistics_size: default_col_props.max_statistics_size,
589612
bloom_filter_on_write: default_col_props
590613
.bloom_filter_enabled
591614
.unwrap_or_default(),

datafusion/common/src/scalar/mod.rs

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -904,10 +904,11 @@ pub fn dict_from_values<K: ArrowDictionaryKeyType>(
904904
.map(|index| {
905905
if values_array.is_valid(index) {
906906
let native_index = K::Native::from_usize(index).ok_or_else(|| {
907-
_internal_datafusion_err!(
908-
"Can not create index of type {} from value {index}",
909-
K::DATA_TYPE
910-
)
907+
DataFusionError::Internal(format!(
908+
"Can not create index of type {} from value {}",
909+
K::DATA_TYPE,
910+
index
911+
))
911912
})?;
912913
Ok(Some(native_index))
913914
} else {
@@ -2202,16 +2203,6 @@ impl ScalarValue {
22022203
}
22032204

22042205
let array: ArrayRef = match &data_type {
2205-
DataType::Decimal32(_precision, _scale) => {
2206-
return _not_impl_err!(
2207-
"Decimal32 not supported in ScalarValue::iter_to_array"
2208-
);
2209-
}
2210-
DataType::Decimal64(_precision, _scale) => {
2211-
return _not_impl_err!(
2212-
"Decimal64 not supported in ScalarValue::iter_to_array"
2213-
);
2214-
}
22152206
DataType::Decimal128(precision, scale) => {
22162207
let decimal_array =
22172208
ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?;

datafusion/common/src/types/native.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -407,10 +407,7 @@ impl From<DataType> for NativeType {
407407
DataType::Union(union_fields, _) => {
408408
Union(LogicalUnionFields::from(&union_fields))
409409
}
410-
DataType::Decimal32(p, s)
411-
| DataType::Decimal64(p, s)
412-
| DataType::Decimal128(p, s)
413-
| DataType::Decimal256(p, s) => Decimal(p, s),
410+
DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
414411
DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
415412
DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
416413
DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),

datafusion/core/src/datasource/file_format/parquet.rs

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -523,23 +523,11 @@ mod tests {
523523
let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?;
524524
let c_dic: ArrayRef = Arc::new(dic_array);
525525

526-
// Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null]
527-
let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![
528-
Some("a".repeat(128)),
529-
None,
530-
Some("b".repeat(128)),
531-
None,
532-
]));
533-
534-
let batch1 = RecordBatch::try_from_iter(vec![
535-
("c_dic", c_dic),
536-
("string_truncation", string_truncation),
537-
])?;
526+
let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?;
538527

539528
// Use store_parquet to write each batch to its own file
540529
// . batch1 written into first file and includes:
541530
// - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available.
542-
// - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact.
543531
let store = Arc::new(RequestCountingObjectStore::new(Arc::new(
544532
LocalFileSystem::new(),
545533
)));
@@ -575,19 +563,6 @@ mod tests {
575563
Precision::Exact(Utf8(Some("a".into())))
576564
);
577565

578-
// column string_truncation
579-
let string_truncation_stats = &stats.column_statistics[1];
580-
581-
assert_eq!(string_truncation_stats.null_count, Precision::Exact(2));
582-
assert_eq!(
583-
string_truncation_stats.max_value,
584-
Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c")))
585-
);
586-
assert_eq!(
587-
string_truncation_stats.min_value,
588-
Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64))))
589-
);
590-
591566
Ok(())
592567
}
593568

0 commit comments

Comments
 (0)