pyiceberg/io/pyarrow.py (2 additions, 2 deletions)

@@ -2397,8 +2397,8 @@ def data_file_statistics_from_parquet_metadata(
     split_offsets.sort()
 
     for field_id in invalidate_col:
-        del col_aggs[field_id]
-        del null_value_counts[field_id]
+        col_aggs.pop(field_id, None)
+        null_value_counts.pop(field_id, None)
 
     return DataFileStatistics(
         record_count=parquet_metadata.num_rows,
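Why the change matters: `del` raises `KeyError` for a missing key, while `dict.pop` with a default returns the default and moves on. When a Parquet file is written without column statistics (the scenario the new test below constructs), `invalidate_col` can name field ids that never received entries in `col_aggs` or `null_value_counts`, so the old code raised. A minimal sketch of the difference in plain Python:

# Suppose no statistics were collected, so the aggregate dict stayed empty.
col_aggs = {}

col_aggs.pop(1, None)  # no-op: the missing key falls back to the default None

try:
    del col_aggs[1]    # the old code path
except KeyError:
    print("del raises on a missing field id")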
tests/io/test_pyarrow_stats.py (67 additions, 0 deletions)

@@ -681,6 +681,73 @@ def test_stats_types(table_schema_nested: Schema) -> None:
 ]
 
 
+def construct_test_table_without_stats() -> Tuple[pq.FileMetaData, Union[TableMetadataV1, TableMetadataV2]]:
+    table_metadata = {
+        "format-version": 2,
+        "location": "s3://bucket/test/location",
+        "last-column-id": 7,
+        "current-schema-id": 0,
+        "schemas": [
+            {
+                "type": "struct",
+                "schema-id": 0,
+                "fields": [
+                    {"id": 1, "name": "strings", "required": False, "type": "string"},
+                    {"id": 2, "name": "floats", "required": False, "type": "float"}
+                ]
+            }
+        ],
+        "default-spec-id": 0,
+        "partition-specs": [{"spec-id": 0, "fields": []}],
+        "properties": {},
+    }
+
+    table_metadata = TableMetadataUtil.parse_obj(table_metadata)
+    arrow_schema = schema_to_pyarrow(table_metadata.schemas[0])
+    _strings = ["zzzzzzzzzzzzzzzzzzzz", "rrrrrrrrrrrrrrrrrrrr", None, "aaaaaaaaaaaaaaaaaaaa"]
+    _floats = [3.14, math.nan, 1.69, 100]
+
+    table = pa.Table.from_pydict(
+        {
+            "strings": _strings,
+            "floats": _floats
+        },
+        schema=arrow_schema,
+    )
+
+    metadata_collector: List[Any] = []
+
+    with pa.BufferOutputStream() as f:
+        with pq.ParquetWriter(f, table.schema, metadata_collector=metadata_collector, write_statistics=False) as writer:
+            writer.write_table(table)
+
+    return metadata_collector[0], table_metadata
+
+
+def test_is_stats_set_false() -> None:
+    metadata, table_metadata = construct_test_table_without_stats()
+    schema = get_current_schema(table_metadata)
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=metadata,
+        stats_columns=compute_statistics_plan(schema, table_metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(schema),
+    )
+    datafile = DataFile(**statistics.to_serialized_dict())
+
+    # assert attributes except for column_aggregates and null_value_counts are present
+    assert datafile.record_count == 4
+
+    assert len(datafile.column_sizes) == 2
+    assert datafile.column_sizes[1] > 0
+    assert datafile.column_sizes[2] > 0
+
+    assert len(datafile.nan_value_counts) == 0
+
+    assert datafile.split_offsets is not None
+    assert len(datafile.split_offsets) == 1
+    assert datafile.split_offsets[0] == 4
+
+
 # This is commented out for now because write_to_dataset drops the partition
 # columns making it harder to calculate the mapping from the column index to
 # datatype id
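A note on the expected values in test_is_stats_set_false (not part of the diff): pyarrow's write_statistics=False omits per-column statistics from the Parquet footer, which is what leaves column_aggregates and null_value_counts unset, and the file's single row group begins at byte offset 4, immediately after the 4-byte PAR1 magic header, hence split_offsets == [4]. A standalone sketch of that premise, using only public pyarrow calls (the statistics-is-None behavior is my reading of pyarrow, not something asserted by the PR):

import pyarrow as pa
import pyarrow.parquet as pq

# Write a small single-row-group file with column statistics disabled.
buf = pa.BufferOutputStream()
pq.write_table(pa.table({"x": [1, 2, 3]}), buf, write_statistics=False)

# The column chunk metadata then reports that no statistics were set,
# which is the condition the pop(..., None) fix has to tolerate.
meta = pq.read_metadata(pa.BufferReader(buf.getvalue()))
column = meta.row_group(0).column(0)
print(column.is_stats_set)  # False
print(column.statistics)    # None: nothing for stats collection to aggregate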