Skip to content

Commit 53d7842

Browse files
clee704Copilot
andauthored
GH-3457: Validate column index for null_pages/null_counts contradiction (#3458)
* Validate column index for null_pages/null_counts contradiction A null_pages entry of true indicates the page consists entirely of null values; a corresponding null_counts entry of zero contradicts this, as it indicates the page has no null values at all. This inconsistency is a sign of invalid metadata that would cause incorrect page skipping during column index filtering — the reader treats such pages as all-null and silently excludes them from query results for predicates requiring non-null values. Add validation in ColumnIndexBuilder.build(PrimitiveType) to detect this contradiction and return null, consistent with the existing pattern for invalid min/max values. The null propagates through the read path, causing the filter to fall back to reading all pages for the affected column. Also fix a pre-existing NPE in the static build() method where build(type) could return null but was not null-checked before accessing boundaryOrder. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 8ec46d2 commit 53d7842

3 files changed

Lines changed: 170 additions & 4 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/ColumnIndexBuilder.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,9 @@ public static ColumnIndex build(
616616

617617
builder.fill(nullPages, nullCounts, minValues, maxValues, repLevelHistogram, defLevelHistogram);
618618
ColumnIndexBase<?> columnIndex = builder.build(type);
619+
if (columnIndex == null) {
620+
return null;
621+
}
619622
columnIndex.boundaryOrder = requireNonNull(boundaryOrder);
620623
return columnIndex;
621624
}
@@ -750,6 +753,19 @@ private ColumnIndexBase<?> build(PrimitiveType type) {
750753
if (!nullCounts.isEmpty()) {
751754
columnIndex.nullCounts = nullCounts.toLongArray();
752755
}
756+
// A null_pages entry of true indicates the page consists entirely of null values.
757+
// When null_counts is present, a null_counts entry of zero means the page has no
758+
// null values at all. These two fields directly contradict each other: a page
759+
// cannot both consist entirely of nulls and contain zero nulls. Since null_pages
760+
// is the field that causes the reader to skip pages during column index filtering,
761+
// the safe response is to discard the column index for this column entirely.
762+
if (columnIndex.nullCounts != null) {
763+
for (int i = 0; i < columnIndex.nullPages.length; i++) {
764+
if (columnIndex.nullPages[i] && columnIndex.nullCounts[i] <= 0L) {
765+
return null;
766+
}
767+
}
768+
}
753769
columnIndex.pageIndexes = pageIndexes.toIntArray();
754770
// Repetition and definition level histograms are optional so keep them null if the builder has no values
755771
if (repLevelHistogram != null && !repLevelHistogram.isEmpty()) {

parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestBoundaryOrder.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,11 @@ public String toString() {
250250
RAND_DESCENDING = (ColumnIndexBase<?>) builder.build();
251251

252252
builder = ColumnIndexBuilder.getBuilder(TYPE, Integer.MAX_VALUE);
253-
// Add a couple of empty stats so column index will contain null pages only
253+
// Add a couple of all-null stats so column index will contain null pages only
254254
for (int i = 0; i < 10; ++i) {
255-
builder.add(Statistics.createStats(TYPE));
255+
Statistics<?> nullStats = Statistics.createStats(TYPE);
256+
nullStats.incrementNumNulls(10);
257+
builder.add(nullStats);
256258
}
257259
ALL_NULL_PAGES = (ColumnIndexBase<?>) builder.build();
258260
}

parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -904,11 +904,11 @@ public void testStaticBuildBoolean() {
904904
Types.required(BOOLEAN).named("test_boolean"),
905905
BoundaryOrder.DESCENDING,
906906
List.of(false, true, false, true, false, true),
907-
List.of(9l, 8l, 7l, 6l, 5l, 0l),
907+
List.of(9l, 8l, 7l, 6l, 5l, 4l),
908908
toBBList(false, null, false, null, true, null),
909909
toBBList(true, null, false, null, true, null));
910910
assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
911-
assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 0);
911+
assertCorrectNullCounts(columnIndex, 9, 8, 7, 6, 5, 4);
912912
assertCorrectNullPages(columnIndex, false, true, false, true, false, true);
913913
assertCorrectValues(columnIndex.getMaxValues(), true, null, false, null, true, null);
914914
assertCorrectValues(columnIndex.getMinValues(), false, null, false, null, true, null);
@@ -1865,4 +1865,152 @@ long getMinMaxSize() {
18651865
private static void assertCorrectFiltering(ColumnIndex ci, FilterPredicate predicate, int... expectedIndexes) {
18661866
TestIndexIterator.assertEquals(predicate.accept(ci), expectedIndexes);
18671867
}
1868+
1869+
@Test
1870+
public void testBuildReturnsNullForNullPageCountContradiction() {
1871+
// null_pages[i]=true indicates the page is entirely null, but null_counts[i]=0
1872+
// says there are zero null values. This contradiction indicates invalid metadata.
1873+
// build() should return null to prevent incorrect page skipping.
1874+
PrimitiveType type = Types.required(INT32).named("test_col");
1875+
1876+
// Pages 1-3 have null_pages=true with null_counts=0 — contradictory
1877+
assertNull(
1878+
"Column index with null_pages=true and null_counts=0 should be rejected",
1879+
ColumnIndexBuilder.build(
1880+
type,
1881+
BoundaryOrder.ASCENDING,
1882+
List.of(false, true, true, true),
1883+
List.of(0L, 0L, 0L, 0L),
1884+
toBBList(Integer.valueOf(-99), null, null, null),
1885+
toBBList(Integer.valueOf(5), null, null, null)));
1886+
1887+
// Contradiction on a single page (last page) should also be rejected
1888+
assertNull(
1889+
"Single contradictory page should cause rejection",
1890+
ColumnIndexBuilder.build(
1891+
type,
1892+
BoundaryOrder.UNORDERED,
1893+
List.of(false, false, true),
1894+
List.of(0L, 5L, 0L),
1895+
toBBList(Integer.valueOf(1), Integer.valueOf(50), null),
1896+
toBBList(Integer.valueOf(49), Integer.valueOf(99), null)));
1897+
1898+
// Contradiction on the first page
1899+
assertNull(
1900+
"Contradictory first page should cause rejection",
1901+
ColumnIndexBuilder.build(
1902+
type,
1903+
BoundaryOrder.UNORDERED,
1904+
List.of(true, false, false),
1905+
List.of(0L, 0L, 5L),
1906+
toBBList(null, Integer.valueOf(1), Integer.valueOf(50)),
1907+
toBBList(null, Integer.valueOf(49), Integer.valueOf(99))));
1908+
1909+
// Single page with contradiction
1910+
assertNull(
1911+
"Single-page column index with contradiction should be rejected",
1912+
ColumnIndexBuilder.build(
1913+
type,
1914+
BoundaryOrder.UNORDERED,
1915+
List.of(true),
1916+
List.of(0L),
1917+
toBBList((Integer) null),
1918+
toBBList((Integer) null)));
1919+
1920+
// All pages are contradictory null pages
1921+
assertNull(
1922+
"All-contradictory column index should be rejected",
1923+
ColumnIndexBuilder.build(
1924+
type,
1925+
BoundaryOrder.UNORDERED,
1926+
List.of(true, true, true),
1927+
List.of(0L, 0L, 0L),
1928+
toBBList((Integer) null, null, null),
1929+
toBBList((Integer) null, null, null)));
1930+
}
1931+
1932+
@Test
1933+
public void testBuildPreservesValidColumnIndex() {
1934+
PrimitiveType type = Types.required(INT32).named("test_col");
1935+
1936+
// Legitimate null page: null_pages=true with null_counts > 0 — valid
1937+
ColumnIndex ci = ColumnIndexBuilder.build(
1938+
type,
1939+
BoundaryOrder.ASCENDING,
1940+
List.of(false, true, false),
1941+
List.of(0L, 100L, 0L),
1942+
toBBList(Integer.valueOf(1), null, Integer.valueOf(50)),
1943+
toBBList(Integer.valueOf(49), null, Integer.valueOf(99)));
1944+
assertCorrectNullPages(ci, false, true, false);
1945+
assertCorrectNullCounts(ci, 0, 100, 0);
1946+
assertCorrectValues(ci.getMinValues(), 1, null, 50);
1947+
assertCorrectValues(ci.getMaxValues(), 49, null, 99);
1948+
1949+
// All non-null pages — valid
1950+
ColumnIndex ci2 = ColumnIndexBuilder.build(
1951+
type,
1952+
BoundaryOrder.ASCENDING,
1953+
List.of(false, false, false),
1954+
List.of(0L, 5L, 10L),
1955+
toBBList(Integer.valueOf(1), Integer.valueOf(50), Integer.valueOf(100)),
1956+
toBBList(Integer.valueOf(49), Integer.valueOf(99), Integer.valueOf(150)));
1957+
assertCorrectNullPages(ci2, false, false, false);
1958+
assertCorrectNullCounts(ci2, 0, 5, 10);
1959+
1960+
// Single non-null page
1961+
ColumnIndex ci3 = ColumnIndexBuilder.build(
1962+
type,
1963+
BoundaryOrder.UNORDERED,
1964+
List.of(false),
1965+
List.of(0L),
1966+
toBBList(Integer.valueOf(42)),
1967+
toBBList(Integer.valueOf(42)));
1968+
assertCorrectNullPages(ci3, false);
1969+
assertCorrectNullCounts(ci3, 0);
1970+
1971+
// Single legitimate all-null page (null_pages=true, null_counts > 0)
1972+
ColumnIndex ci4 = ColumnIndexBuilder.build(
1973+
type, BoundaryOrder.UNORDERED, List.of(true), List.of(50L), toBBList((Integer) null), toBBList((Integer)
1974+
null));
1975+
assertCorrectNullPages(ci4, true);
1976+
assertCorrectNullCounts(ci4, 50);
1977+
1978+
// All pages legitimately null
1979+
ColumnIndex ci5 = ColumnIndexBuilder.build(
1980+
type,
1981+
BoundaryOrder.UNORDERED,
1982+
List.of(true, true, true),
1983+
List.of(10L, 20L, 30L),
1984+
toBBList((Integer) null, null, null),
1985+
toBBList((Integer) null, null, null));
1986+
assertCorrectNullPages(ci5, true, true, true);
1987+
assertCorrectNullCounts(ci5, 10, 20, 30);
1988+
1989+
// Boundary: null_counts=1 on a null page (minimum valid value) — should NOT be rejected
1990+
ColumnIndex ci6 = ColumnIndexBuilder.build(
1991+
type,
1992+
BoundaryOrder.UNORDERED,
1993+
List.of(false, true),
1994+
List.of(0L, 1L),
1995+
toBBList(Integer.valueOf(1), null),
1996+
toBBList(Integer.valueOf(99), null));
1997+
assertCorrectNullPages(ci6, false, true);
1998+
assertCorrectNullCounts(ci6, 0, 1);
1999+
}
2000+
2001+
@Test
2002+
public void testBuildWithoutNullCountsIsNotRejected() {
2003+
PrimitiveType type = Types.required(INT32).named("test_col");
2004+
2005+
// null_counts absent (optional field) — cannot detect contradiction, should build normally
2006+
ColumnIndex ci = ColumnIndexBuilder.build(
2007+
type,
2008+
BoundaryOrder.UNORDERED,
2009+
List.of(false, true, true),
2010+
null,
2011+
toBBList(Integer.valueOf(1), null, null),
2012+
toBBList(Integer.valueOf(99), null, null));
2013+
assertCorrectNullPages(ci, false, true, true);
2014+
assertNull("null_counts should be null when not provided", ci.getNullCounts());
2015+
}
18682016
}

0 commit comments

Comments
 (0)