apache · yadavay-amzn · Jun 5, 2026 · Jun 8, 2026 · Jun 13, 2026 · wgtmac
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java b/parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java
@@ -67,6 +67,7 @@ public class ParquetProperties {
   public static final boolean DEFAULT_STATISTICS_ENABLED = true;
   public static final boolean DEFAULT_SIZE_STATISTICS_ENABLED = true;
 
+  public static final long DEFAULT_DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES = 0; // 0 = check on the first page
   public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
 
   /**
@@ -131,6 +132,7 @@ public static WriterVersion fromString(String name) {
   private final int rowGroupRowCountLimit;
   private final int pageRowCountLimit;
   private final boolean pageWriteChecksumEnabled;
+  private final long dictionaryCheckThresholdRawSizeBytes;
   private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
   private final Map<String, String> extraMetaData;
   private final ColumnProperty<Boolean> statistics;
@@ -163,6 +165,7 @@ private ParquetProperties(Builder builder) {
     this.rowGroupRowCountLimit = builder.rowGroupRowCountLimit;
     this.pageRowCountLimit = builder.pageRowCountLimit;
     this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
+    this.dictionaryCheckThresholdRawSizeBytes = builder.dictionaryCheckThresholdRawSizeBytes;
     this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
     this.extraMetaData = builder.extraMetaData;
     this.statistics = builder.statistics.build();
@@ -322,6 +325,17 @@ public boolean getPageWriteChecksumEnabled() {
     return pageWriteChecksumEnabled;
   }
 
+  /**
+   * Returns the raw-byte threshold that gates the dictionary compression check.
+   * A value of 0 means check on the first page (backward compatible default). Higher values
+   * postpone the check until that many raw bytes have been accumulated across pages.
+   *
+   * @return the threshold in raw bytes (the builder only accepts values {@code >= 0})
+   */
+  public long getDictionaryCheckThresholdRawSizeBytes() {
+    return dictionaryCheckThresholdRawSizeBytes;
+  }
+
   public OptionalLong getBloomFilterNDV(ColumnDescriptor column) {
     Long ndv = bloomFilterNDVs.getValue(column);
     return ndv == null ? OptionalLong.empty() : OptionalLong.of(ndv);
@@ -415,6 +429,7 @@ public static class Builder {
     private int rowGroupRowCountLimit = DEFAULT_ROW_GROUP_ROW_COUNT_LIMIT;
     private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
     private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
+    private long dictionaryCheckThresholdRawSizeBytes = DEFAULT_DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES;
     private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
     private Map<String, String> extraMetaData = new HashMap<>();
     private final ColumnProperty.Builder<Boolean> statistics;
@@ -450,6 +465,7 @@ private Builder(ParquetProperties toCopy) {
       this.allocator = toCopy.allocator;
       this.pageRowCountLimit = toCopy.pageRowCountLimit;
       this.pageWriteChecksumEnabled = toCopy.pageWriteChecksumEnabled;
+      this.dictionaryCheckThresholdRawSizeBytes = toCopy.dictionaryCheckThresholdRawSizeBytes;
       this.bloomFilterNDVs = ColumnProperty.builder(toCopy.bloomFilterNDVs);
       this.bloomFilterFPPs = ColumnProperty.builder(toCopy.bloomFilterFPPs);
       this.bloomFilterEnabled = ColumnProperty.builder(toCopy.bloomFilterEnabled);
@@ -709,6 +725,20 @@ public Builder withPageWriteChecksumEnabled(boolean val) {
       return this;
     }
 
+    /**
+     * Sets how many raw data bytes must accumulate before the dictionary compression check runs.
+     * A value of 0 means check on the first page (backward compatible default). Higher values
+     * postpone the check until that many raw bytes have been accumulated across pages.
+     *
+     * @param val threshold in raw bytes, must be {@code >= 0} (default: 0)
+     * @return this builder for method chaining
+     */
+    public Builder withDictionaryCheckThresholdRawSizeBytes(long val) {
+      Preconditions.checkArgument(val >= 0, "dictionaryCheckThresholdRawSizeBytes must be >= 0");
+      this.dictionaryCheckThresholdRawSizeBytes = val;
+      return this;
+    }
+
     public Builder withExtraMetaData(Map<String, String> extraMetaData) {
       this.extraMetaData = extraMetaData;
       return this;

diff --git a/...mn/src/main/java/org/apache/parquet/column/values/factory/DefaultValuesWriterFactory.java b/...mn/src/main/java/org/apache/parquet/column/values/factory/DefaultValuesWriterFactory.java
@@ -111,7 +111,9 @@ static ValuesWriter dictWriterWithFallBack(
       ValuesWriter writerToFallBackTo) {
     if (parquetProperties.isDictionaryEnabled(path)) {
       return FallbackValuesWriter.of(
-          dictionaryWriter(path, parquetProperties, dictPageEncoding, dataPageEncoding), writerToFallBackTo);
+          dictionaryWriter(path, parquetProperties, dictPageEncoding, dataPageEncoding),
+          writerToFallBackTo,
+          parquetProperties.getDictionaryCheckThresholdRawSizeBytes());
     } else {
       return writerToFallBackTo;
     }

diff --git a/...-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java b/...-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java
@@ -30,7 +30,12 @@ public class FallbackValuesWriter<I extends ValuesWriter & RequiresFallback, F e
 
   public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter> FallbackValuesWriter<I, F> of(
       I initialWriter, F fallBackWriter) {
-    return new FallbackValuesWriter<>(initialWriter, fallBackWriter);
+    return new FallbackValuesWriter<>(initialWriter, fallBackWriter, /*checkAfterBytes=*/ 0); // first-page check
+  }
+
+  public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter> FallbackValuesWriter<I, F> of(
+      I initialWriter, F fallBackWriter, long checkAfterBytes) { // raw-byte threshold before the check
+    return new FallbackValuesWriter<>(initialWriter, fallBackWriter, checkAfterBytes);
   }
 
   /**
@@ -43,6 +48,11 @@ public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter
   public final F fallBackWriter;
 
   private boolean fellBackAlready = false;
+  private boolean compressionChecked = false; // true once the check has run; cleared by resetDictionary()
+  private final long checkAfterBytes; // cumulative raw-byte threshold that triggers the check
+  /** Raw bytes added at every getBytes() call (a repeated call without reset() counts the page
+   * again) and cleared only in resetDictionary(), so pages smaller than checkAfterBytes add up. */
+  private long cumulativeRawBytes = 0;
 
   /**
    * writer currently written to
@@ -57,16 +67,16 @@ public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter
    */
   private long rawDataByteSize = 0;
 
-  /**
-   * indicates if this is the first page being processed
-   */
-  private boolean firstPage = true;
-
   public FallbackValuesWriter(I initialWriter, F fallBackWriter) {
+    this(initialWriter, fallBackWriter, /*checkAfterBytes=*/ 0); // 0: check on the first page
+  }
+
+  public FallbackValuesWriter(I initialWriter, F fallBackWriter, long checkAfterBytes) {
     super();
     this.initialWriter = initialWriter;
     this.fallBackWriter = fallBackWriter;
     this.currentWriter = initialWriter;
+    this.checkAfterBytes = checkAfterBytes; // not range-checked here; ParquetProperties.Builder enforces >= 0
   }
 
   @Override
@@ -79,8 +89,9 @@ public long getBufferedSize() {
 
   @Override
   public BytesInput getBytes() {
-    if (!fellBackAlready && firstPage) {
-      // we use the first page to decide if we're going to use this encoding
+    cumulativeRawBytes += rawDataByteSize;
+    if (!fellBackAlready && !compressionChecked && cumulativeRawBytes >= checkAfterBytes) {
+      compressionChecked = true;
       BytesInput bytes = initialWriter.getBytes();
       if (!initialWriter.isCompressionSatisfying(rawDataByteSize, bytes.size())) {
         fallBack();
@@ -103,7 +114,6 @@ public Encoding getEncoding() {
   @Override
   public void reset() {
     rawDataByteSize = 0;
-    firstPage = false;
     currentWriter.reset();
   }
 
@@ -131,8 +141,9 @@ public void resetDictionary() {
     }
     currentWriter = initialWriter;
     fellBackAlready = false;
+    compressionChecked = false;
+    cumulativeRawBytes = 0;
     initialUsedAndHadDictionary = false;
-    firstPage = true;
   }
 
   @Override

diff --git a/...umn/src/test/java/org/apache/parquet/column/values/fallback/TestFallbackValuesWriter.java b/...umn/src/test/java/org/apache/parquet/column/values/fallback/TestFallbackValuesWriter.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.column.values.fallback;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.parquet.bytes.DirectByteBufferAllocator;
+import org.apache.parquet.bytes.TrackingByteBufferAllocator;
+import org.apache.parquet.column.Encoding;
+import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
+import org.apache.parquet.column.values.plain.PlainValuesWriter;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestFallbackValuesWriter {
+
+  private TrackingByteBufferAllocator allocator;
+
+  @Before
+  public void initAllocator() {
+    allocator = TrackingByteBufferAllocator.wrap(new DirectByteBufferAllocator());
+  }
+
+  @After
+  public void closeAllocator() {
+    allocator.close();
+  }
+
+  /**
+   * With threshold=0, the check fires on the first page and falls back for high-cardinality data.
+   */
+  @Test
+  public void testThresholdZeroFallsBackImmediately() throws Exception {
+    int dictPageSize = 1024 * 1024;
+
+    PlainIntegerDictionaryValuesWriter dictWriter = new PlainIntegerDictionaryValuesWriter(
+        dictPageSize, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN_DICTIONARY, allocator);
+    PlainValuesWriter plainWriter = new PlainValuesWriter(1024, 1024 * 1024, allocator);
+    FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
+        FallbackValuesWriter.of(dictWriter, plainWriter, 0);
+
+    try {
+      for (int i = 0; i < 1000; i++) {
+        writer.writeInteger(i);
+      }
+      writer.getBytes();
+
+      assertFalse(
+          "Should fall back to plain encoding with threshold=0 and high cardinality",
+          writer.getEncoding().usesDictionary());
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * With a large threshold, the check never fires and dictionary encoding is preserved.
+   */
+  @Test
+  public void testLargeThresholdPreservesDictionary() throws Exception {
+    int dictPageSize = 1024 * 1024;
+
+    PlainIntegerDictionaryValuesWriter dictWriter = new PlainIntegerDictionaryValuesWriter(
+        dictPageSize, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN_DICTIONARY, allocator);
+    PlainValuesWriter plainWriter = new PlainValuesWriter(1024, 1024 * 1024, allocator);
+    FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
+        FallbackValuesWriter.of(dictWriter, plainWriter, Long.MAX_VALUE);
+
+    try {
+      for (int i = 0; i < 1000; i++) {
+        writer.writeInteger(i);
+      }
+      writer.getBytes();
+
+      assertTrue(
+          "Dictionary encoding should be preserved with large threshold",
+          writer.getEncoding().usesDictionary());
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Threshold is crossed only after a reset() (page flush). cumulativeRawBytes accumulates
+   * across pages while rawDataByteSize resets per page.
+   */
+  @Test
+  public void testThresholdCrossedAfterReset() throws Exception {
+    int dictPageSize = 1024 * 1024;
+    long threshold = 500;
+
+    PlainIntegerDictionaryValuesWriter dictWriter = new PlainIntegerDictionaryValuesWriter(
+        dictPageSize, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN_DICTIONARY, allocator);
+    PlainValuesWriter plainWriter = new PlainValuesWriter(1024, 1024 * 1024, allocator);
+    FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
+        FallbackValuesWriter.of(dictWriter, plainWriter, threshold);
+
+    try {
+      // Write ~300 bytes (75 ints * 4 bytes = 300) — below threshold
+      for (int i = 0; i < 75; i++) {
+        writer.writeInteger(i);
+      }
+      // Simulate page flush — check should NOT fire (cumulative = 300 < 500)
+      writer.getBytes();
+      assertTrue(
+          "Should still use dictionary before threshold is crossed",
+          writer.getEncoding().usesDictionary());
+      writer.reset();
+
+      // Write another ~300 bytes (75 ints * 4 = 300, cumulative now 600 > 500)
+      for (int i = 75; i < 150; i++) {
+        writer.writeInteger(i);
+      }
+      // Check SHOULD fire now and fall back (high cardinality, bad compression)
+      writer.getBytes();
+      assertFalse(
+          "Should fall back after cumulative bytes cross threshold",
+          writer.getEncoding().usesDictionary());
+    } finally {
+      writer.close();
+    }
+  }
+}
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java
@@ -161,6 +161,15 @@ public static enum JobSummaryLevel {
   public static final String BLOCK_ROW_COUNT_LIMIT = "parquet.block.row.count.limit";
   public static final String PAGE_ROW_COUNT_LIMIT = "parquet.page.row.count.limit";
   public static final String PAGE_WRITE_CHECKSUM_ENABLED = "parquet.page.write-checksum.enabled";
+  /**
+   * Raw data byte threshold for the dictionary compression check. At the first page where the
+   * cumulative raw bytes (excluding nulls) written to a column chunk have reached this value, the
+   * writer evaluates whether dictionary encoding is effective and, if not, switches to its fallback
+   * encoding. A value of 0 means check on the first page (backward compatible default).
+   */
+  public static final String DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES =
+      "parquet.dictionary.check.threshold.raw.size.bytes";
+
   public static final String STATISTICS_ENABLED = "parquet.column.statistics.enabled";
   public static final String SIZE_STATISTICS_ENABLED = "parquet.size.statistics.enabled";
 
@@ -412,6 +421,16 @@ public static boolean getPageWriteChecksumEnabled(Configuration conf) {
     return conf.getBoolean(PAGE_WRITE_CHECKSUM_ENABLED, ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
   }
 
+  public static void setDictionaryCheckThresholdRawSizeBytes(Configuration conf, long val) {
+    conf.setLong(DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES, val); // stored as-is; no range check here
+  }
+
+  public static long getDictionaryCheckThresholdRawSizeBytes(Configuration conf) {
+    // Value handed to getLong as the default for this key.
+    final long fallback = ParquetProperties.DEFAULT_DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES;
+    return conf.getLong(DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES, fallback);
+  }
+
   public static void setStatisticsEnabled(JobContext jobContext, boolean enabled) {
     getConfiguration(jobContext).setBoolean(STATISTICS_ENABLED, enabled);
   }
@@ -526,6 +545,7 @@ public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, Comp
         .withRowGroupRowCountLimit(getBlockRowCountLimit(conf))
         .withPageRowCountLimit(getPageRowCountLimit(conf))
         .withPageWriteChecksumEnabled(getPageWriteChecksumEnabled(conf))
+        .withDictionaryCheckThresholdRawSizeBytes(getDictionaryCheckThresholdRawSizeBytes(conf))
         .withStatisticsEnabled(getStatisticsEnabled(conf));
     new ColumnConfigParser()
         .withColumnConfig(

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java
@@ -771,6 +771,17 @@ public SELF withPageWriteChecksumEnabled(boolean enablePageWriteChecksum) {
       return self();
     }
 
+    /**
+     * Sets the raw data byte threshold that gates the dictionary compression check.
+     *
+     * @param val threshold in raw bytes (0 means check on the first page / preserve previous behavior)
+     * @return this builder for method chaining.
+     */
+    public SELF withDictionaryCheckThresholdRawSizeBytes(long val) {
+      encodingPropsBuilder.withDictionaryCheckThresholdRawSizeBytes(val);
+      return self();
+    }
+
     /**
      * Set max Bloom filter bytes for related columns.
      *