Skip to content

Commit a04c28a

Browse files
yadavay-amznwgtmac
andcommitted
Apply suggestion from @wgtmac
Co-authored-by: Gang Wu <ustcwg@gmail.com>
1 parent 2be0d8b commit a04c28a

4 files changed

Lines changed: 22 additions & 16 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,8 @@ public boolean getPageWriteChecksumEnabled() {
327327

328328
/**
329329
* Returns the byte threshold after which the dictionary compression check is performed.
330-
* A value of 0 means check on the first page (backward compatible default). Higher values
331-
* delay the check until that many raw bytes have been accumulated across pages.
330+
* A value of 0 means check on the first page. Higher values delay the check until that
331+
* many raw bytes have been accumulated across pages.
332332
*
333333
* @return the byte threshold for the dictionary compression check
334334
*/
@@ -727,8 +727,8 @@ public Builder withPageWriteChecksumEnabled(boolean val) {
727727

728728
/**
729729
* Set the raw data byte threshold after which the dictionary compression check is performed.
730-
* A value of 0 means check on the first page (backward compatible default). Higher values
731-
* delay the check until that many raw bytes have been accumulated across pages.
730+
* A value of 0 means check on the first page. Higher values delay the check until that
731+
* many raw bytes have been accumulated across pages.
732732
*
733733
* @param val byte threshold (default: 0)
734734
* @return this builder for method chaining

parquet-column/src/main/java/org/apache/parquet/column/values/fallback/FallbackValuesWriter.java

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@ public class FallbackValuesWriter<I extends ValuesWriter & RequiresFallback, F e
3030

3131
public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter> FallbackValuesWriter<I, F> of(
3232
I initialWriter, F fallBackWriter) {
33-
return new FallbackValuesWriter<>(initialWriter, fallBackWriter, /*checkAfterBytes=*/ 0);
33+
return new FallbackValuesWriter<>(initialWriter, fallBackWriter, /*dictionaryCheckThresholdRawSizeBytes=*/ 0);
3434
}
3535

3636
public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter> FallbackValuesWriter<I, F> of(
37-
I initialWriter, F fallBackWriter, long checkAfterBytes) {
38-
return new FallbackValuesWriter<>(initialWriter, fallBackWriter, checkAfterBytes);
37+
I initialWriter, F fallBackWriter, long dictionaryCheckThresholdRawSizeBytes) {
38+
return new FallbackValuesWriter<>(initialWriter, fallBackWriter, dictionaryCheckThresholdRawSizeBytes);
3939
}
4040

4141
/**
@@ -49,9 +49,11 @@ public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter
4949

5050
private boolean fellBackAlready = false;
5151
private boolean compressionChecked = false;
52-
private final long checkAfterBytes;
52+
private final long dictionaryCheckThresholdRawSizeBytes;
5353
/** Accumulates raw bytes across pages (only reset in resetDictionary) so the
54-
* threshold check works even when individual pages are smaller than checkAfterBytes. */
54+
* threshold check works even when individual pages are smaller than the threshold.
55+
* Overflow is not a concern: a long would require writing over 9.2 exabytes to a single
56+
* column chunk, which is physically impossible. */
5557
private long cumulativeRawBytes = 0;
5658

5759
/**
@@ -68,15 +70,15 @@ public static <I extends ValuesWriter & RequiresFallback, F extends ValuesWriter
6870
private long rawDataByteSize = 0;
6971

7072
public FallbackValuesWriter(I initialWriter, F fallBackWriter) {
71-
this(initialWriter, fallBackWriter, /*checkAfterBytes=*/ 0);
73+
this(initialWriter, fallBackWriter, /*dictionaryCheckThresholdRawSizeBytes=*/ 0);
7274
}
7375

74-
public FallbackValuesWriter(I initialWriter, F fallBackWriter, long checkAfterBytes) {
76+
public FallbackValuesWriter(I initialWriter, F fallBackWriter, long dictionaryCheckThresholdRawSizeBytes) {
7577
super();
7678
this.initialWriter = initialWriter;
7779
this.fallBackWriter = fallBackWriter;
7880
this.currentWriter = initialWriter;
79-
this.checkAfterBytes = checkAfterBytes;
81+
this.dictionaryCheckThresholdRawSizeBytes = dictionaryCheckThresholdRawSizeBytes;
8082
}
8183

8284
@Override
@@ -89,8 +91,12 @@ public long getBufferedSize() {
8991

9092
@Override
9193
public BytesInput getBytes() {
92-
cumulativeRawBytes += rawDataByteSize;
93-
if (!fellBackAlready && !compressionChecked && cumulativeRawBytes >= checkAfterBytes) {
94+
try {
95+
cumulativeRawBytes = Math.addExact(cumulativeRawBytes, rawDataByteSize);
96+
} catch (ArithmeticException e) {
97+
// overflow, keep the previous value
98+
}
99+
if (!fellBackAlready && !compressionChecked && cumulativeRawBytes >= dictionaryCheckThresholdRawSizeBytes) {
94100
compressionChecked = true;
95101
BytesInput bytes = initialWriter.getBytes();
96102
if (!initialWriter.isCompressionSatisfying(rawDataByteSize, bytes.size())) {

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ public static enum JobSummaryLevel {
165165
* Raw data byte threshold after which the dictionary compression check is performed.
166166
* Once cumulative raw bytes (excluding nulls) written to a column chunk reach this value,
167167
* the writer evaluates whether dictionary encoding is effective. If not, it falls back to
168-
* plain encoding. A value of 0 means check on the first page (backward compatible default).
168+
* plain encoding. A value of 0 means check on the first page.
169169
*/
170170
public static final String DICTIONARY_CHECK_THRESHOLD_RAW_SIZE_BYTES =
171171
"parquet.dictionary.check.threshold.raw.size.bytes";

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,7 @@ public SELF withPageWriteChecksumEnabled(boolean enablePageWriteChecksum) {
774774
/**
775775
* Set the raw data byte threshold after which the dictionary compression check is performed.
776776
*
777-
* @param val byte threshold (0 means check on the first page / preserve previous behavior)
777+
* @param val byte threshold (0 means checking on the first page for every column chunk)
778778
* @return this builder for method chaining.
779779
*/
780780
public SELF withDictionaryCheckThresholdRawSizeBytes(long val) {

0 commit comments

Comments
 (0)