Skip to content

Commit cd877c6

Browse files
committed
add option to not write path_in_schema
1 parent a33480f commit cd877c6

File tree

6 files changed

+136
-11
lines changed

6 files changed

+136
-11
lines changed

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public class ParquetProperties {
6666
public static final int DEFAULT_BLOOM_FILTER_CANDIDATES_NUMBER = 5;
6767
public static final boolean DEFAULT_STATISTICS_ENABLED = true;
6868
public static final boolean DEFAULT_SIZE_STATISTICS_ENABLED = true;
69+
public static final boolean DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED = true;
6970

7071
public static final boolean DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED = true;
7172

@@ -120,6 +121,7 @@ public static WriterVersion fromString(String name) {
120121
private final int statisticsTruncateLength;
121122
private final boolean statisticsEnabled;
122123
private final boolean sizeStatisticsEnabled;
124+
private final boolean writePathInSchemaEnabled;
123125

124126
// The expected NDV (number of distinct values) for each columns
125127
private final ColumnProperty<Long> bloomFilterNDVs;
@@ -154,6 +156,7 @@ private ParquetProperties(Builder builder) {
154156
this.statisticsTruncateLength = builder.statisticsTruncateLength;
155157
this.statisticsEnabled = builder.statisticsEnabled;
156158
this.sizeStatisticsEnabled = builder.sizeStatisticsEnabled;
159+
this.writePathInSchemaEnabled = builder.writePathInSchemaEnabled;
157160
this.bloomFilterNDVs = builder.bloomFilterNDVs.build();
158161
this.bloomFilterFPPs = builder.bloomFilterFPPs.build();
159162
this.bloomFilterEnabled = builder.bloomFilterEnabled.build();
@@ -322,6 +325,10 @@ public boolean getPageWriteChecksumEnabled() {
322325
return pageWriteChecksumEnabled;
323326
}
324327

328+
public boolean getWritePathInSchemaEnabled() {
329+
return writePathInSchemaEnabled;
330+
}
331+
325332
public OptionalLong getBloomFilterNDV(ColumnDescriptor column) {
326333
Long ndv = bloomFilterNDVs.getValue(column);
327334
return ndv == null ? OptionalLong.empty() : OptionalLong.of(ndv);
@@ -406,6 +413,7 @@ public static class Builder {
406413
private int statisticsTruncateLength = DEFAULT_STATISTICS_TRUNCATE_LENGTH;
407414
private boolean statisticsEnabled = DEFAULT_STATISTICS_ENABLED;
408415
private boolean sizeStatisticsEnabled = DEFAULT_SIZE_STATISTICS_ENABLED;
416+
private boolean writePathInSchemaEnabled = DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED;
409417
private final ColumnProperty.Builder<Long> bloomFilterNDVs;
410418
private final ColumnProperty.Builder<Double> bloomFilterFPPs;
411419
private int maxBloomFilterBytes = DEFAULT_MAX_BLOOM_FILTER_BYTES;
@@ -756,6 +764,24 @@ public Builder withSizeStatisticsEnabled(String columnPath, boolean enabled) {
756764
return this;
757765
}
758766

767+
/**
768+
* Sets whether to write the path_in_schema field in ColumnMetaData.
769+
*
770+
* The path_in_schema field in the Thrift metadata is redundant and wastes a great
771+
* deal of space. Parquet file footers can be made much smaller by omitting this field.
772+
* Because the field was originally a mandatory field, this property defaults to true
773+
* to maintain compatibility with older readers that expect this field to be present.
774+
* If one knows that all readers one plans to use are tolerant of the absence of this field,
775+
* this may be safely set to false.
776+
*
777+
* @param enabled whether to write path_in_schema
778+
* @return this builder for method chaining
779+
*/
780+
public Builder withWritePathInSchemaEnabled(boolean enabled) {
781+
this.writePathInSchemaEnabled = enabled;
782+
return this;
783+
}
784+
759785
public ParquetProperties build() {
760786
ParquetProperties properties = new ParquetProperties(this);
761787
// we pass a constructed but uninitialized factory to ParquetProperties above as currently

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -163,14 +163,19 @@ public class ParquetMetadataConverter {
163163
new ConvertedTypeConverterVisitor();
164164
private final int statisticsTruncateLength;
165165
private final boolean useSignedStringMinMax;
166+
private final boolean writePathInSchema;
166167
private final ParquetReadOptions options;
167168

168169
public ParquetMetadataConverter() {
169170
this(false);
170171
}
171172

172173
public ParquetMetadataConverter(int statisticsTruncateLength) {
173-
this(false, statisticsTruncateLength);
174+
this(false, statisticsTruncateLength, ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED);
175+
}
176+
177+
public ParquetMetadataConverter(int statisticsTruncateLength, boolean writePathInSchema) {
178+
this(false, statisticsTruncateLength, writePathInSchema);
174179
}
175180

176181
/**
@@ -183,24 +188,36 @@ public ParquetMetadataConverter(Configuration conf) {
183188
}
184189

185190
public ParquetMetadataConverter(ParquetReadOptions options) {
186-
this(options.useSignedStringMinMax(), ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH, options);
191+
this(
192+
options.useSignedStringMinMax(),
193+
ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
194+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED,
195+
options);
187196
}
188197

189198
private ParquetMetadataConverter(boolean useSignedStringMinMax) {
190-
this(useSignedStringMinMax, ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
199+
this(
200+
useSignedStringMinMax,
201+
ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
202+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED);
191203
}
192204

193-
private ParquetMetadataConverter(boolean useSignedStringMinMax, int statisticsTruncateLength) {
194-
this(useSignedStringMinMax, statisticsTruncateLength, null);
205+
private ParquetMetadataConverter(
206+
boolean useSignedStringMinMax, int statisticsTruncateLength, boolean writePathInSchema) {
207+
this(useSignedStringMinMax, statisticsTruncateLength, writePathInSchema, null);
195208
}
196209

197210
private ParquetMetadataConverter(
198-
boolean useSignedStringMinMax, int statisticsTruncateLength, ParquetReadOptions options) {
211+
boolean useSignedStringMinMax,
212+
int statisticsTruncateLength,
213+
boolean writePathInSchema,
214+
ParquetReadOptions options) {
199215
if (statisticsTruncateLength <= 0) {
200216
throw new IllegalArgumentException("Truncate length should be greater than 0");
201217
}
202218
this.useSignedStringMinMax = useSignedStringMinMax;
203219
this.statisticsTruncateLength = statisticsTruncateLength;
220+
this.writePathInSchema = writePathInSchema;
204221
this.options = options;
205222
}
206223

@@ -618,7 +635,7 @@ private void addRowGroup(
618635
|| columnMetaData.hasDictionaryPage()) {
619636
metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
620637
}
621-
if (path != null) {
638+
if (path != null && this.writePathInSchema) {
622639
metaData.setPath_in_schema(path.toList());
623640
}
624641
long bloomFilterOffset = columnMetaData.getBloomFilterOffset();

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java

Lines changed: 65 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,8 @@ public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode, long ro
333333
maxPaddingSize,
334334
ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
335335
ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
336-
ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
336+
ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED,
337+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED);
337338
}
338339

339340
@FunctionalInterface
@@ -373,8 +374,35 @@ private void withAbortOnFailure(IORunnable action) throws IOException {
373374
* @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
374375
* @param statisticsTruncateLength the length which the min/max values in row groups tried to be truncated to
375376
* @param pageWriteChecksumEnabled whether to write out page level checksums
377+
* @param writePathInSchemaEnabled whether to write path_in_schema to the column metadata
376378
* @throws IOException if the file can not be created
377379
*/
380+
public ParquetFileWriter(
381+
OutputFile file,
382+
MessageType schema,
383+
Mode mode,
384+
long rowGroupSize,
385+
int maxPaddingSize,
386+
int columnIndexTruncateLength,
387+
int statisticsTruncateLength,
388+
boolean pageWriteChecksumEnabled,
389+
boolean writePathInSchemaEnabled)
390+
throws IOException {
391+
this(
392+
file,
393+
schema,
394+
mode,
395+
rowGroupSize,
396+
maxPaddingSize,
397+
columnIndexTruncateLength,
398+
statisticsTruncateLength,
399+
pageWriteChecksumEnabled,
400+
writePathInSchemaEnabled,
401+
null,
402+
null,
403+
null);
404+
}
405+
378406
public ParquetFileWriter(
379407
OutputFile file,
380408
MessageType schema,
@@ -394,7 +422,34 @@ public ParquetFileWriter(
394422
columnIndexTruncateLength,
395423
statisticsTruncateLength,
396424
pageWriteChecksumEnabled,
425+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED,
426+
null,
397427
null,
428+
null);
429+
}
430+
431+
public ParquetFileWriter(
432+
OutputFile file,
433+
MessageType schema,
434+
Mode mode,
435+
long rowGroupSize,
436+
int maxPaddingSize,
437+
int columnIndexTruncateLength,
438+
int statisticsTruncateLength,
439+
boolean pageWriteChecksumEnabled,
440+
FileEncryptionProperties encryptionProperties)
441+
throws IOException {
442+
this(
443+
file,
444+
schema,
445+
mode,
446+
rowGroupSize,
447+
maxPaddingSize,
448+
columnIndexTruncateLength,
449+
statisticsTruncateLength,
450+
pageWriteChecksumEnabled,
451+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED,
452+
encryptionProperties,
398453
null,
399454
null);
400455
}
@@ -408,6 +463,7 @@ public ParquetFileWriter(
408463
int columnIndexTruncateLength,
409464
int statisticsTruncateLength,
410465
boolean pageWriteChecksumEnabled,
466+
boolean writePathInSchemaEnabled,
411467
FileEncryptionProperties encryptionProperties)
412468
throws IOException {
413469
this(
@@ -419,6 +475,7 @@ public ParquetFileWriter(
419475
columnIndexTruncateLength,
420476
statisticsTruncateLength,
421477
pageWriteChecksumEnabled,
478+
writePathInSchemaEnabled,
422479
encryptionProperties,
423480
null,
424481
null);
@@ -442,6 +499,7 @@ public ParquetFileWriter(
442499
props.getColumnIndexTruncateLength(),
443500
props.getStatisticsTruncateLength(),
444501
props.getPageWriteChecksumEnabled(),
502+
props.getWritePathInSchemaEnabled(),
445503
encryptionProperties,
446504
null,
447505
props.getAllocator());
@@ -468,6 +526,7 @@ public ParquetFileWriter(
468526
columnIndexTruncateLength,
469527
statisticsTruncateLength,
470528
pageWriteChecksumEnabled,
529+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED,
471530
null,
472531
encryptor,
473532
null);
@@ -482,6 +541,7 @@ private ParquetFileWriter(
482541
int columnIndexTruncateLength,
483542
int statisticsTruncateLength,
484543
boolean pageWriteChecksumEnabled,
544+
boolean writePathInSchemaEnabled,
485545
FileEncryptionProperties encryptionProperties,
486546
InternalFileEncryptor encryptor,
487547
ByteBufferAllocator allocator)
@@ -512,7 +572,7 @@ private ParquetFileWriter(
512572
? ReusingByteBufferAllocator.strict(allocator == null ? new HeapByteBufferAllocator() : allocator)
513573
: null;
514574

515-
this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength);
575+
this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength, writePathInSchemaEnabled);
516576

517577
if (null == encryptionProperties && null == encryptor) {
518578
this.fileEncryptor = null;
@@ -584,7 +644,9 @@ private ParquetFileWriter(
584644
this.crcAllocator = pageWriteChecksumEnabled
585645
? ReusingByteBufferAllocator.strict(allocator == null ? new HeapByteBufferAllocator() : allocator)
586646
: null;
587-
this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
647+
this.metadataConverter = new ParquetMetadataConverter(
648+
ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
649+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED);
588650
this.fileEncryptor = null;
589651
}
590652

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetOutputFormat.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,7 @@ public RecordWriter<Void, T> getRecordWriter(Configuration conf, Path file, Comp
508508
throws IOException, InterruptedException {
509509
final WriteSupport<T> writeSupport = getWriteSupport(conf);
510510

511+
// TODO(ets): add write_path_in_schema to conf?
511512
ParquetProperties.Builder propsBuilder = ParquetProperties.builder()
512513
.withPageSize(getPageSize(conf))
513514
.withDictionaryPageSize(getDictionaryPageSize(conf))

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/rewrite/ParquetRewriter.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ public ParquetRewriter(RewriteOptions options) throws IOException {
195195
DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
196196
DEFAULT_STATISTICS_TRUNCATE_LENGTH,
197197
ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED,
198+
ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED,
198199
options.getFileEncryptionProperties());
199200
writer.start();
200201
// column nullification requires a separate encryptor and forcing other columns encryption initialization

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -856,7 +856,8 @@ private void testBinaryStatsWithTruncation(int truncateLen, int minLen, int maxL
856856
byte[] max = generateRandomString("b", maxLen).getBytes();
857857
stats.updateStats(Binary.fromConstantByteArray(min));
858858
stats.updateStats(Binary.fromConstantByteArray(max));
859-
ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter(truncateLen);
859+
ParquetMetadataConverter metadataConverter =
860+
new ParquetMetadataConverter(truncateLen, ParquetProperties.DEFAULT_WRITE_PATH_IN_SCHEMA_ENABLED);
860861
org.apache.parquet.format.Statistics formatStats = metadataConverter.toParquetStatistics(stats);
861862

862863
if (minLen + maxLen >= ParquetMetadataConverter.MAX_STATS_SIZE) {
@@ -1976,4 +1977,21 @@ public void testEdgeInterpolationAlgorithmConversion() {
19761977
assertNull(ParquetMetadataConverter.fromParquetEdgeInterpolationAlgorithm(null));
19771978
assertNull(ParquetMetadataConverter.toParquetEdgeInterpolationAlgorithm(null));
19781979
}
1980+
1981+
@Test
1982+
public void testSkipPathInSchema() throws IOException {
1983+
ParquetMetadata origMetaData = createParquetMetaData(null, Encoding.PLAIN);
1984+
ParquetMetadataConverter converter =
1985+
new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH, false);
1986+
1987+
// Without path_in_schema
1988+
FileMetaData footer = converter.toParquetMetadata(1, origMetaData);
1989+
assertFalse(
1990+
footer.getRow_groups().get(0).getColumns().get(0).getMeta_data().isSetPath_in_schema());
1991+
1992+
// With path_in_schema
1993+
converter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH, true);
1994+
assertFalse(
1995+
footer.getRow_groups().get(0).getColumns().get(0).getMeta_data().isSetPath_in_schema());
1996+
}
19791997
}

0 commit comments

Comments
 (0)