Skip to content

Commit ec6408d

Browse files
committed
Fine-tune JMH benchmarks for targeted per-branch measurement
- Add FileReadBenchmark / FileWriteBenchmark with SS warmup=5, measurement=10 - Add RowGroupFlushBenchmark with warmup=3, measurement=5 - Add RleDictionaryIndexDecodingBenchmark with encodeDictionaryIds() and ValuesReader-level decode benchmarks (decodeValuesReader, decodeValuesReaderBatch) - Add BlackHoleOutputFile for write benchmarks without I/O overhead - Adapt RLE decoder instantiation to use InputStream (par13 API)
1 parent d738c58 commit ec6408d

5 files changed

Lines changed: 665 additions & 0 deletions

File tree

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.benchmarks;
20+
21+
import java.io.IOException;
22+
import org.apache.parquet.io.OutputFile;
23+
import org.apache.parquet.io.PositionOutputStream;
24+
25+
/**
26+
* A no-op {@link OutputFile} that discards all written data.
27+
* Useful for isolating CPU/encoding cost from filesystem I/O in write benchmarks.
28+
*/
29+
public final class BlackHoleOutputFile implements OutputFile {
30+
31+
public static final BlackHoleOutputFile INSTANCE = new BlackHoleOutputFile();
32+
33+
private BlackHoleOutputFile() {}
34+
35+
@Override
36+
public boolean supportsBlockSize() {
37+
return false;
38+
}
39+
40+
@Override
41+
public long defaultBlockSize() {
42+
return -1L;
43+
}
44+
45+
@Override
46+
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
47+
return create(blockSizeHint);
48+
}
49+
50+
@Override
51+
public PositionOutputStream create(long blockSizeHint) {
52+
return new PositionOutputStream() {
53+
private long pos;
54+
55+
@Override
56+
public long getPos() throws IOException {
57+
return pos;
58+
}
59+
60+
@Override
61+
public void write(int b) throws IOException {
62+
++pos;
63+
}
64+
65+
@Override
66+
public void write(byte[] b, int off, int len) throws IOException {
67+
pos += len;
68+
}
69+
};
70+
}
71+
72+
@Override
73+
public String getPath() {
74+
return "/dev/null";
75+
}
76+
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.benchmarks;
20+
21+
import java.io.File;
22+
import java.io.IOException;
23+
import java.util.concurrent.TimeUnit;
24+
import org.apache.parquet.column.ParquetProperties.WriterVersion;
25+
import org.apache.parquet.example.data.Group;
26+
import org.apache.parquet.hadoop.ParquetFileWriter;
27+
import org.apache.parquet.hadoop.ParquetReader;
28+
import org.apache.parquet.hadoop.ParquetWriter;
29+
import org.apache.parquet.hadoop.api.ReadSupport;
30+
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
31+
import org.apache.parquet.hadoop.example.GroupReadSupport;
32+
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
33+
import org.apache.parquet.io.InputFile;
34+
import org.apache.parquet.io.LocalInputFile;
35+
import org.apache.parquet.io.LocalOutputFile;
36+
import org.openjdk.jmh.annotations.Benchmark;
37+
import org.openjdk.jmh.annotations.BenchmarkMode;
38+
import org.openjdk.jmh.annotations.Fork;
39+
import org.openjdk.jmh.annotations.Level;
40+
import org.openjdk.jmh.annotations.Measurement;
41+
import org.openjdk.jmh.annotations.Mode;
42+
import org.openjdk.jmh.annotations.OutputTimeUnit;
43+
import org.openjdk.jmh.annotations.Param;
44+
import org.openjdk.jmh.annotations.Scope;
45+
import org.openjdk.jmh.annotations.Setup;
46+
import org.openjdk.jmh.annotations.State;
47+
import org.openjdk.jmh.annotations.TearDown;
48+
import org.openjdk.jmh.annotations.Warmup;
49+
import org.openjdk.jmh.infra.Blackhole;
50+
51+
/**
52+
* File-level read benchmarks measuring end-to-end Parquet read throughput through the
53+
* example {@link Group} API. A temporary file is generated once during setup from
54+
* pre-generated rows using {@link LocalOutputFile}, then read repeatedly during the
55+
* benchmark.
56+
*
57+
* <p>Parameterized across compression codec and writer version. The footer parse
58+
* (via {@link LocalInputFile} open) is included in the timed section so the result
59+
* reflects the full open-and-read cost a typical caller would observe.
60+
*
61+
* <p>{@link Mode#SingleShotTime} is used because each invocation does enough work
62+
* (a full read of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT
63+
* amortization across invocations is unnecessary. Ten measurement iterations
64+
* provide stable statistics for SS mode.
65+
*/
66+
@BenchmarkMode(Mode.SingleShotTime)
67+
@Fork(1)
68+
@Warmup(iterations = 5, batchSize = 1)
69+
@Measurement(iterations = 10, batchSize = 1)
70+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
71+
@State(Scope.Benchmark)
72+
public class FileReadBenchmark {
73+
74+
@Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"})
75+
public String codec;
76+
77+
@Param({"PARQUET_1_0", "PARQUET_2_0"})
78+
public String writerVersion;
79+
80+
private File tempFile;
81+
82+
@Setup(Level.Trial)
83+
public void setup() throws IOException {
84+
tempFile = File.createTempFile("parquet-read-bench-", ".parquet");
85+
tempFile.deleteOnExit();
86+
tempFile.delete(); // remove so the writer can create it
87+
88+
Group[] rows = TestDataFactory.generateRows(
89+
TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
90+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new LocalOutputFile(tempFile.toPath()))
91+
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
92+
.withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
93+
.withCompressionCodec(CompressionCodecName.valueOf(codec))
94+
.withWriterVersion(WriterVersion.valueOf(writerVersion))
95+
.withDictionaryEncoding(true)
96+
.build()) {
97+
for (Group row : rows) {
98+
writer.write(row);
99+
}
100+
}
101+
}
102+
103+
@TearDown(Level.Trial)
104+
public void tearDown() {
105+
if (tempFile != null && tempFile.exists()) {
106+
tempFile.delete();
107+
}
108+
}
109+
110+
@Benchmark
111+
public void readFile(Blackhole bh) throws IOException {
112+
InputFile inputFile = new LocalInputFile(tempFile.toPath());
113+
try (ParquetReader<Group> reader = new ParquetReader.Builder<Group>(inputFile) {
114+
@Override
115+
protected ReadSupport<Group> getReadSupport() {
116+
return new GroupReadSupport();
117+
}
118+
}.build()) {
119+
Group group;
120+
while ((group = reader.read()) != null) {
121+
bh.consume(group);
122+
}
123+
}
124+
}
125+
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.benchmarks;
20+
21+
import java.io.IOException;
22+
import java.util.concurrent.TimeUnit;
23+
import org.apache.parquet.column.ParquetProperties.WriterVersion;
24+
import org.apache.parquet.example.data.Group;
25+
import org.apache.parquet.hadoop.ParquetFileWriter;
26+
import org.apache.parquet.hadoop.ParquetWriter;
27+
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
28+
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
29+
import org.openjdk.jmh.annotations.Benchmark;
30+
import org.openjdk.jmh.annotations.BenchmarkMode;
31+
import org.openjdk.jmh.annotations.Fork;
32+
import org.openjdk.jmh.annotations.Level;
33+
import org.openjdk.jmh.annotations.Measurement;
34+
import org.openjdk.jmh.annotations.Mode;
35+
import org.openjdk.jmh.annotations.OutputTimeUnit;
36+
import org.openjdk.jmh.annotations.Param;
37+
import org.openjdk.jmh.annotations.Scope;
38+
import org.openjdk.jmh.annotations.Setup;
39+
import org.openjdk.jmh.annotations.State;
40+
import org.openjdk.jmh.annotations.Warmup;
41+
42+
/**
43+
* File-level write benchmarks measuring end-to-end Parquet write throughput through the
44+
* example {@link Group} API. Row contents are pre-generated during setup so compression
45+
* and writer settings dominate the timed section, while writes still flow through the
46+
* full Parquet writer path.
47+
*
48+
* <p>Writes are sent to a {@link BlackHoleOutputFile} to isolate CPU and encoding cost
49+
* from filesystem I/O. Parameterized across compression codec, writer version, and
50+
* dictionary encoding.
51+
*
52+
* <p>{@link Mode#SingleShotTime} is used because each invocation does enough work
53+
* (a full write of {@value TestDataFactory#DEFAULT_ROW_COUNT} rows) that JIT
54+
* amortization across invocations is unnecessary. Ten measurement iterations
55+
* provide stable statistics for SS mode.
56+
*/
57+
@BenchmarkMode(Mode.SingleShotTime)
58+
@Fork(1)
59+
@Warmup(iterations = 5, batchSize = 1)
60+
@Measurement(iterations = 10, batchSize = 1)
61+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
62+
@State(Scope.Benchmark)
63+
public class FileWriteBenchmark {
64+
65+
@Param({"UNCOMPRESSED", "SNAPPY", "ZSTD", "GZIP"})
66+
public String codec;
67+
68+
@Param({"PARQUET_1_0", "PARQUET_2_0"})
69+
public String writerVersion;
70+
71+
@Param({"true", "false"})
72+
public String dictionary;
73+
74+
private Group[] rows;
75+
76+
@Setup(Level.Trial)
77+
public void setup() {
78+
rows = TestDataFactory.generateRows(
79+
TestDataFactory.newGroupFactory(), TestDataFactory.DEFAULT_ROW_COUNT, TestDataFactory.DEFAULT_SEED);
80+
}
81+
82+
@Benchmark
83+
public void writeFile() throws IOException {
84+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(BlackHoleOutputFile.INSTANCE)
85+
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
86+
.withType(TestDataFactory.FILE_BENCHMARK_SCHEMA)
87+
.withCompressionCodec(CompressionCodecName.valueOf(codec))
88+
.withWriterVersion(WriterVersion.valueOf(writerVersion))
89+
.withDictionaryEncoding(Boolean.parseBoolean(dictionary))
90+
.build()) {
91+
for (Group row : rows) {
92+
writer.write(row);
93+
}
94+
}
95+
}
96+
}

0 commit comments

Comments
 (0)