Skip to content

Commit f5416d7

Browse files
committed
GH-561 Variant benchmarks
* Move under variant package to access private members * add "variant" group to run.sh
1 parent b9a5d32 commit f5416d7

File tree

4 files changed

+163
-35
lines changed

4 files changed

+163
-35
lines changed

parquet-benchmarks/run.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ read | Reading files with different compression, page and block sizes.
4343
write | Writing files.
4444
checksum | Reading and writing with and without CRC checksums.
4545
filter | Filtering column indexes
46+
variant | Variant performance
4647
4748
Examples:
4849
@@ -101,7 +102,7 @@ else
101102
BENCHMARK_REGEX="org.apache.parquet.benchmarks.FilteringBenchmarks"
102103
;;
103104
"variant")
104-
BENCHMARK_REGEX="org.apache.parquet.benchmarks.Variant*"
105+
BENCHMARK_REGEX="org.apache.parquet.variant.Variant*"
105106
;;
106107
esac
107108

parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/VariantBenchmark.java renamed to parquet-benchmarks/src/main/java/org/apache/parquet/variant/VariantBuilderBenchmark.java

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* specific language governing permissions and limitations
1717
* under the License.
1818
*/
19-
package org.apache.parquet.benchmarks;
19+
package org.apache.parquet.variant;
2020

2121
import java.io.ByteArrayOutputStream;
2222
import java.io.EOFException;
@@ -54,12 +54,6 @@
5454
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
5555
import org.apache.parquet.schema.Type.Repetition;
5656
import org.apache.parquet.schema.Types;
57-
import org.apache.parquet.variant.ImmutableMetadata;
58-
import org.apache.parquet.variant.Variant;
59-
import org.apache.parquet.variant.VariantBuilder;
60-
import org.apache.parquet.variant.VariantConverters;
61-
import org.apache.parquet.variant.VariantObjectBuilder;
62-
import org.apache.parquet.variant.VariantValueWriter;
6357
import org.openjdk.jmh.annotations.Benchmark;
6458
import org.openjdk.jmh.annotations.BenchmarkMode;
6559
import org.openjdk.jmh.annotations.Fork;
@@ -99,7 +93,7 @@
9993
*
10094
* <pre>
10195
* ./mvnw --projects parquet-benchmarks -amd -DskipTests -Denforcer.skip=true clean package
102-
* ./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.VariantBenchmark \
96+
* ./parquet-benchmarks/run.sh all org.apache.parquet.variant.VariantBuilderBenchmark \
10397
* -wi 5 -i 5 -f 1 -rff target/results.json
10498
* </pre>
10599
*/
@@ -110,9 +104,9 @@
110104
@BenchmarkMode(Mode.SingleShotTime)
111105
@OutputTimeUnit(TimeUnit.MICROSECONDS)
112106
@Timeout(time = 10, timeUnit = TimeUnit.MINUTES)
113-
public class VariantBenchmark {
107+
public class VariantBuilderBenchmark {
114108

115-
private static final Logger LOG = LoggerFactory.getLogger(VariantBenchmark.class);
109+
private static final Logger LOG = LoggerFactory.getLogger(VariantBuilderBenchmark.class);
116110

117111
/** Whether to include nested sub-objects in the field values. */
118112
public enum Depth {
@@ -121,7 +115,6 @@ public enum Depth {
121115
/** Nested values. */
122116
Nested,
123117
}
124-
125118
/**
126119
* Iterations on the small benchmarks whose operations are so fast that clocks, especially ARM clocks,
127120
* can't reliably measure them.
@@ -232,27 +225,26 @@ public FieldEntry(final FieldType type, final Object value) {
232225

233226
/** Ordered list of top-level field names, e.g. "field_0" … "field_N-1". */
234227
private List<String> fieldNames;
235-
236228
/**
237229
* Some types are pregenerated to keep RNG costs out of the benchmark, placed in generic object map then
238230
* cast to the correct type.
239231
*/
240232
private FieldEntry[] fieldValues;
241-
242233
/**
243234
* Indices of fields that are strings, used when constructing nested sub-objects so that nested
244235
* fields share the top-level field-name dictionary.
245236
*/
246237
private int[] stringFieldIndices;
247238

239+
/**
240+
* How many string fields were generated.
241+
*/
248242
private int stringFieldCount;
249-
250243
/**
251244
* A pre-built {@link Variant} for all benchmarks which want to keep build costs out
252245
* of their measurements.
253246
*/
254247
private Variant preBuiltVariant;
255-
256248
/**
257249
* Fixed random seed for reproducibility across runs. The same seed is used in the Iceberg
258250
* benchmark.
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.parquet.variant;
20+
21+
import static java.util.concurrent.TimeUnit.MICROSECONDS;
22+
23+
import java.nio.charset.StandardCharsets;
24+
import java.util.Random;
25+
import java.util.concurrent.TimeUnit;
26+
import org.apache.parquet.io.api.Binary;
27+
import org.openjdk.jmh.annotations.Benchmark;
28+
import org.openjdk.jmh.annotations.BenchmarkMode;
29+
import org.openjdk.jmh.annotations.Fork;
30+
import org.openjdk.jmh.annotations.Measurement;
31+
import org.openjdk.jmh.annotations.Mode;
32+
import org.openjdk.jmh.annotations.OutputTimeUnit;
33+
import org.openjdk.jmh.annotations.Param;
34+
import org.openjdk.jmh.annotations.Scope;
35+
import org.openjdk.jmh.annotations.Setup;
36+
import org.openjdk.jmh.annotations.State;
37+
import org.openjdk.jmh.annotations.Timeout;
38+
import org.openjdk.jmh.annotations.Warmup;
39+
import org.openjdk.jmh.infra.Blackhole;
40+
import org.slf4j.Logger;
41+
import org.slf4j.LoggerFactory;
42+
43+
/**
44+
* Benchmark {@link VariantConverters}. These converters are used
45+
* when reconstructing shredded variants, so their performance
46+
* and memory consumption is on the critical path of queries reading
47+
* variants.
48+
* <p>Run:
49+
* <pre>
50+
* ./run.sh all org.apache.parquet.variant.VariantConverterBenchmark \
51+
* -f 1 -foe true -rf json -rff target/results.json
52+
* </pre>
53+
* <p>Profile
54+
* <pre>
55+
* java -jar target/parquet-benchmarks.jar VariantProjectionBenchmark \
56+
* -prof "async:output=flamegraph;dir=target/perf" -rf json -rff target/results.json
57+
* </pre>
58+
* */
59+
@Fork(1)
60+
@State(Scope.Benchmark)
61+
@Warmup(iterations = 100)
62+
@Measurement(iterations = 500)
63+
@BenchmarkMode(Mode.SingleShotTime)
64+
@OutputTimeUnit(MICROSECONDS)
65+
@Timeout(time = 10, timeUnit = TimeUnit.MINUTES)
66+
public class VariantConverterBenchmark {
67+
68+
private static final Logger LOG = LoggerFactory.getLogger(VariantConverterBenchmark.class);
69+
70+
/** Number of iterations within each benchmark.
71+
* This compensates for the low resolution of ARM CPUs,
72+
* while also ensuring profile graphs are filled out in detail. */
73+
private static final int INNER_ITERATIONS = 100_000;
74+
75+
/** How long is the string to convert? */
76+
@Param({"16", "512", "2048"})
77+
int stringLength;
78+
79+
/**
80+
* An input string, created in setup from random ASCII characters of length {@link #stringLength}
81+
*/
82+
private String inputString;
83+
84+
/** {@link #inputString} as a binary. */
85+
private Binary inputBinary;
86+
87+
@Setup
88+
public void setup() {
89+
byte[] bytes = new byte[stringLength];
90+
Random random = new Random(42);
91+
for (int i = 0; i < stringLength; i++) {
92+
bytes[i] = (byte) ('a' + random.nextInt(26));
93+
}
94+
inputString = new String(bytes, StandardCharsets.UTF_8);
95+
inputBinary = Binary.fromConstantByteArray(bytes);
96+
LOG.info("Setup: stringLength={}", stringLength);
97+
}
98+
99+
/**
100+
* Benchmark converting a {@link Binary} as a string value and building a {@link Variant}.
101+
* This exercises the path taken by {@link VariantConverters} when decoding a shredded
102+
* {@code typed_value} string column.
103+
*/
104+
@Benchmark
105+
public void appendStringAsString(Blackhole blackhole) {
106+
for (int i = 0; i < INNER_ITERATIONS; i++) {
107+
VariantBuilder builder = new VariantBuilder();
108+
builder.appendString(inputString);
109+
blackhole.consume(builder.build());
110+
}
111+
}
112+
113+
/**
114+
* Benchmark converting a {@link Binary} as a string value and building a {@link Variant}.
115+
* This exercises the path taken by {@link VariantConverters} when decoding a shredded
116+
* {@code typed_value} string column.
117+
*/
118+
@Benchmark
119+
public void appendStringAsBinary(Blackhole blackhole) {
120+
for (int i = 0; i < INNER_ITERATIONS; i++) {
121+
VariantBuilder builder = new VariantBuilder();
122+
builder.appendAsString(inputBinary);
123+
blackhole.consume(builder.build());
124+
}
125+
}
126+
127+
/**
128+
* Benchmark appending a {@link Binary} as a binary value and building a {@link Variant}.
129+
* This exercises the path taken by {@link VariantConverters} when decoding a shredded
130+
* {@code typed_value} binary column.
131+
*/
132+
@Benchmark
133+
public void appendBinary(Blackhole blackhole) {
134+
for (int i = 0; i < INNER_ITERATIONS; i++) {
135+
VariantBuilder builder = new VariantBuilder();
136+
builder.appendBinary(inputBinary.toByteBuffer());
137+
blackhole.consume(builder.build());
138+
}
139+
}
140+
}

parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/VariantProjectionBenchmark.java renamed to parquet-benchmarks/src/main/java/org/apache/parquet/variant/VariantProjectionBenchmark.java

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
* limitations under the License.
1717
*/
1818

19-
package org.apache.parquet.benchmarks;
19+
package org.apache.parquet.variant;
2020

2121
import static java.util.concurrent.TimeUnit.MILLISECONDS;
2222
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
@@ -31,6 +31,7 @@
3131
import org.apache.hadoop.fs.FileStatus;
3232
import org.apache.hadoop.fs.FileSystem;
3333
import org.apache.hadoop.fs.Path;
34+
import org.apache.parquet.benchmarks.BenchmarkFiles;
3435
import org.apache.parquet.hadoop.ParquetReader;
3536
import org.apache.parquet.hadoop.ParquetWriter;
3637
import org.apache.parquet.hadoop.api.InitContext;
@@ -47,12 +48,6 @@
4748
import org.apache.parquet.io.api.RecordMaterializer;
4849
import org.apache.parquet.schema.GroupType;
4950
import org.apache.parquet.schema.MessageType;
50-
import org.apache.parquet.variant.ImmutableMetadata;
51-
import org.apache.parquet.variant.Variant;
52-
import org.apache.parquet.variant.VariantBuilder;
53-
import org.apache.parquet.variant.VariantConverters;
54-
import org.apache.parquet.variant.VariantObjectBuilder;
55-
import org.apache.parquet.variant.VariantValueWriter;
5651
import org.openjdk.jmh.annotations.Benchmark;
5752
import org.openjdk.jmh.annotations.BenchmarkMode;
5853
import org.openjdk.jmh.annotations.Fork;
@@ -85,15 +80,15 @@
8580
*
8681
* <pre>
8782
* ./mvnw --projects parquet-benchmarks -amd -DskipTests -Denforcer.skip=true clean package
88-
* ./parquet-benchmarks/run.sh all org.apache.parquet.benchmarks.VariantProjectionBenchmark \
83+
* ./parquet-benchmarks/run.sh all org.apache.parquet.variant.VariantProjectionBenchmark \
8984
* -wi 3 -i 5 -f 1 -foe true -rf json -rff target/results.json
9085
* </pre>
9186
*
9287
*/
9388
@Fork(1)
9489
@State(Scope.Benchmark)
95-
@Warmup(iterations = 3)
96-
@Measurement(iterations = 5)
90+
@Warmup(iterations = 5)
91+
@Measurement(iterations = 10)
9792
@BenchmarkMode(Mode.SingleShotTime)
9893
@OutputTimeUnit(MILLISECONDS)
9994
@Timeout(time = 10, timeUnit = TimeUnit.MINUTES)
@@ -109,9 +104,9 @@ public class VariantProjectionBenchmark {
109104
* The per-record metadata declares that.
110105
*/
111106
public static final String UNSHREDDED_SCHEMA = "message vschema {"
112-
+ "required int64 id;"
113-
+ "required int32 category;"
114-
+ "optional group nested (VARIANT(1)) {"
107+
+ "required int64 id = 1;"
108+
+ "required int32 category = 2;"
109+
+ "required group nested (VARIANT(1)) = 3 {"
115110
+ " required binary metadata;"
116111
+ " required binary value;"
117112
+ " }"
@@ -121,9 +116,9 @@ public class VariantProjectionBenchmark {
121116
* Detailed specification declaring all the columns as shredded variants.
122117
*/
123118
public static final String SHREDDED_SCHEMA = "message vschema {"
124-
+ "required int64 id;"
125-
+ "required int32 category;"
126-
+ "optional group nested (VARIANT(1)) {"
119+
+ "required int64 id = 1;"
120+
+ "required int32 category = 2;"
121+
+ "required group nested (VARIANT(1)) = 3 {"
127122
+ " required binary metadata;"
128123
+ " optional binary value;"
129124
+ " optional group typed_value {"
@@ -152,9 +147,9 @@ public class VariantProjectionBenchmark {
152147
* only the variant column desired.
153148
*/
154149
public static final String SELECT_SCHEMA = "message vschema {"
155-
+ "required int64 id;"
156-
+ "required int32 category;"
157-
+ "optional group nested (VARIANT(1)) {"
150+
+ "required int64 id = 1;"
151+
+ "required int32 category = 2;"
152+
+ "required group nested (VARIANT(1)) = 3 {"
158153
+ " required binary metadata;"
159154
+ " optional binary value;"
160155
+ " optional group typed_value {"

0 commit comments

Comments
 (0)