Skip to content

Commit c1c333d

Browse files
committed
GH-561 ongoing work
1 parent 77bfce4 commit c1c333d

File tree

2 files changed

+163
-212
lines changed

2 files changed

+163
-212
lines changed

parquet-benchmarks/src/main/java/org/apache/parquet/benchmarks/VariantBenchmark.java

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -429,17 +429,19 @@ public void serializeVariant(Blackhole bh) {
429429
* the field-name lookup and type dispatch that a query engine performs on every row. Nested
430430
* objects are recursively traversed so that {@code depth=Nested} incurs the full deserialization
431431
* cost of sub-objects.
432+
* @param blackhole black hole.
432433
*/
433434
@Benchmark
434-
public void deserializeVariant(Blackhole bh) {
435+
public void deserializeVariant(Blackhole blackhole) {
435436
for (int j = 0; j < ITERATIONS; j++) {
436-
deserializeAndConsume(preBuiltVariant, bh);
437+
deserializeAndConsume(preBuiltVariant, blackhole);
437438
}
438439
}
439440

440441
/**
441442
* Shred the pre-built variant into a fully typed schema. Measures the cost of type dispatch,
442443
* field matching, and recursive decomposition that {@link VariantValueWriter} performs.
444+
* @param blackhole black hole.
443445
*/
444446
@Benchmark
445447
public void consumeRecordsShredded(Blackhole blackhole) {
@@ -453,6 +455,7 @@ public void consumeRecordsShredded(Blackhole blackhole) {
453455
* Write {@link #FILE_ROWS} rows of the pre-built variant to an in-memory Parquet file using the
454456
* shredded schema. Measures end-to-end Parquet encoding cost including page/row-group framing.
455457
* Compare with {@link #consumeRecordsShredded} to quantify the overhead over raw schema traversal.
458+
* @param blackhole black hole.
456459
*/
457460
@Benchmark
458461
public void writeToMemoryFile(Blackhole blackhole) throws IOException {
@@ -463,6 +466,7 @@ public void writeToMemoryFile(Blackhole blackhole) throws IOException {
463466
* Write the pre-built variant to an unshredded schema (metadata + value only).
464467
* This is the baseline: the entire variant is written as a single binary blob.
465468
* Compare with {@link #consumeRecordsShredded} to see the cost of shredding.
469+
* @param blackhole black hole.
466470
*/
467471
@Benchmark
468472
public void consumeRecordsUnshredded(Blackhole blackhole) {
@@ -475,6 +479,7 @@ public void consumeRecordsUnshredded(Blackhole blackhole) {
475479
/**
476480
* Write {@link #FILE_ROWS} rows of the pre-built variant to an in-memory Parquet file using the
477481
* unshredded schema (metadata + value binary blobs only). Baseline for {@link #writeToMemoryFile}.
482+
* @param blackhole black hole.
478483
*/
479484
@Benchmark
480485
public void writeToMemoryUnshredded(Blackhole blackhole) throws IOException {
@@ -484,6 +489,8 @@ public void writeToMemoryUnshredded(Blackhole blackhole) throws IOException {
484489
/**
485490
* Read all rows from the pre-written shredded Parquet file in memory. Measures full Parquet
486491
* decode cost including typed column decoding and Variant reassembly.
492+
* @param blackhole black hole.
493+
* @throws IOException IO failure.
487494
*/
488495
@Benchmark
489496
public void readFileShredded(Blackhole blackhole) throws IOException {
@@ -494,10 +501,12 @@ public void readFileShredded(Blackhole blackhole) throws IOException {
494501
/**
495502
* Read all rows from the pre-written unshredded Parquet file in memory. Baseline for
496503
* {@link #readFileShredded}: measures raw binary blob read with no typed column decoding.
504+
* @param blackhole black hole.
505+
* @throws IOException IO failure.
497506
*/
498507
@Benchmark
499-
public void readFileUnshredded(Blackhole bh) throws IOException {
500-
consumeInputFile(bh, new ByteArrayInputFile(unshreddedFileBytes));
508+
public void readFileUnshredded(Blackhole blackhole) throws IOException {
509+
consumeInputFile(blackhole, new ByteArrayInputFile(unshreddedFileBytes));
501510
}
502511

503512
// ------------------------------------------------------------------
@@ -522,13 +531,15 @@ private Variant buildVariant() {
522531

523532
/**
524533
* Append the value for field {@code index} to {@code ob} according to its type, building nested objects on demand.
534+
* @param ob object
535+
* @param index index
525536
*/
526-
private void appendFieldValue(VariantObjectBuilder ob, int i) {
527-
final FieldEntry entry = fieldValues[i];
537+
private void appendFieldValue(VariantObjectBuilder ob, int index) {
538+
final FieldEntry entry = fieldValues[index];
528539
// special handling of nested.
529540
if (entry.type == FieldType.Nested) {
530541
if (depth == Depth.Nested && stringFieldCount > 0) {
531-
appendNestedObject(ob, i);
542+
appendNestedObject(ob, index);
532543
} else {
533544
// outlier.
534545
ob.appendNull();
@@ -541,6 +552,9 @@ private void appendFieldValue(VariantObjectBuilder ob, int i) {
541552
/**
542553
* Append a nested sub-object with {@link #NESTED_FIELD_COUNT} string fields. Field names are
543554
* drawn from the set of top-level string fields so the nested dictionary overlaps with the parent.
555+
*
556+
* @param parentOb parent object.
557+
* @param parentIndex parent index.
544558
*/
545559
private void appendNestedObject(VariantObjectBuilder parentOb, int parentIndex) {
546560
// VariantObjectBuilder does not expose startObject() for nesting directly;
@@ -563,6 +577,7 @@ private void appendNestedObject(VariantObjectBuilder parentOb, int parentIndex)
563577
/**
564578
* Build a shredded schema with typed_value columns matching each field's type.
565579
* For nested fields, the typed_value is an object group with string sub-fields.
580+
* @return the group type for a shredded object.
566581
*/
567582
private GroupType buildShreddedSchema() {
568583
Types.GroupBuilder<GroupType> typedValueBuilder = Types.optionalGroup();
@@ -586,16 +601,21 @@ private GroupType buildShreddedSchema() {
586601
.named("variant_field");
587602
}
588603

589-
/** Recursively deserialize a variant object, descending into any nested objects. */
590-
private void deserializeAndConsume(Variant v, Blackhole bh) {
591-
int n = v.numObjectElements();
604+
/**
605+
* Recursively deserialize a variant object, descending into any nested objects.
606+
*
607+
* @param variant variant to deserialize.
608+
* @param blackhole black hole.
609+
*/
610+
private void deserializeAndConsume(Variant variant, Blackhole blackhole) {
611+
int n = variant.numObjectElements();
592612
for (int i = 0; i < n; i++) {
593-
Variant.ObjectField field = v.getFieldAtIndex(i);
594-
bh.consume(field.key);
613+
Variant.ObjectField field = variant.getFieldAtIndex(i);
614+
blackhole.consume(field.key);
595615
if (field.value.getType() == Variant.Type.OBJECT) {
596-
deserializeAndConsume(field.value, bh);
616+
deserializeAndConsume(field.value, blackhole);
597617
} else {
598-
bh.consume(field.value.getValueBuffer());
618+
blackhole.consume(field.value.getValueBuffer());
599619
}
600620
}
601621
}
@@ -604,6 +624,10 @@ private void deserializeAndConsume(Variant v, Blackhole bh) {
604624
* Write {@link #FILE_ROWS} copies of {@link #preBuiltVariant} to a fresh in-memory Parquet file
605625
* using the given schema. Used both in {@link #setupTrial()} to pre-build read buffers and as the
606626
* body of the write-file benchmarks.
627+
*
628+
* @param schema group schema.
629+
* @return the bytes of an in-memory Parquet file.
630+
* @throws IOException IO failure.
607631
*/
608632
private byte[] writeVariantsToMemory(GroupType schema) throws IOException {
609633
ByteArrayOutputFile out = new ByteArrayOutputFile();
@@ -631,6 +655,7 @@ private void writeToMemory(final Blackhole blackhole, final GroupType schema) th
631655
* Consume an Input file.
632656
* @param blackhole black hole
633657
* @param inputFile input file
658+
* @throws IOException IO failure.
634659
*/
635660
private static void consumeInputFile(final Blackhole blackhole, final ByteArrayInputFile inputFile)
636661
throws IOException {

0 commit comments

Comments
 (0)