Skip to content

Commit 27d92e6

Browse files
committed
GH-3561 Harden variant decoding
- reject oversized metadata/value declarations - range checking Only low cost checks are made. There's also a depth check consistent with the json parser; it's arguable as to whether that is needed. It will defend against StackOverflowExceptions by anything trying to treewalk, but should that code be the place to do the checks? The new test suite TestHardenedReader can be configured to actually emit the malformed files, to see how applications deal with them. Contains contributions from Claude AI.
1 parent 7be05b4 commit 27d92e6

11 files changed

Lines changed: 821 additions & 122 deletions

File tree

parquet-variant/pom.xml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,17 @@
6969
<version>${slf4j.version}</version>
7070
<scope>test</scope>
7171
</dependency>
72+
<dependency>
73+
<groupId>org.apache.parquet</groupId>
74+
<artifactId>parquet-hadoop</artifactId>
75+
<version>${project.version}</version>
76+
<scope>test</scope>
77+
</dependency>
78+
<dependency>
79+
<groupId>org.apache.hadoop</groupId>
80+
<artifactId>hadoop-client</artifactId>
81+
<scope>test</scope>
82+
</dependency>
7283
</dependencies>
7384

7485
<build>

parquet-variant/src/main/java/org/apache/parquet/variant/Variant.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ public final class Variant {
6161
* Lazy cache for the parsed array header.
6262
*/
6363
private VariantUtil.ArrayInfo cachedArrayInfo;
64+
/** Nesting depth of this Variant relative to the top-level value (0 = top-level). */
65+
private final int depth;
6466

6567
/**
6668
* The threshold to switch from linear search to binary search when looking up a field by key in
@@ -84,7 +86,7 @@ public Variant(byte[] value, int valuePos, int valueLength, byte[] metadata, int
8486
public Variant(ByteBuffer value, ByteBuffer metadata) {
8587
this.value = value.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
8688
this.metadata = metadata.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
87-
89+
this.depth = 0;
8890
// There is currently only one allowed version.
8991
if ((metadata.get(metadata.position()) & VariantUtil.VERSION_MASK) != VariantUtil.VERSION) {
9092
throw new UnsupportedOperationException(String.format(
@@ -108,16 +110,23 @@ public Variant(ByteBuffer value, ByteBuffer metadata) {
108110
this.dictSize = 0;
109111
}
110112
this.metadataCache = null;
113+
VariantUtil.validateValueShallow(this.value, dictSize);
111114
}
112115

113116
/**
114117
* Package-private constructor that shares pre-parsed metadata state from a parent Variant.
115118
*/
116-
Variant(ByteBuffer value, ByteBuffer metadata, String[] metadataCache, int dictSize) {
119+
Variant(ByteBuffer value, ByteBuffer metadata, String[] metadataCache, int dictSize, int depth) {
120+
Preconditions.checkArgument(
121+
depth <= VariantUtil.MAX_VARIANT_DEPTH,
122+
"variant nesting depth exceeds maximum %s",
123+
VariantUtil.MAX_VARIANT_DEPTH);
117124
this.value = value.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
118125
this.metadata = metadata.asReadOnlyBuffer().order(ByteOrder.LITTLE_ENDIAN);
119126
this.metadataCache = metadataCache;
120127
this.dictSize = dictSize;
128+
this.depth = depth;
129+
VariantUtil.validateValueShallow(this.value, dictSize);
121130
}
122131

123132
public ByteBuffer getValueBuffer() {
@@ -361,7 +370,7 @@ public Variant getElementAtIndex(int index) {
361370
* Creates a child Variant that shares this instance's metadata cache.
362371
*/
363372
private Variant childVariant(ByteBuffer childValue) {
364-
return new Variant(childValue, metadata, metadataCache, dictSize);
373+
return new Variant(childValue, metadata, metadataCache, dictSize, depth + 1);
365374
}
366375

367376
/**

parquet-variant/src/main/java/org/apache/parquet/variant/VariantJsonParser.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
*/
1717
package org.apache.parquet.variant;
1818

19+
import static org.apache.parquet.variant.VariantUtil.MAX_VARIANT_DEPTH;
20+
1921
import com.fasterxml.jackson.core.JsonFactory;
2022
import com.fasterxml.jackson.core.JsonParseException;
2123
import com.fasterxml.jackson.core.JsonParser;
@@ -37,7 +39,7 @@ public final class VariantJsonParser {
3739

3840
private static final JsonFactory JSON_FACTORY = JsonFactory.builder()
3941
.streamReadConstraints(StreamReadConstraints.builder()
40-
.maxNestingDepth(500)
42+
.maxNestingDepth(MAX_VARIANT_DEPTH)
4143
.maxStringLength(10_000_000)
4244
.maxDocumentLength(50_000_000L)
4345
.build())

parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,12 @@ class VariantUtil {
189189
// The size (in bytes) of a UUID.
190190
static final int UUID_SIZE = 16;
191191

192+
/**
193+
* Maximum permitted nesting depth of a Variant value.
194+
* same limit as in VariantJsonParser.
195+
*/
196+
static final int MAX_VARIANT_DEPTH = 500;
197+
192198
// header bytes
193199
static final byte HEADER_NULL = primitiveHeader(NULL);
194200
static final byte HEADER_LONG_STRING = primitiveHeader(LONG_STR);
@@ -874,6 +880,160 @@ static HashMap<String, Integer> getMetadataMap(ByteBuffer metadata) {
874880
return result;
875881
}
876882

883+
/**
884+
* Bounds-checks the metadata buffer: header version, dictionary offset table and string data
885+
* region all fit within the buffer extent. It does not perform any deep checks into
886+
* the metadata itself.
887+
*
888+
* @param metadata the variant metadata buffer
889+
* @return the dictionary size
890+
* @throws IllegalArgumentException if the metadata buffer is not well-formed
891+
*/
892+
static int validateMetadata(ByteBuffer metadata) {
893+
int pos = metadata.position();
894+
Preconditions.checkArgument(pos >= 0 && pos < metadata.limit(), "variant metadata is empty");
895+
int header = metadata.get(pos) & 0xFF;
896+
Preconditions.checkArgument(
897+
(header & VERSION_MASK) == VERSION, "Unsupported variant metadata version: %s", header & VERSION_MASK);
898+
int offsetSize = ((header >> 6) & 0x3) + 1;
899+
long remaining = (long) metadata.limit() - pos;
900+
long offsetListStart = 1L + offsetSize;
901+
Preconditions.checkArgument(offsetListStart <= remaining, "variant metadata truncated");
902+
int dictSize = readUnsigned(metadata, pos + 1, offsetSize);
903+
long offsetBytes = (long) (dictSize + 1) * offsetSize;
904+
long dataStart = offsetListStart + offsetBytes;
905+
Preconditions.checkArgument(
906+
dataStart <= remaining, "variant metadata dictionary table extends past buffer: dictSize=%s", dictSize);
907+
return dictSize;
908+
}
909+
910+
/**
911+
* Bounds-checks a single Variant value node against its buffer slot. Performs no recursion
912+
* into nested children: child nodes are checked on demand when callers descend into them.
913+
*
914+
* <p>Cost: O(1) for primitives and short strings, O(numElements) for objects and arrays.
915+
* Validation of nested structures is deferred so that opening a large well-formed Variant
916+
* is not penalised by sub-trees the caller never inspects.
917+
*
918+
* @param value the variant value buffer (position/limit define the extent of this node's slot)
919+
* @param dictSize the metadata dictionary size, used to bound object field ids
920+
* @throws IllegalArgumentException if the value header or container table does not fit within
921+
* the buffer slot, or if any object field id is out of range
922+
*/
923+
static void validateValueShallow(ByteBuffer value, int dictSize) {
924+
int s = value.position();
925+
Preconditions.checkArgument(s >= 0 && s < value.limit(), "variant value is empty");
926+
long slot = (long) value.limit() - s;
927+
int header = value.get(s) & 0xFF;
928+
int basicType = header & BASIC_TYPE_MASK;
929+
int typeInfo = (header >> BASIC_TYPE_BITS) & PRIMITIVE_TYPE_MASK;
930+
switch (basicType) {
931+
case SHORT_STR:
932+
Preconditions.checkArgument(1L + typeInfo <= slot, "variant short string extends past buffer");
933+
return;
934+
case OBJECT:
935+
validateContainerShallow(value, s, slot, dictSize, true, typeInfo);
936+
return;
937+
case ARRAY:
938+
validateContainerShallow(value, s, slot, dictSize, false, typeInfo);
939+
return;
940+
default:
941+
validatePrimitiveShallow(value, s, slot, typeInfo);
942+
}
943+
}
944+
945+
private static void validateContainerShallow(
946+
ByteBuffer value, int s, long slot, int dictSize, boolean isObject, int typeInfo) {
947+
boolean largeSize;
948+
int idSize;
949+
if (isObject) {
950+
largeSize = ((typeInfo >> 4) & 0x1) != 0;
951+
idSize = ((typeInfo >> 2) & 0x3) + 1;
952+
} else {
953+
largeSize = ((typeInfo >> 2) & 0x1) != 0;
954+
idSize = 0;
955+
}
956+
int offsetSize = (typeInfo & 0x3) + 1;
957+
int sizeBytes = largeSize ? U32_SIZE : 1;
958+
Preconditions.checkArgument(1L + sizeBytes <= slot, "variant container header truncated");
959+
int numElements = readUnsigned(value, s + 1, sizeBytes);
960+
long idStart = 1L + sizeBytes;
961+
long idBytes = isObject ? (long) numElements * idSize : 0L;
962+
long offsetStart = idStart + idBytes;
963+
long offsetBytes = (long) (numElements + 1) * offsetSize;
964+
long dataStart = offsetStart + offsetBytes;
965+
Preconditions.checkArgument(
966+
dataStart <= slot, "variant container offset table extends past buffer: numElements=%s", numElements);
967+
long dataLen = slot - dataStart;
968+
if (isObject) {
969+
for (int i = 0; i < numElements; i++) {
970+
int id = readUnsigned(value, s + (int) idStart + i * idSize, idSize);
971+
Preconditions.checkArgument(
972+
id < dictSize, "variant object key id %s out of range (dictSize=%s)", id, dictSize);
973+
}
974+
}
975+
// Each child offset must lie within the data region. Children may overlap or leave gaps;
976+
// the trailing terminator offset is range-checked for the same reason.
977+
for (int i = 0; i <= numElements; i++) {
978+
// O(elements)
979+
int off = readUnsigned(value, s + (int) offsetStart + i * offsetSize, offsetSize);
980+
Preconditions.checkArgument(
981+
off <= dataLen, "variant child offset out of range: %s (data length %s)", off, dataLen);
982+
}
983+
}
984+
985+
private static void validatePrimitiveShallow(ByteBuffer value, int s, long slot, int typeInfo) {
986+
long size;
987+
switch (typeInfo) {
988+
case NULL:
989+
case TRUE:
990+
case FALSE:
991+
size = 1;
992+
break;
993+
case INT8:
994+
size = 2;
995+
break;
996+
case INT16:
997+
size = 3;
998+
break;
999+
case INT32:
1000+
case DATE:
1001+
case FLOAT:
1002+
size = 5;
1003+
break;
1004+
case INT64:
1005+
case DOUBLE:
1006+
case TIMESTAMP_TZ:
1007+
case TIMESTAMP_NTZ:
1008+
case TIME:
1009+
case TIMESTAMP_NANOS_TZ:
1010+
case TIMESTAMP_NANOS_NTZ:
1011+
size = 9;
1012+
break;
1013+
case DECIMAL4:
1014+
size = 6;
1015+
break;
1016+
case DECIMAL8:
1017+
size = 10;
1018+
break;
1019+
case DECIMAL16:
1020+
size = 18;
1021+
break;
1022+
case BINARY:
1023+
case LONG_STR: {
1024+
Preconditions.checkArgument(1L + U32_SIZE <= slot, "variant string/binary length field truncated");
1025+
size = 1L + U32_SIZE + readUnsigned(value, s + 1, U32_SIZE);
1026+
break;
1027+
}
1028+
case UUID:
1029+
size = 1L + UUID_SIZE;
1030+
break;
1031+
default:
1032+
throw new IllegalArgumentException(String.format("Unknown primitive type in variant: %d", typeInfo));
1033+
}
1034+
Preconditions.checkArgument(size <= slot, "variant value extends past buffer");
1035+
}
1036+
8771037
/**
8781038
* Computes the actual size (in bytes) of the Variant value.
8791039
* @param value The Variant value binary

0 commit comments

Comments
 (0)