|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | +package org.apache.parquet.variant; |
| 20 | + |
| 21 | +import java.nio.ByteBuffer; |
| 22 | +import java.nio.ByteOrder; |
| 23 | +import java.nio.charset.StandardCharsets; |
| 24 | +import java.security.SecureRandom; |
| 25 | +import java.time.LocalDate; |
| 26 | +import java.util.*; |
| 27 | +import java.util.function.Consumer; |
| 28 | +import org.junit.Assert; |
| 29 | +import org.junit.Test; |
| 30 | +import org.slf4j.Logger; |
| 31 | +import org.slf4j.LoggerFactory; |
| 32 | + |
| 33 | +public class TestVariantArray { |
| 34 | + private static final Logger LOG = LoggerFactory.getLogger(TestVariantArray.class); |
| 35 | + private static final String RANDOM_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; |
| 36 | + |
| 37 | + /** Random number generator for generating random strings */ |
| 38 | + private static SecureRandom random = new SecureRandom(new byte[] {1, 2, 3, 4, 5}); |
| 39 | + |
| 40 | + private static final ByteBuffer EMPTY_METADATA = ByteBuffer.wrap(new byte[] {0b1}); |
| 41 | + |
| 42 | + private static final byte[] VALUE_NULL = new byte[] {primitiveHeader(0)}; |
| 43 | + private static final byte[] VALUE_BOOL = new byte[] {primitiveHeader(1)}; |
| 44 | + private static final byte[] VALUE_INT = new byte[] {primitiveHeader(5), (byte) 0xD2, 0x02, (byte) 0x96, 0x49}; |
| 45 | + private static final byte[] VALUE_STRING = |
| 46 | + new byte[] {primitiveHeader(16), 0x07, 0x00, 0x00, 0x00, 'v', 'a', 'r', 'i', 'a', 'n', 't'}; |
| 47 | + private static final byte[] VALUE_SHORT_STRING = new byte[] {0b101, 'c'}; |
| 48 | + private static final byte[] VALUE_DATE = new byte[] {0b101100, (byte) 0xE3, 0x4E, 0x00, 0x00}; |
| 49 | + |
| 50 | + private void checkType(Variant v, int expectedBasicType, VariantUtil.Type expectedType) { |
| 51 | + Assert.assertEquals(expectedBasicType, v.value.get(v.value.position()) & VariantUtil.BASIC_TYPE_MASK); |
| 52 | + Assert.assertEquals(expectedType, v.getType()); |
| 53 | + } |
| 54 | + |
| 55 | + private String randomString(int len) { |
| 56 | + StringBuilder sb = new StringBuilder(len); |
| 57 | + for (int i = 0; i < len; i++) { |
| 58 | + sb.append(RANDOM_CHARS.charAt(random.nextInt(RANDOM_CHARS.length()))); |
| 59 | + } |
| 60 | + return sb.toString(); |
| 61 | + } |
| 62 | + |
| 63 | + private void testVariant(Variant v, Consumer<Variant> consumer) { |
| 64 | + consumer.accept(v); |
| 65 | + // Create new Variant with different byte offsets |
| 66 | + byte[] newValue = new byte[v.value.capacity() + 50]; |
| 67 | + byte[] newMetadata = new byte[v.metadata.capacity() + 50]; |
| 68 | + Arrays.fill(newValue, (byte) 0xFF); |
| 69 | + Arrays.fill(newMetadata, (byte) 0xFF); |
| 70 | + v.value.position(0); |
| 71 | + v.value.get(newValue, 25, v.value.capacity()); |
| 72 | + v.value.position(0); |
| 73 | + v.metadata.position(0); |
| 74 | + v.metadata.get(newMetadata, 25, v.metadata.capacity()); |
| 75 | + v.metadata.position(0); |
| 76 | + Variant v2 = new Variant( |
| 77 | + ByteBuffer.wrap(newValue, 25, v.value.capacity()), |
| 78 | + ByteBuffer.wrap(newMetadata, 25, v.metadata.capacity())); |
| 79 | + consumer.accept(v2); |
| 80 | + } |
| 81 | + |
| 82 | + private static byte primitiveHeader(int type) { |
| 83 | + return (byte) (type << 2); |
| 84 | + } |
| 85 | + |
| 86 | + private static int getMinIntegerSize(int value) { |
| 87 | + return (value <= 0xFF) ? 1 : (value <= 0xFFFF) ? 2 : (value <= 0xFFFFFF) ? 3 : 4; |
| 88 | + } |
| 89 | + |
| 90 | + private static void writeVarlenInt(ByteBuffer buffer, int value, int valueSize) { |
| 91 | + if (valueSize == 1) { |
| 92 | + buffer.put((byte) value); |
| 93 | + } else if (valueSize == 2) { |
| 94 | + buffer.putShort((short) value); |
| 95 | + } else if (valueSize == 3) { |
| 96 | + buffer.put((byte) (value & 0xFF)); |
| 97 | + buffer.put((byte) ((value >> 8) & 0xFF)); |
| 98 | + buffer.put((byte) ((value >> 16) & 0xFF)); |
| 99 | + } else { |
| 100 | + buffer.putInt(value); |
| 101 | + } |
| 102 | + } |
| 103 | + |
| 104 | + private static byte[] constructString(String value) { |
| 105 | + return ByteBuffer.allocate(value.length() + 5) |
| 106 | + .order(ByteOrder.LITTLE_ENDIAN) |
| 107 | + .put(primitiveHeader(16)) |
| 108 | + .putInt(value.length()) |
| 109 | + .put(value.getBytes(StandardCharsets.UTF_8)) |
| 110 | + .array(); |
| 111 | + } |
| 112 | + |
| 113 | + private static byte[] constructArray(byte[]... elements) { |
| 114 | + int dataSize = 0; |
| 115 | + for (byte[] element : elements) { |
| 116 | + dataSize += element.length; |
| 117 | + } |
| 118 | + |
| 119 | + boolean isLarge = elements.length > 0xFF; |
| 120 | + int offsetSize = getMinIntegerSize(dataSize); |
| 121 | + int headerSize = 1 + (isLarge ? 4 : 1) + (elements.length + 1) * offsetSize; |
| 122 | + |
| 123 | + ByteBuffer output = ByteBuffer.allocate(headerSize + dataSize).order(ByteOrder.LITTLE_ENDIAN); |
| 124 | + |
| 125 | + output.put(VariantUtil.arrayHeader(isLarge, offsetSize)); |
| 126 | + |
| 127 | + if (isLarge) { |
| 128 | + output.putInt(elements.length); |
| 129 | + } else { |
| 130 | + output.put((byte) elements.length); |
| 131 | + } |
| 132 | + |
| 133 | + int currOffset = 0; |
| 134 | + for (int i = 0; i < elements.length; ++i) { |
| 135 | + writeVarlenInt(output, currOffset, offsetSize); |
| 136 | + currOffset += elements[i].length; |
| 137 | + } |
| 138 | + writeVarlenInt(output, currOffset, offsetSize); |
| 139 | + |
| 140 | + for (int i = 0; i < elements.length; ++i) { |
| 141 | + output.put(elements[i]); |
| 142 | + } |
| 143 | + output.flip(); |
| 144 | + return output.array(); |
| 145 | + } |
| 146 | + |
| 147 | + @Test |
| 148 | + public void testEmptyArray() { |
| 149 | + Variant value = new Variant(ByteBuffer.wrap(new byte[] {0b0011, 0x00}), EMPTY_METADATA); |
| 150 | + testVariant(value, v -> { |
| 151 | + checkType(v, VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 152 | + Assert.assertEquals(0, v.numArrayElements()); |
| 153 | + }); |
| 154 | + } |
| 155 | + |
| 156 | + @Test |
| 157 | + public void testEmptyLargeArray() { |
| 158 | + Variant value = new Variant(ByteBuffer.wrap(new byte[] {0b10011, 0x00, 0x00, 0x00, 0x00}), EMPTY_METADATA); |
| 159 | + testVariant(value, v -> { |
| 160 | + checkType(v, VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 161 | + Assert.assertEquals(0, v.numArrayElements()); |
| 162 | + }); |
| 163 | + } |
| 164 | + |
| 165 | + @Test |
| 166 | + public void testLargeArraySize() { |
| 167 | + Variant value = new Variant( |
| 168 | + ByteBuffer.wrap(new byte[] {0b10011, (byte) 0xFF, (byte) 0x01, 0x00, 0x00}), EMPTY_METADATA); |
| 169 | + testVariant(value, v -> { |
| 170 | + checkType(v, VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 171 | + Assert.assertEquals(511, v.numArrayElements()); |
| 172 | + }); |
| 173 | + } |
| 174 | + |
| 175 | + @Test |
| 176 | + public void testMixedArray() { |
| 177 | + byte[] nested = constructArray(VALUE_INT, VALUE_NULL, VALUE_SHORT_STRING); |
| 178 | + Variant value = new Variant( |
| 179 | + ByteBuffer.wrap(constructArray(VALUE_DATE, VALUE_BOOL, VALUE_INT, VALUE_STRING, nested)), |
| 180 | + EMPTY_METADATA); |
| 181 | + |
| 182 | + testVariant(value, v -> { |
| 183 | + checkType(v, VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 184 | + Assert.assertEquals(5, v.numArrayElements()); |
| 185 | + checkType(v.getElementAtIndex(0), VariantUtil.PRIMITIVE, VariantUtil.Type.DATE); |
| 186 | + Assert.assertEquals( |
| 187 | + LocalDate.parse("2025-04-17"), |
| 188 | + LocalDate.ofEpochDay(v.getElementAtIndex(0).getInt())); |
| 189 | + checkType(v.getElementAtIndex(1), VariantUtil.PRIMITIVE, VariantUtil.Type.BOOLEAN); |
| 190 | + Assert.assertTrue(v.getElementAtIndex(1).getBoolean()); |
| 191 | + checkType(v.getElementAtIndex(2), VariantUtil.PRIMITIVE, VariantUtil.Type.INT); |
| 192 | + Assert.assertEquals(1234567890, v.getElementAtIndex(2).getInt()); |
| 193 | + checkType(v.getElementAtIndex(3), VariantUtil.PRIMITIVE, VariantUtil.Type.STRING); |
| 194 | + Assert.assertEquals("variant", v.getElementAtIndex(3).getString()); |
| 195 | + checkType(v.getElementAtIndex(4), VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 196 | + |
| 197 | + Variant nestedV = v.getElementAtIndex(4); |
| 198 | + Assert.assertEquals(3, nestedV.numArrayElements()); |
| 199 | + checkType(nestedV.getElementAtIndex(0), VariantUtil.PRIMITIVE, VariantUtil.Type.INT); |
| 200 | + Assert.assertEquals(1234567890, nestedV.getElementAtIndex(0).getInt()); |
| 201 | + checkType(nestedV.getElementAtIndex(1), VariantUtil.PRIMITIVE, VariantUtil.Type.NULL); |
| 202 | + checkType(nestedV.getElementAtIndex(2), VariantUtil.SHORT_STR, VariantUtil.Type.STRING); |
| 203 | + Assert.assertEquals("c", nestedV.getElementAtIndex(2).getString()); |
| 204 | + }); |
| 205 | + } |
| 206 | + |
| 207 | + public void testArrayOffsetSize(String randomString) { |
| 208 | + Variant value = new Variant( |
| 209 | + ByteBuffer.wrap(constructArray(constructString(randomString), VALUE_BOOL, VALUE_INT)), EMPTY_METADATA); |
| 210 | + |
| 211 | + testVariant(value, v -> { |
| 212 | + checkType(v, VariantUtil.ARRAY, VariantUtil.Type.ARRAY); |
| 213 | + Assert.assertEquals(3, v.numArrayElements()); |
| 214 | + checkType(v.getElementAtIndex(0), VariantUtil.PRIMITIVE, VariantUtil.Type.STRING); |
| 215 | + Assert.assertEquals(randomString, v.getElementAtIndex(0).getString()); |
| 216 | + checkType(v.getElementAtIndex(1), VariantUtil.PRIMITIVE, VariantUtil.Type.BOOLEAN); |
| 217 | + Assert.assertTrue(v.getElementAtIndex(1).getBoolean()); |
| 218 | + checkType(v.getElementAtIndex(2), VariantUtil.PRIMITIVE, VariantUtil.Type.INT); |
| 219 | + Assert.assertEquals(1234567890, v.getElementAtIndex(2).getInt()); |
| 220 | + }); |
| 221 | + } |
| 222 | + |
| 223 | + @Test |
| 224 | + public void testArrayTwoByteOffset() { |
| 225 | + // a string larger than 255 bytes to push the value offset size above 1 byte |
| 226 | + testArrayOffsetSize(randomString(300)); |
| 227 | + } |
| 228 | + |
| 229 | + @Test |
| 230 | + public void testArrayThreeByteOffset() { |
| 231 | + // a string larger than 65535 bytes to push the value offset size above 2 bytes |
| 232 | + testArrayOffsetSize(randomString(70_000)); |
| 233 | + } |
| 234 | + |
| 235 | + @Test |
| 236 | + public void testArrayFourByteOffset() { |
| 237 | + // a string larger than 16777215 bytes to push the value offset size above 3 bytes |
| 238 | + testArrayOffsetSize(randomString(16_800_000)); |
| 239 | + } |
| 240 | +} |
0 commit comments