Skip to content

Commit 88190cc

Browse files
authored
Enforce UTF8 when decoding byte[] to string in ValueReader (#16608)
1 parent 6f2b8f9 commit 88190cc

2 files changed

Lines changed: 38 additions & 1 deletion

File tree

pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020

2121
import java.io.Closeable;
2222
import java.math.BigDecimal;
23+
import java.nio.charset.StandardCharsets;
2324
import org.apache.pinot.spi.utils.BigDecimalUtils;
2425
import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
2526

@@ -63,7 +64,7 @@ default byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer)
6364
*/
6465
default String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer) {
6566
int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
66-
return new String(buffer, 0, length);
67+
// Decode explicitly as UTF-8. The removed line used String(byte[], int, int), which decodes with the
// platform default charset and therefore can vary between JVMs.
return new String(buffer, 0, length, StandardCharsets.UTF_8);
6768
}
6869

6970
/**

pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,4 +74,40 @@ public void testFixedByteValueReaderWriter(int maxStringLength, int configuredMa
7474
}
7575
}
7676
}
77+
78+
/**
 * Round-trips 1000 fixed-width values consisting only of the two-byte UTF-8 character 'é' through
 * {@code FixedByteValueReaderWriter} and asserts that {@code getUnpaddedString} returns each original string.
 * Each value is written as {@code configuredMaxLength} bytes: a whole number of encoded characters followed
 * by zero bytes.
 *
 * @param maxStringLength exclusive upper bound on the random character count of each value
 * @param configuredMaxLength number of bytes written and read per value
 * @param byteOrder byte order used to allocate the backing buffer
 * @throws IOException declared by the method; presumably raised when closing the buffer (verify)
 */
@Test(dataProvider = "params")
79+
public void testFixedByteValueReaderWriterNonAscii(int maxStringLength, int configuredMaxLength, ByteOrder byteOrder)
80+
throws IOException {
81+
byte[] bytes = new byte[configuredMaxLength];
82+
// Use a multi-byte UTF-8 character (é = 0xC3 0xA9)
83+
byte[] nonAsciiChar = "é".getBytes(StandardCharsets.UTF_8);
84+
85+
// One slot of configuredMaxLength bytes for each of the 1000 values written below.
try (PinotDataBuffer buffer = PinotDataBuffer.allocateDirect(configuredMaxLength * 1000L, byteOrder,
86+
"testFixedByteValueReaderWriterNonAscii")) {
87+
FixedByteValueReaderWriter readerWriter = new FixedByteValueReaderWriter(buffer);
88+
List<String> inputs = new ArrayList<>(1000);
89+
90+
for (int i = 0; i < 1000; i++) {
91+
// number of *characters* to write
92+
// nextInt(bound) excludes the bound, so charCount is in [0, maxStringLength).
// NOTE(review): nextInt throws IllegalArgumentException for a non-positive bound - confirm the
// "params" data provider never supplies maxStringLength <= 0.
int charCount = ThreadLocalRandom.current().nextInt(maxStringLength);
93+
int byteCount = charCount * nonAsciiChar.length;
94+
// Clamp to the slot width, rounding down so no encoded character is split.
if (byteCount > configuredMaxLength) {
95+
byteCount = configuredMaxLength - (configuredMaxLength % nonAsciiChar.length); // fit whole chars
96+
charCount = byteCount / nonAsciiChar.length;
97+
}
98+
99+
// Reset the scratch array so every byte past byteCount is zero (presumably treated as padding by the reader).
Arrays.fill(bytes, (byte) 0);
100+
for (int pos = 0; pos < byteCount; pos += nonAsciiChar.length) {
101+
System.arraycopy(nonAsciiChar, 0, bytes, pos, nonAsciiChar.length);
102+
}
103+
104+
readerWriter.writeBytes(i, configuredMaxLength, bytes);
105+
// Remember the expected decoded string for the read-back pass.
inputs.add("é".repeat(charCount));
106+
}
107+
108+
for (int i = 0; i < 1000; i++) {
109+
// bytes is reused here as the scratch buffer that getUnpaddedString reads into before decoding.
assertEquals(readerWriter.getUnpaddedString(i, configuredMaxLength, bytes), inputs.get(i));
110+
}
111+
112+
}
113+
}
77113
}

0 commit comments

Comments
 (0)