Skip to content

Commit 6a6698d

Browse files
committed
optimize UTF-8 fallback after ASCII prefix
1 parent ab680be commit 6a6698d

6 files changed

Lines changed: 80 additions & 14 deletions

File tree

core/src/main/java/io/questdb/client/cutlass/http/client/WebSocketSendBuffer.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ public void putUtf8(String value) {
334334
int charLen = value.length();
335335
ensureCapacity(charLen);
336336

337-
// Single-pass ASCII path. Non-ASCII falls back to the shared UTF-8 utility.
337+
// Single pass for all-ASCII strings. Mixed strings keep the ASCII prefix already written (i bytes) and resume UTF-8 encoding at index i.
338338
long addr = bufPtr + writePos;
339339
int i = 0;
340340
for (; i < charLen; i++) {
@@ -348,9 +348,9 @@ public void putUtf8(String value) {
348348
if (i == charLen) {
349349
writePos += charLen;
350350
} else {
351-
int utf8Len = NativeBufferWriter.utf8Length(value);
352-
ensureCapacity(utf8Len);
353-
writePos += Utf8s.strCpyUtf8(value, bufPtr + writePos, utf8Len);
351+
int utf8Len = Utf8s.utf8Bytes(value, i, charLen);
352+
ensureCapacity(i + utf8Len);
353+
writePos += i + Utf8s.strCpyUtf8(value, i, bufPtr + writePos + i, utf8Len);
354354
}
355355
}
356356

core/src/main/java/io/questdb/client/cutlass/qwp/client/NativeBufferWriter.java

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -289,10 +289,9 @@ public void putUtf8(String value) {
289289
// All ASCII — done in a single pass
290290
position += charLen;
291291
} else {
292-
// Non-ASCII — fall back to two-pass (re-encodes from start)
293-
int utf8Len = utf8Length(value);
294-
ensureCapacity(utf8Len);
295-
encodeUtf8(value, utf8Len);
292+
int utf8Len = Utf8s.utf8Bytes(value, i, charLen);
293+
ensureCapacity(i + utf8Len);
294+
position += i + Utf8s.strCpyUtf8(value, i, bufferPtr + position + i, utf8Len);
296295
}
297296
}
298297

core/src/main/java/io/questdb/client/std/str/Utf8s.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,12 @@ public static void strCpyAscii(@NotNull CharSequence asciiSrc, int srcLo, int sr
181181
* @return the number of UTF-8 bytes written
182182
*/
183183
public static int strCpyUtf8(@NotNull CharSequence src, long destAddr, int maxBytes) {
184+
return strCpyUtf8(src, 0, destAddr, maxBytes);
185+
}
186+
187+
public static int strCpyUtf8(@NotNull CharSequence src, int srcLo, long destAddr, int maxBytes) {
184188
int pos = 0;
185-
for (int i = 0, n = src.length(); i < n; i++) {
189+
for (int i = srcLo, n = src.length(); i < n; i++) {
186190
char c = src.charAt(i);
187191
if (c < 0x80) {
188192
if (pos + 1 > maxBytes) {
@@ -263,18 +267,20 @@ public static String stringFromUtf8Bytes(@NotNull Utf8Sequence seq) {
263267
}
264268

265269
public static int utf8Bytes(@NotNull CharSequence sequence) {
266-
int count = 0;
267-
int len = sequence.length();
270+
return utf8Bytes(sequence, 0, sequence.length());
271+
}
268272

269-
for (int i = 0; i < len; i++) {
273+
public static int utf8Bytes(@NotNull CharSequence sequence, int lo, int hi) {
274+
int count = 0;
275+
for (int i = lo; i < hi; i++) {
270276
char ch = sequence.charAt(i);
271277
if (ch < 0x80) {
272278
count++;
273279
} else if (ch < 0x800) {
274280
count += 2;
275281
} else if (Character.isSurrogate(ch)) {
276282
if (Character.isHighSurrogate(ch)) {
277-
if (i + 1 < len && Character.isLowSurrogate(sequence.charAt(i + 1))) {
283+
if (i + 1 < hi && Character.isLowSurrogate(sequence.charAt(i + 1))) {
278284
count += 4;
279285
i++;
280286
} else {

core/src/test/java/io/questdb/client/test/cutlass/http/client/WebSocketSendBufferTest.java

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@
2626

2727
import io.questdb.client.cutlass.http.client.WebSocketSendBuffer;
2828
import io.questdb.client.std.Unsafe;
29-
import static io.questdb.client.test.tools.TestUtils.assertMemoryLeak;
29+
import io.questdb.client.std.str.Utf8s;
3030
import org.junit.Test;
3131

32+
import static io.questdb.client.test.tools.TestUtils.assertMemoryLeak;
3233
import static org.junit.Assert.assertEquals;
3334

3435
public class WebSocketSendBufferTest {
@@ -46,4 +47,19 @@ public void testPutUtf8InvalidSurrogatePair() throws Exception {
4647
}
4748
});
4849
}
50+
51+
@Test
52+
public void testPutUtf8MixedAsciiAndNonAsciiAfterGrow() throws Exception {
53+
assertMemoryLeak(() -> {
54+
try (WebSocketSendBuffer buf = new WebSocketSendBuffer(8)) {
55+
String value = "abcdefghijklmnop世界世界世界世界世界世界世界世界世界世界";
56+
57+
buf.putUtf8(value);
58+
59+
int utf8Len = Utf8s.utf8Bytes(value);
60+
assertEquals(utf8Len, buf.getWritePos());
61+
assertEquals(value, Utf8s.stringFromUtf8Bytes(buf.getBufferPtr(), buf.getBufferPtr() + utf8Len));
62+
}
63+
});
64+
}
4965
}

core/src/test/java/io/questdb/client/test/cutlass/qwp/client/NativeBufferWriterTest.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import io.questdb.client.cutlass.qwp.client.NativeBufferWriter;
2828
import io.questdb.client.std.Unsafe;
29+
import io.questdb.client.std.str.Utf8s;
2930
import org.junit.Assert;
3031
import org.junit.Test;
3132

@@ -522,6 +523,21 @@ public void testWriteUtf8MixedAsciiAndNonAscii() throws Exception {
522523
});
523524
}
524525

526+
@Test
527+
public void testWriteUtf8MixedAsciiAndNonAsciiAfterGrow() throws Exception {
528+
assertMemoryLeak(() -> {
529+
try (NativeBufferWriter writer = new NativeBufferWriter(8)) {
530+
String value = "abcdefghijklmnop世界世界世界世界世界世界世界世界世界世界";
531+
532+
writer.putUtf8(value);
533+
534+
int utf8Len = Utf8s.utf8Bytes(value);
535+
Assert.assertEquals(utf8Len, writer.getPosition());
536+
Assert.assertEquals(value, Utf8s.stringFromUtf8Bytes(writer.getBufferPtr(), writer.getBufferPtr() + utf8Len));
537+
}
538+
});
539+
}
540+
525541
@Test
526542
public void testWriteUtf8Ascii() throws Exception {
527543
assertMemoryLeak(() -> {

core/src/test/java/io/questdb/client/test/std/str/Utf8sTest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,27 @@ public void testStrCpyUtf8() {
270270
}
271271
}
272272

273+
@Test
274+
public void testStrCpyUtf8FromOffset() {
275+
final int bufSize = 64;
276+
long mem = Unsafe.malloc(bufSize, MemoryTag.NATIVE_DEFAULT);
277+
try {
278+
final byte sentinel = 0x5A;
279+
String value = "ascii-prefixé世\uD83D\uDE00\uD800x";
280+
int lo = "ascii-prefix".length();
281+
int expectedBytes = Utf8s.utf8Bytes(value, lo, value.length());
282+
283+
fill(mem, bufSize, sentinel);
284+
Assert.assertEquals(expectedBytes, Utf8s.strCpyUtf8(value, lo, mem, expectedBytes));
285+
Assert.assertEquals("é世\uD83D\uDE00?x", readUtf8(mem, expectedBytes));
286+
for (int i = expectedBytes; i < bufSize; i++) {
287+
Assert.assertEquals("write past copied bytes at offset " + i, sentinel, Unsafe.getUnsafe().getByte(mem + i));
288+
}
289+
} finally {
290+
Unsafe.free(mem, bufSize, MemoryTag.NATIVE_DEFAULT);
291+
}
292+
}
293+
273294
@Test
274295
public void testStrCpyUtf8DoesNotSplitCharactersAtLimit() {
275296
final int bufSize = 16;
@@ -319,6 +340,14 @@ public void testUtf8Bytes() {
319340
Assert.assertEquals(1, Utf8s.utf8Bytes("\uDE00"));
320341
}
321342

343+
@Test
344+
public void testUtf8BytesRange() {
345+
String value = "xxAé世\uD83D\uDE00yy";
346+
Assert.assertEquals(10, Utf8s.utf8Bytes(value, 2, 7));
347+
Assert.assertEquals(1, Utf8s.utf8Bytes(value, 5, 6));
348+
Assert.assertEquals(4, Utf8s.utf8Bytes(value, 5, 7));
349+
}
350+
322351
@Test
323352
public void testUtf8BytesWithLimit() {
324353
Assert.assertEquals(0, Utf8s.utf8Bytes("hello", 0));

0 commit comments

Comments
 (0)