Skip to content

Commit 5d041c5

Browse files
mtopolnik and claude committed
Avoid per-byte ensureCapacity in UTF-8 encoding
Extract encodeUtf8(), which writes directly to native memory via Unsafe.putByte on a running address pointer, with a single upfront ensureCapacity call for the full UTF-8 length.

Both putUtf8() and putString() now compute utf8Length once, reserve capacity once, then delegate to encodeUtf8(). This also eliminates the double string scan that putString() previously performed (utf8Length + putUtf8's per-char loop with per-byte ensureCapacity).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 60d578d commit 5d041c5

1 file changed

Lines changed: 37 additions & 28 deletions

File tree

core/src/main/java/io/questdb/client/cutlass/qwp/client/NativeBufferWriter.java

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,8 @@ public void putString(String value) {
243243

244244
int utf8Len = utf8Length(value);
245245
putVarint(utf8Len);
246-
putUtf8(value);
246+
ensureCapacity(utf8Len);
247+
encodeUtf8(value);
247248
}
248249

249250
/**
@@ -254,33 +255,9 @@ public void putUtf8(String value) {
254255
if (value == null || value.isEmpty()) {
255256
return;
256257
}
257-
for (int i = 0, n = value.length(); i < n; i++) {
258-
char c = value.charAt(i);
259-
if (c < 0x80) {
260-
putByte((byte) c);
261-
} else if (c < 0x800) {
262-
putByte((byte) (0xC0 | (c >> 6)));
263-
putByte((byte) (0x80 | (c & 0x3F)));
264-
} else if (c >= 0xD800 && c <= 0xDBFF && i + 1 < n) {
265-
char c2 = value.charAt(++i);
266-
if (Character.isLowSurrogate(c2)) {
267-
int codePoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00);
268-
putByte((byte) (0xF0 | (codePoint >> 18)));
269-
putByte((byte) (0x80 | ((codePoint >> 12) & 0x3F)));
270-
putByte((byte) (0x80 | ((codePoint >> 6) & 0x3F)));
271-
putByte((byte) (0x80 | (codePoint & 0x3F)));
272-
} else {
273-
putByte((byte) '?');
274-
i--;
275-
}
276-
} else if (Character.isSurrogate(c)) {
277-
putByte((byte) '?');
278-
} else {
279-
putByte((byte) (0xE0 | (c >> 12)));
280-
putByte((byte) (0x80 | ((c >> 6) & 0x3F)));
281-
putByte((byte) (0x80 | (c & 0x3F)));
282-
}
283-
}
258+
int utf8Len = utf8Length(value);
259+
ensureCapacity(utf8Len);
260+
encodeUtf8(value);
284261
}
285262

286263
/**
@@ -314,4 +291,36 @@ public void skip(int bytes) {
314291
ensureCapacity(bytes);
315292
position += bytes;
316293
}
294+
295+
private void encodeUtf8(String value) { // Encodes value as UTF-8 directly at bufferPtr + position and advances position; never calls ensureCapacity itself, so space must already be reserved (putString/putUtf8 call ensureCapacity(utf8Len) first).
296+
long addr = bufferPtr + position; // running write address, starting at the current write position
297+
for (int i = 0, n = value.length(); i < n; i++) {
298+
char c = value.charAt(i);
299+
if (c < 0x80) { // U+0000..U+007F: single byte
300+
Unsafe.getUnsafe().putByte(addr++, (byte) c);
301+
} else if (c < 0x800) { // U+0080..U+07FF: two bytes, 110xxxxx 10xxxxxx
302+
Unsafe.getUnsafe().putByte(addr++, (byte) (0xC0 | (c >> 6)));
303+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | (c & 0x3F)));
304+
} else if (c >= 0xD800 && c <= 0xDBFF && i + 1 < n) { // high surrogate with at least one char after it
305+
char c2 = value.charAt(++i); // consume the following char as the candidate low surrogate
306+
if (Character.isLowSurrogate(c2)) {
307+
int codePoint = 0x10000 + ((c - 0xD800) << 10) + (c2 - 0xDC00); // combine the surrogate pair into a supplementary code point
308+
Unsafe.getUnsafe().putByte(addr++, (byte) (0xF0 | (codePoint >> 18))); // four bytes, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
309+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | ((codePoint >> 12) & 0x3F)));
310+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | ((codePoint >> 6) & 0x3F)));
311+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | (codePoint & 0x3F)));
312+
} else {
313+
Unsafe.getUnsafe().putByte(addr++, (byte) '?'); // unpaired high surrogate is replaced by a single '?'
314+
i--; // undo the ++i above so c2 is encoded on its own by the next iteration
315+
}
316+
} else if (Character.isSurrogate(c)) { // reached by a high surrogate at the very end of the string, or by a low surrogate with no preceding high surrogate
317+
Unsafe.getUnsafe().putByte(addr++, (byte) '?');
318+
} else { // remaining BMP chars U+0800..U+FFFF: three bytes, 1110xxxx 10xxxxxx 10xxxxxx
319+
Unsafe.getUnsafe().putByte(addr++, (byte) (0xE0 | (c >> 12)));
320+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | ((c >> 6) & 0x3F)));
321+
Unsafe.getUnsafe().putByte(addr++, (byte) (0x80 | (c & 0x3F)));
322+
}
323+
}
324+
position = (int) (addr - bufferPtr); // publish the new write position. NOTE(review): putString writes utf8Length(value) as the length prefix, so the byte count emitted here must equal it (1 byte per unpaired surrogate) — confirm against utf8Length.
325+
}
317326
}

0 commit comments

Comments
 (0)