Skip to content

Commit d573d96

Browse files
committed
Preserve lone surrogates in native preview literals
1 parent baae7a1 commit d573d96

13 files changed

Lines changed: 420 additions & 19 deletions

File tree

_packages/native-preview/src/api/async/api.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import {
3030
readSourceFileHash,
3131
RemoteSourceFile,
3232
} from "../node/node.ts";
33+
import { Wtf8Decoder } from "../node/wtf8.ts";
3334
import { ObjectRegistry } from "../objectRegistry.ts";
3435
import type {
3536
APIOptions,
@@ -314,7 +315,7 @@ export class Program {
314315
private client: Client;
315316
private sourceFileCache: SourceFileCache;
316317
private toPath: (fileName: string) => Path;
317-
private decoder = new TextDecoder();
318+
private decoder = new Wtf8Decoder();
318319

319320
constructor(
320321
snapshotId: string,

_packages/native-preview/src/api/node/msgpack.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
// Minimal msgpack encoder/decoder.
22
// Supports: arrays, unsigned integers, strings, booleans, binary data.
33

4+
import { Wtf8Decoder } from "./wtf8.ts";
5+
46
// ── MessagePack format constants ────────────────────────────────────
57
export const MSGPACK_FIXARRAY3 = 0x93; // 3-element fixarray
68
export const MSGPACK_BIN8 = 0xc4;
@@ -39,7 +41,7 @@ export function writeBinHeader(buf: Uint8Array, off: number, len: number): numbe
3941
}
4042

4143
const encoder = new TextEncoder();
42-
const decoder = new TextDecoder();
44+
const decoder = new Wtf8Decoder();
4345

4446
export class MsgpackWriter {
4547
private buf: Uint8Array;

_packages/native-preview/src/api/node/node.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import {
2323
HEADER_OFFSET_STRUCTURED_DATA,
2424
NODE_LEN,
2525
} from "./protocol.ts";
26+
import { Wtf8Decoder } from "./wtf8.ts";
2627

2728
// Re-export everything consumers need from the other two files.
2829
export { RemoteNode, RemoteNodeList } from "./node.generated.ts";
@@ -242,7 +243,7 @@ export function parseNodeHandle(handle: string): ParsedNodeHandle {
242243
* (e.g. from typeToTypeNode) that don't have a source file.
243244
*/
244245
export function decodeNode(data: Uint8Array): Node {
245-
const sf = new RemoteSourceFile(data, new TextDecoder());
246+
const sf = new RemoteSourceFile(data, new Wtf8Decoder());
246247
return sf as unknown as Node;
247248
}
248249

_packages/native-preview/src/api/node/protocol.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
export const PROTOCOL_VERSION = 5;
1+
export const PROTOCOL_VERSION = 6;
22

33
export const HEADER_OFFSET_METADATA = 0;
44
export const HEADER_OFFSET_HASH_LO0 = 4;
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import type { TextDecodeOptions } from "node:util";
2+
3+
// WTF-8 represents a lone surrogate (U+D800..U+DFFF) as the three-byte
// sequence ED [A0..BF] [80..BF]. Strict UTF-8 decoders reject that sequence,
// so it is recognized and decoded by hand below.
const surrogateLeadByte = 0xED;
const surrogateSecondByteMin = 0xA0;
const surrogateSecondByteMax = 0xBF;
const continuationByteMin = 0x80;
const continuationByteMax = 0xBF;

// UTF-8 encoding of U+FEFF (byte order mark / zero width no-break space).
const bomFirstByte = 0xEF;
const bomSecondByte = 0xBB;
const bomThirdByte = 0xBF;

/**
 * Reports whether the three bytes starting at `index` form a WTF-8 encoded
 * surrogate code unit. Returns false when fewer than three bytes remain.
 */
function isWtf8Surrogate(bytes: Uint8Array, index: number): boolean {
    return index + 2 < bytes.length
        && bytes[index] === surrogateLeadByte
        && bytes[index + 1] >= surrogateSecondByteMin
        && bytes[index + 1] <= surrogateSecondByteMax
        && bytes[index + 2] >= continuationByteMin
        && bytes[index + 2] <= continuationByteMax;
}

/**
 * Rebuilds the UTF-16 code unit from a sequence already validated by
 * `isWtf8Surrogate`. The lead byte 0xED always contributes the top nibble 0xD;
 * the two following bytes contribute six payload bits each.
 */
function getSurrogateCodeUnit(bytes: Uint8Array, index: number): number {
    return 0xD000 | ((bytes[index + 1] & 0x3F) << 6) | (bytes[index + 2] & 0x3F);
}

/**
 * Reports whether the three bytes starting at `index` (and ending before
 * `end`) are the UTF-8 encoding of U+FEFF.
 */
function isUtf8Bom(bytes: Uint8Array, index: number, end: number): boolean {
    return index + 2 < end
        && bytes[index] === bomFirstByte
        && bytes[index + 1] === bomSecondByte
        && bytes[index + 2] === bomThirdByte;
}

/**
 * Returns a byte view over `input` without copying: Uint8Array inputs are
 * returned as-is, other views are re-wrapped over the same buffer range, and
 * raw buffers are wrapped whole.
 */
function toUint8Array(input: NodeJS.AllowSharedBufferSource): Uint8Array {
    if (input instanceof Uint8Array) {
        return input;
    }
    if (ArrayBuffer.isView(input)) {
        return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    }
    return new Uint8Array(input);
}

/**
 * A `TextDecoder` that additionally accepts WTF-8 encoded lone surrogates and
 * preserves them as individual UTF-16 code units instead of replacing them
 * with U+FFFD. Everything between surrogates is decoded by the base class.
 *
 * A surrogate sequence is only recognized when all three of its bytes are in
 * the same `input`; `options` is forwarded to the base decoder only for the
 * last segment of the input (or for the whole input when it contains no
 * surrogate).
 */
export class Wtf8Decoder extends TextDecoder {
    /**
     * Decodes `input` as WTF-8.
     *
     * @param input Bytes to decode; when omitted the call is forwarded to the base decoder unchanged.
     * @param options Decode options passed through to the base decoder.
     * @returns The decoded string, with lone surrogates preserved.
     */
    override decode(input?: NodeJS.AllowSharedBufferSource, options?: TextDecodeOptions): string {
        if (input === undefined) {
            return super.decode(input, options);
        }

        const bytes = toUint8Array(input);
        let result = "";
        let segmentStart = 0;

        for (let i = 0; i < bytes.length; i++) {
            if (!isWtf8Surrogate(bytes, i)) {
                continue;
            }

            // Flush the ordinary UTF-8 bytes that precede this surrogate.
            if (segmentStart < i) {
                result += this.decodeSegment(bytes, segmentStart, i);
            }
            result += String.fromCharCode(getSurrogateCodeUnit(bytes, i));
            // Skip the two trailing bytes; the loop increment skips the lead byte.
            i += 2;
            segmentStart = i + 1;
        }

        // No surrogate found: a single pass through the base decoder suffices.
        if (segmentStart === 0) {
            return super.decode(bytes, options);
        }
        if (segmentStart < bytes.length) {
            result += this.decodeSegment(bytes, segmentStart, bytes.length, options);
        }
        return result;
    }

    /**
     * Decodes `bytes[start, end)` with the base decoder.
     *
     * Each base `decode` call made without streaming treats its input as the
     * start of a stream and, unless `ignoreBOM` is set, drops a leading BOM.
     * For a segment that does not begin at byte 0 that BOM is real content
     * (U+FEFF following a surrogate), so it is emitted explicitly here and
     * only the remainder is handed to the base decoder.
     */
    private decodeSegment(bytes: Uint8Array, start: number, end: number, options?: TextDecodeOptions): string {
        let prefix = "";
        let cursor = start;
        if (start > 0 && !this.ignoreBOM) {
            while (isUtf8Bom(bytes, cursor, end)) {
                prefix += "\uFEFF";
                cursor += 3;
            }
        }
        return prefix + super.decode(bytes.subarray(cursor, end), options);
    }
}

_packages/native-preview/src/api/sync/api.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ import {
3838
readSourceFileHash,
3939
RemoteSourceFile,
4040
} from "../node/node.ts";
41+
import { Wtf8Decoder } from "../node/wtf8.ts";
4142
import { ObjectRegistry } from "../objectRegistry.ts";
4243
import type {
4344
APIOptions,
@@ -322,7 +323,7 @@ export class Program {
322323
private client: Client;
323324
private sourceFileCache: SourceFileCache;
324325
private toPath: (fileName: string) => Path;
325-
private decoder = new TextDecoder();
326+
private decoder = new Wtf8Decoder();
326327

327328
constructor(
328329
snapshotId: string,

_packages/native-preview/test/async/api.test.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,18 +206,24 @@ test("unicode escapes", async () => {
206206
"/tsconfig.json": "{}",
207207
"/src/1.ts": `"😃"`,
208208
"/src/2.ts": `"\\ud83d\\ude03"`,
209+
"/src/3.ts": `"\\ud800a\\udc00"`,
209210
});
210211
try {
211212
const snapshot = await api.updateSnapshot({ openProject: "/tsconfig.json" });
212213
const project = snapshot.getProject("/tsconfig.json")!;
214+
const expectedTexts = new Map([
215+
["/src/1.ts", "😃"],
216+
["/src/2.ts", "😃"],
217+
["/src/3.ts", "\ud800a\udc00"],
218+
]);
213219

214-
for (const file of ["/src/1.ts", "/src/2.ts"]) {
220+
for (const file of expectedTexts.keys()) {
215221
const sourceFile = await project.program.getSourceFile(file);
216222
assert.ok(sourceFile);
217223

218224
sourceFile.forEachChild(function visit(node) {
219225
if (isStringLiteral(node)) {
220-
assert.equal(node.text, "😃");
226+
assert.equal(node.text, expectedTexts.get(file));
221227
}
222228
node.forEachChild(visit);
223229
});

_packages/native-preview/test/encoder.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ describe("Encoder", () => {
6262
// Verify header
6363
const view = new DataView(encoded.buffer, encoded.byteOffset, encoded.byteLength);
6464
const metadata = view.getUint32(0, true);
65-
assert.strictEqual(metadata >>> 24, 5, "protocol version should be 5");
65+
assert.strictEqual(metadata >>> 24, 6, "protocol version should be 6");
6666

6767
// Verify we can decode it
6868
const decoded = decode(encoded);
@@ -169,11 +169,11 @@ describe("Encoder", () => {
169169
assert.strictEqual(rootKind, SyntaxKind.IfStatement);
170170
});
171171

172-
test("protocol version is 5", () => {
172+
test("protocol version is 6", () => {
173173
const sf = makeSF("", "/test.ts", []);
174174
const encoded = encodeSourceFile(sf);
175175
const view = new DataView(encoded.buffer, encoded.byteOffset, encoded.byteLength);
176-
assert.strictEqual(view.getUint32(0, true) >>> 24, 5);
176+
assert.strictEqual(view.getUint32(0, true) >>> 24, 6);
177177
});
178178

179179
test("boolean properties are encoded", () => {

_packages/native-preview/test/sync/api.test.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,18 +214,24 @@ test("unicode escapes", () => {
214214
"/tsconfig.json": "{}",
215215
"/src/1.ts": `"😃"`,
216216
"/src/2.ts": `"\\ud83d\\ude03"`,
217+
"/src/3.ts": `"\\ud800a\\udc00"`,
217218
});
218219
try {
219220
const snapshot = api.updateSnapshot({ openProject: "/tsconfig.json" });
220221
const project = snapshot.getProject("/tsconfig.json")!;
222+
const expectedTexts = new Map([
223+
["/src/1.ts", "😃"],
224+
["/src/2.ts", "😃"],
225+
["/src/3.ts", "\ud800a\udc00"],
226+
]);
221227

222-
for (const file of ["/src/1.ts", "/src/2.ts"]) {
228+
for (const file of expectedTexts.keys()) {
223229
const sourceFile = project.program.getSourceFile(file);
224230
assert.ok(sourceFile);
225231

226232
sourceFile.forEachChild(function visit(node) {
227233
if (isStringLiteral(node)) {
228-
assert.equal(node.text, "😃");
234+
assert.equal(node.text, expectedTexts.get(file));
229235
}
230236
node.forEachChild(visit);
231237
});
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import assert from "node:assert";
2+
import {
3+
describe,
4+
test,
5+
} from "node:test";
6+
import { Wtf8Decoder } from "../src/api/node/wtf8.ts";
7+
8+
describe("Wtf8Decoder", () => {
9+
test("decodes standard UTF-8", () => {
10+
const decoder = new Wtf8Decoder();
11+
assert.strictEqual(decoder.decode(new TextEncoder().encode("hello 🦀")), "hello 🦀");
12+
});
13+
14+
test("preserves WTF-8 encoded lone surrogates", () => {
15+
const decoder = new Wtf8Decoder();
16+
const text = decoder.decode(Uint8Array.of(
17+
0xF0,
18+
0x9F,
19+
0xA6,
20+
0x80,
21+
0xED,
22+
0x9F,
23+
0xBF,
24+
0xED,
25+
0xA0,
26+
0x80,
27+
0xED,
28+
0xA0,
29+
0x81,
30+
0xED,
31+
0xB0,
32+
0x80,
33+
0xF0,
34+
0x9F,
35+
0xA6,
36+
0x80,
37+
));
38+
39+
assert.deepStrictEqual(
40+
Array.from({ length: text.length }, (_, i) => text.charCodeAt(i)),
41+
[0xD83E, 0xDD80, 0xD7FF, 0xD800, 0xD801, 0xDC00, 0xD83E, 0xDD80],
42+
);
43+
});
44+
});

0 commit comments

Comments
 (0)