diff --git a/_packages/native-preview/src/api/async/api.ts b/_packages/native-preview/src/api/async/api.ts index b6fc71a2c18..2d65c9da5d3 100644 --- a/_packages/native-preview/src/api/async/api.ts +++ b/_packages/native-preview/src/api/async/api.ts @@ -31,6 +31,7 @@ import { readSourceFileHash, RemoteSourceFile, } from "../node/node.ts"; +import { Wtf8Decoder } from "../node/wtf8.ts"; import { ObjectRegistry } from "../objectRegistry.ts"; import type { APIOptions, @@ -315,7 +316,7 @@ export class Program { private client: Client; private sourceFileCache: SourceFileCache; private toPath: (fileName: string) => Path; - private decoder = new TextDecoder(); + private decoder = new Wtf8Decoder(); constructor( snapshotId: string, diff --git a/_packages/native-preview/src/api/node/msgpack.ts b/_packages/native-preview/src/api/node/msgpack.ts index 6e32df4283d..50248d27bda 100644 --- a/_packages/native-preview/src/api/node/msgpack.ts +++ b/_packages/native-preview/src/api/node/msgpack.ts @@ -1,6 +1,8 @@ // Minimal msgpack encoder/decoder. // Supports: arrays, unsigned integers, strings, booleans, binary data. 
+import { Wtf8Decoder } from "./wtf8.ts"; + // ── MessagePack format constants ──────────────────────────────────── export const MSGPACK_FIXARRAY3 = 0x93; // 3-element fixarray export const MSGPACK_BIN8 = 0xc4; @@ -39,7 +41,7 @@ export function writeBinHeader(buf: Uint8Array, off: number, len: number): numbe } const encoder = new TextEncoder(); -const decoder = new TextDecoder(); +const decoder = new Wtf8Decoder(); export class MsgpackWriter { private buf: Uint8Array; diff --git a/_packages/native-preview/src/api/node/node.ts b/_packages/native-preview/src/api/node/node.ts index 8db099184a3..82919e55699 100644 --- a/_packages/native-preview/src/api/node/node.ts +++ b/_packages/native-preview/src/api/node/node.ts @@ -23,6 +23,7 @@ import { HEADER_OFFSET_STRUCTURED_DATA, NODE_LEN, } from "./protocol.ts"; +import { Wtf8Decoder } from "./wtf8.ts"; // Re-export everything consumers need from the other two files. export { RemoteNode, RemoteNodeList } from "./node.generated.ts"; @@ -242,7 +243,7 @@ export function parseNodeHandle(handle: string): ParsedNodeHandle { * (e.g. from typeToTypeNode) that don't have a source file. 
*/
 export function decodeNode(data: Uint8Array): Node {
-    const sf = new RemoteSourceFile(data, new TextDecoder());
+    const sf = new RemoteSourceFile(data, new Wtf8Decoder());
     return sf as unknown as Node;
 }
 
diff --git a/_packages/native-preview/src/api/node/protocol.ts b/_packages/native-preview/src/api/node/protocol.ts
index b21098cc2d4..dc28e0fa444 100644
--- a/_packages/native-preview/src/api/node/protocol.ts
+++ b/_packages/native-preview/src/api/node/protocol.ts
@@ -1,4 +1,4 @@
-export const PROTOCOL_VERSION = 5;
+export const PROTOCOL_VERSION = 6;
 
 export const HEADER_OFFSET_METADATA = 0;
 export const HEADER_OFFSET_HASH_LO0 = 4;
diff --git a/_packages/native-preview/src/api/node/wtf8.ts b/_packages/native-preview/src/api/node/wtf8.ts
new file mode 100644
index 00000000000..83b409664f9
--- /dev/null
+++ b/_packages/native-preview/src/api/node/wtf8.ts
@@ -0,0 +1,81 @@
+const surrogateLeadByte = 0xED;
+const surrogateSecondByteMin = 0xA0;
+const surrogateSecondByteMax = 0xBF;
+const continuationByteMin = 0x80;
+const continuationByteMax = 0xBF;
+type DecodeOptions = Parameters<TextDecoder["decode"]>[1];
+
+/** Reports whether the three bytes at `index` are a WTF-8 encoded surrogate (ED A0..BF 80..BF). */
+function isWtf8Surrogate(bytes: Uint8Array, index: number): boolean {
+    return index + 2 < bytes.length
+        && bytes[index] === surrogateLeadByte
+        && bytes[index + 1] >= surrogateSecondByteMin
+        && bytes[index + 1] <= surrogateSecondByteMax
+        && bytes[index + 2] >= continuationByteMin
+        && bytes[index + 2] <= continuationByteMax;
+}
+
+/** Rebuilds the UTF-16 code unit (0xD800..0xDFFF) from the sequence accepted by `isWtf8Surrogate`. */
+function getSurrogateCodeUnit(bytes: Uint8Array, index: number): number {
+    return 0xD000 | ((bytes[index + 1] & 0x3F) << 6) | (bytes[index + 2] & 0x3F);
+}
+
+/** Views any buffer source as bytes without copying. */
+function toUint8Array(input: NodeJS.AllowSharedBufferSource): Uint8Array {
+    if (input instanceof Uint8Array) {
+        return input;
+    }
+    if (ArrayBuffer.isView(input)) {
+        return new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
+    }
+    return new Uint8Array(input);
+}
+
+/**
+ * A `TextDecoder` that turns WTF-8 encoded lone surrogates into their UTF-16
+ * code units. All other bytes are decoded by the underlying `TextDecoder`.
+ */
+export class Wtf8Decoder extends TextDecoder {
+    // Decoder for segments that follow a surrogate. A U+FEFF there is string
+    // content, not a byte order mark, so it must not be stripped.
+    private interiorDecoder: TextDecoder | undefined;
+
+    override decode(input?: NodeJS.AllowSharedBufferSource, options?: DecodeOptions): string {
+        if (input === undefined) {
+            return super.decode(input, options);
+        }
+
+        const bytes = toUint8Array(input);
+        const parts: string[] = [];
+        let segmentStart = 0;
+
+        for (let i = 0; i < bytes.length; i++) {
+            if (!isWtf8Surrogate(bytes, i)) {
+                continue;
+            }
+
+            if (segmentStart < i) {
+                parts.push(this.decodeSegment(bytes.subarray(segmentStart, i), segmentStart, options));
+            }
+            parts.push(String.fromCharCode(getSurrogateCodeUnit(bytes, i)));
+            i += 2;
+            segmentStart = i + 1;
+        }
+
+        if (segmentStart === 0) {
+            return super.decode(bytes, options);
+        }
+        if (segmentStart < bytes.length) {
+            parts.push(this.decodeSegment(bytes.subarray(segmentStart), segmentStart, options));
+        }
+        return parts.join("");
+    }
+
+    private decodeSegment(segment: Uint8Array, start: number, options?: DecodeOptions): string {
+        if (start === 0) {
+            return super.decode(segment, options);
+        }
+        this.interiorDecoder ??= new TextDecoder(this.encoding, { fatal: this.fatal, ignoreBOM: true });
+        return this.interiorDecoder.decode(segment, options);
+    }
+}
diff --git a/_packages/native-preview/src/api/sync/api.ts b/_packages/native-preview/src/api/sync/api.ts
index 38af3e847d5..cbcb2decc2d 100644
--- a/_packages/native-preview/src/api/sync/api.ts
+++ b/_packages/native-preview/src/api/sync/api.ts
@@ -39,6 +39,7 @@ import {
     readSourceFileHash,
     RemoteSourceFile,
 } from "../node/node.ts";
+import { Wtf8Decoder } from "../node/wtf8.ts";
 import { ObjectRegistry } from "../objectRegistry.ts";
 import type {
     APIOptions,
@@ -323,7 +324,7 @@ export class Program {
     private client: Client;
     private sourceFileCache: SourceFileCache;
     private toPath: (fileName: string) => Path;
-    private decoder = new TextDecoder();
+    private decoder = new Wtf8Decoder();
 
     constructor(
         snapshotId: string,
diff --git a/_packages/native-preview/test/async/api.test.ts b/_packages/native-preview/test/async/api.test.ts
index fceba3ca6df..74f5347bb31 100644
--- a/_packages/native-preview/test/async/api.test.ts
+++ b/_packages/native-preview/test/async/api.test.ts
@@ -206,18 +206,24 @@ test("unicode escapes", async () => {
         "/tsconfig.json": "{}",
         "/src/1.ts": `"😃"`,
         "/src/2.ts": `"\\ud83d\\ude03"`,
+        "/src/3.ts": `"\\ud800a\\udc00"`,
     });
     try {
         const snapshot = await api.updateSnapshot({ openProject: "/tsconfig.json" });
         const project =
snapshot.getProject("/tsconfig.json")!; + const expectedTexts = new Map([ + ["/src/1.ts", "😃"], + ["/src/2.ts", "😃"], + ["/src/3.ts", "\ud800a\udc00"], + ]); - for (const file of ["/src/1.ts", "/src/2.ts"]) { + for (const file of expectedTexts.keys()) { const sourceFile = await project.program.getSourceFile(file); assert.ok(sourceFile); sourceFile.forEachChild(function visit(node) { if (isStringLiteral(node)) { - assert.equal(node.text, "😃"); + assert.equal(node.text, expectedTexts.get(file)); } node.forEachChild(visit); }); @@ -228,6 +234,38 @@ test("unicode escapes", async () => { } }); +test("template unicode escapes", async () => { + const api = spawnAPI({ + "/tsconfig.json": "{}", + "/src/index.ts": "`\\ud800${0}\\udc00`", + }); + try { + const snapshot = await api.updateSnapshot({ openProject: "/tsconfig.json" }); + const project = snapshot.getProject("/tsconfig.json")!; + const sourceFile = await project.program.getSourceFile("/src/index.ts"); + assert.ok(sourceFile); + + let sawHead = false; + let sawTail = false; + sourceFile.forEachChild(function visit(node) { + if (isTemplateHead(node)) { + assert.equal(node.text, "\ud800"); + sawHead = true; + } + else if (isTemplateTail(node)) { + assert.equal(node.text, "\udc00"); + sawTail = true; + } + node.forEachChild(visit); + }); + assert.ok(sawHead); + assert.ok(sawTail); + } + finally { + await api.close(); + } +}); + test("Object equality", async () => { const api = spawnAPI(); try { diff --git a/_packages/native-preview/test/encoder.test.ts b/_packages/native-preview/test/encoder.test.ts index 0dd09db4c25..4ccc34d018a 100644 --- a/_packages/native-preview/test/encoder.test.ts +++ b/_packages/native-preview/test/encoder.test.ts @@ -62,7 +62,7 @@ describe("Encoder", () => { // Verify header const view = new DataView(encoded.buffer, encoded.byteOffset, encoded.byteLength); const metadata = view.getUint32(0, true); - assert.strictEqual(metadata >>> 24, 5, "protocol version should be 5"); + 
assert.strictEqual(metadata >>> 24, 6, "protocol version should be 6"); // Verify we can decode it const decoded = decode(encoded); @@ -169,11 +169,11 @@ describe("Encoder", () => { assert.strictEqual(rootKind, SyntaxKind.IfStatement); }); - test("protocol version is 5", () => { + test("protocol version is 6", () => { const sf = makeSF("", "/test.ts", []); const encoded = encodeSourceFile(sf); const view = new DataView(encoded.buffer, encoded.byteOffset, encoded.byteLength); - assert.strictEqual(view.getUint32(0, true) >>> 24, 5); + assert.strictEqual(view.getUint32(0, true) >>> 24, 6); }); test("boolean properties are encoded", () => { diff --git a/_packages/native-preview/test/sync/api.test.ts b/_packages/native-preview/test/sync/api.test.ts index 53b53be92d2..359387865b0 100644 --- a/_packages/native-preview/test/sync/api.test.ts +++ b/_packages/native-preview/test/sync/api.test.ts @@ -214,18 +214,24 @@ test("unicode escapes", () => { "/tsconfig.json": "{}", "/src/1.ts": `"😃"`, "/src/2.ts": `"\\ud83d\\ude03"`, + "/src/3.ts": `"\\ud800a\\udc00"`, }); try { const snapshot = api.updateSnapshot({ openProject: "/tsconfig.json" }); const project = snapshot.getProject("/tsconfig.json")!; + const expectedTexts = new Map([ + ["/src/1.ts", "😃"], + ["/src/2.ts", "😃"], + ["/src/3.ts", "\ud800a\udc00"], + ]); - for (const file of ["/src/1.ts", "/src/2.ts"]) { + for (const file of expectedTexts.keys()) { const sourceFile = project.program.getSourceFile(file); assert.ok(sourceFile); sourceFile.forEachChild(function visit(node) { if (isStringLiteral(node)) { - assert.equal(node.text, "😃"); + assert.equal(node.text, expectedTexts.get(file)); } node.forEachChild(visit); }); @@ -236,6 +242,38 @@ test("unicode escapes", () => { } }); +test("template unicode escapes", () => { + const api = spawnAPI({ + "/tsconfig.json": "{}", + "/src/index.ts": "`\\ud800${0}\\udc00`", + }); + try { + const snapshot = api.updateSnapshot({ openProject: "/tsconfig.json" }); + const project = 
snapshot.getProject("/tsconfig.json")!; + const sourceFile = project.program.getSourceFile("/src/index.ts"); + assert.ok(sourceFile); + + let sawHead = false; + let sawTail = false; + sourceFile.forEachChild(function visit(node) { + if (isTemplateHead(node)) { + assert.equal(node.text, "\ud800"); + sawHead = true; + } + else if (isTemplateTail(node)) { + assert.equal(node.text, "\udc00"); + sawTail = true; + } + node.forEachChild(visit); + }); + assert.ok(sawHead); + assert.ok(sawTail); + } + finally { + api.close(); + } +}); + test("Object equality", () => { const api = spawnAPI(); try { diff --git a/_packages/native-preview/test/wtf8.test.ts b/_packages/native-preview/test/wtf8.test.ts new file mode 100644 index 00000000000..8fe8fcd3652 --- /dev/null +++ b/_packages/native-preview/test/wtf8.test.ts @@ -0,0 +1,44 @@ +import assert from "node:assert"; +import { + describe, + test, +} from "node:test"; +import { Wtf8Decoder } from "../src/api/node/wtf8.ts"; + +describe("Wtf8Decoder", () => { + test("decodes standard UTF-8", () => { + const decoder = new Wtf8Decoder(); + assert.strictEqual(decoder.decode(new TextEncoder().encode("hello 🦀")), "hello 🦀"); + }); + + test("preserves WTF-8 encoded lone surrogates", () => { + const decoder = new Wtf8Decoder(); + const text = decoder.decode(Uint8Array.of( + 0xF0, + 0x9F, + 0xA6, + 0x80, + 0xED, + 0x9F, + 0xBF, + 0xED, + 0xA0, + 0x80, + 0xED, + 0xA0, + 0x81, + 0xED, + 0xB0, + 0x80, + 0xF0, + 0x9F, + 0xA6, + 0x80, + )); + + assert.deepStrictEqual( + Array.from({ length: text.length }, (_, i) => text.charCodeAt(i)), + [0xD83E, 0xDD80, 0xD7FF, 0xD800, 0xD801, 0xDC00, 0xD83E, 0xDD80], + ); + }); +}); diff --git a/internal/api/encoder/encoder.go b/internal/api/encoder/encoder.go index 22b4c7fff5d..a50129f65ec 100644 --- a/internal/api/encoder/encoder.go +++ b/internal/api/encoder/encoder.go @@ -59,7 +59,7 @@ const ( ) const ( - ProtocolVersion uint8 = 5 + ProtocolVersion uint8 = 6 ) // Source File Binary Format @@ -108,10 +108,11 
@@ const ( // String data (variable) // ---------------------- // -// The string data section contains UTF-8 encoded string data. In typical cases, the entirety of the string data is the -// source file text, and individual nodes with string properties reference their positional slice of the file text. In -// cases where a node's string property is not equal to the slice of file text at its position, the unique string is -// appended to the string data section after the file text. +// The string data section contains UTF-8 encoded string data, with WTF-8 used for JS strings containing lone UTF-16 +// surrogates. In typical cases, the entirety of the string data is the source file text, and individual nodes with +// string properties reference their positional slice of the file text. In cases where a node's string property is not +// equal to the slice of file text at its position, the unique string is appended to the string data section after the +// file text. // // Extended node data (variable) // ----------------------------- @@ -546,21 +547,21 @@ func recordExtendedData_SourceFile(node *ast.Node, strs *stringTable, positionMa func recordExtendedData_TemplateHead(node *ast.Node, strs *stringTable, positionMap *ast.PositionMap, extendedData *[]byte, structuredData *[]byte) { n := node.AsTemplateHead() - textIndex := strs.add(n.Text, node.Kind, node.Pos(), node.End()) + textIndex := strs.add(encodeTemplateTextForJS(n.Text, n.RawText), node.Kind, node.Pos(), node.End()) rawTextIndex := strs.add(n.RawText, node.Kind, node.Pos(), node.End()) *extendedData = appendUint32s(*extendedData, textIndex, rawTextIndex, uint32(n.TemplateFlags)) } func recordExtendedData_TemplateMiddle(node *ast.Node, strs *stringTable, positionMap *ast.PositionMap, extendedData *[]byte, structuredData *[]byte) { n := node.AsTemplateMiddle() - textIndex := strs.add(n.Text, node.Kind, node.Pos(), node.End()) + textIndex := strs.add(encodeTemplateTextForJS(n.Text, n.RawText), node.Kind, 
node.Pos(), node.End()) rawTextIndex := strs.add(n.RawText, node.Kind, node.Pos(), node.End()) *extendedData = appendUint32s(*extendedData, textIndex, rawTextIndex, uint32(n.TemplateFlags)) } func recordExtendedData_TemplateTail(node *ast.Node, strs *stringTable, positionMap *ast.PositionMap, extendedData *[]byte, structuredData *[]byte) { n := node.AsTemplateTail() - textIndex := strs.add(n.Text, node.Kind, node.Pos(), node.End()) + textIndex := strs.add(encodeTemplateTextForJS(n.Text, n.RawText), node.Kind, node.Pos(), node.End()) rawTextIndex := strs.add(n.RawText, node.Kind, node.Pos(), node.End()) *extendedData = appendUint32s(*extendedData, textIndex, rawTextIndex, uint32(n.TemplateFlags)) } @@ -704,7 +705,7 @@ func getNodeCommonData_SyntheticExpression(_ *ast.Node) uint32 { func recordExtendedData_StringLiteral(node *ast.Node, strs *stringTable, _ *ast.PositionMap, extendedData *[]byte, _ *[]byte) { n := node.AsStringLiteral() - textIndex := strs.add(n.Text, node.Kind, node.Pos(), node.End()) + textIndex := strs.add(encodeLiteralTextForJS(n.Text, node, strs), node.Kind, node.Pos(), node.End()) *extendedData = appendUint32s(*extendedData, textIndex, uint32(n.TokenFlags)) } @@ -728,6 +729,6 @@ func recordExtendedData_RegularExpressionLiteral(node *ast.Node, strs *stringTab func recordExtendedData_NoSubstitutionTemplateLiteral(node *ast.Node, strs *stringTable, _ *ast.PositionMap, extendedData *[]byte, _ *[]byte) { n := node.AsNoSubstitutionTemplateLiteral() - textIndex := strs.add(n.Text, node.Kind, node.Pos(), node.End()) + textIndex := strs.add(encodeLiteralTextForJS(n.Text, node, strs), node.Kind, node.Pos(), node.End()) *extendedData = appendUint32s(*extendedData, textIndex, uint32(n.TemplateFlags)) } diff --git a/internal/api/encoder/encoder_test.go b/internal/api/encoder/encoder_test.go index f676bf9eebd..7c6931122e3 100644 --- a/internal/api/encoder/encoder_test.go +++ b/internal/api/encoder/encoder_test.go @@ -53,6 +53,63 @@ func 
TestEncodeSourceFileWithUnicodeEscapes(t *testing.T) { }) } +func TestEncodeSourceFilePreservesSurrogateEscapes(t *testing.T) { + t.Parallel() + sourceFile := parser.ParseSourceFile(ast.SourceFileParseOptions{ + FileName: "/test.ts", + Path: "/test.ts", + }, `let s = "\uD83E\uDD80\uD800a\uDC00\uD7FF\uD801\uDBFF\uDFFF";`, core.ScriptKindTS) + + buf, err := encoder.EncodeSourceFile(sourceFile) + assert.NilError(t, err) + + text, ok := findExtendedNodeText(buf, ast.KindStringLiteral) + assert.Assert(t, ok) + assert.DeepEqual(t, text, []byte{ + 0xf0, 0x9f, 0xa6, 0x80, // \uD83E\uDD80 + 0xed, 0xa0, 0x80, // \uD800 + 'a', + 0xed, 0xb0, 0x80, // \uDC00 + 0xed, 0x9f, 0xbf, // \uD7FF + 0xed, 0xa0, 0x81, // \uD801 + 0xf4, 0x8f, 0xbf, 0xbf, // \uDBFF\uDFFF + }) +} + +func TestEncodeSourceFilePreservesTemplateSurrogateEscapes(t *testing.T) { + t.Parallel() + sourceFile := parser.ParseSourceFile(ast.SourceFileParseOptions{ + FileName: "/test.ts", + Path: "/test.ts", + }, "let s = `\\uD800${1}\\uDC00`;", core.ScriptKindTS) + + buf, err := encoder.EncodeSourceFile(sourceFile) + assert.NilError(t, err) + + headText, ok := findExtendedNodeText(buf, ast.KindTemplateHead) + assert.Assert(t, ok) + assert.DeepEqual(t, headText, []byte{0xed, 0xa0, 0x80}) + + tailText, ok := findExtendedNodeText(buf, ast.KindTemplateTail) + assert.Assert(t, ok) + assert.DeepEqual(t, tailText, []byte{0xed, 0xb0, 0x80}) +} + +func TestEncodeSourceFileFallsBackForUnterminatedSurrogateEscape(t *testing.T) { + t.Parallel() + sourceFile := parser.ParseSourceFile(ast.SourceFileParseOptions{ + FileName: "/test.ts", + Path: "/test.ts", + }, `let s = "\uD800a`, core.ScriptKindTS) + + buf, err := encoder.EncodeSourceFile(sourceFile) + assert.NilError(t, err) + + text, ok := findExtendedNodeText(buf, ast.KindStringLiteral) + assert.Assert(t, ok) + assert.DeepEqual(t, text, []byte("\ufffda")) +} + func BenchmarkEncodeSourceFile(b *testing.B) { repo.SkipIfNoTypeScriptSubmodule(b) filePath := 
filepath.Join(repo.TypeScriptSubmodulePath(), "src/compiler/checker.ts")
@@ -73,6 +130,28 @@ func readUint32(buf []byte, offset int) uint32 {
 	return binary.LittleEndian.Uint32(buf[offset : offset+4])
 }
 
+func findExtendedNodeText(encoded []byte, kind ast.Kind) ([]byte, bool) {
+	offsetExtended := readUint32(encoded, encoder.HeaderOffsetExtendedData)
+	offsetNodes := readUint32(encoded, encoder.HeaderOffsetNodes)
+	for i := int(offsetNodes) + encoder.NodeSize; i < len(encoded); i += encoder.NodeSize {
+		if ast.Kind(readUint32(encoded, i+encoder.NodeOffsetKind)) != kind {
+			continue
+		}
+		data := readUint32(encoded, i+encoder.NodeOffsetData)
+		textIndex := readUint32(encoded, int(offsetExtended+(data&encoder.NodeDataStringIndexMask)))
+		return encodedString(encoded, textIndex), true
+	}
+	return nil, false
+}
+
+func encodedString(encoded []byte, stringIndex uint32) []byte {
+	offsetStringOffsets := readUint32(encoded, encoder.HeaderOffsetStringOffsets)
+	offsetStrings := readUint32(encoded, encoder.HeaderOffsetStringData)
+	strStart := readUint32(encoded, int(offsetStringOffsets+stringIndex*4))
+	strEnd := readUint32(encoded, int(offsetStringOffsets+stringIndex*4)+4)
+	return encoded[offsetStrings+strStart : offsetStrings+strEnd]
+}
+
 func formatEncodedSourceFile(encoded []byte) string {
 	var result strings.Builder
 	var getIndent func(parentIndex uint32) string
diff --git a/internal/api/encoder/literal_text.go b/internal/api/encoder/literal_text.go
new file mode 100644
index 00000000000..4ef27c3fdba
--- /dev/null
+++ b/internal/api/encoder/literal_text.go
@@ -0,0 +1,278 @@
+package encoder
+
+import (
+	"strings"
+	"unicode/utf8"
+
+	"github.com/microsoft/typescript-go/internal/ast"
+	"github.com/microsoft/typescript-go/internal/scanner"
+)
+
+const (
+	surr1    = 0xd800
+	surr2    = 0xdc00
+	surr3    = 0xe000
+	surrSelf = 0x10000
+)
+
+// encodeLiteralTextForJS returns the text to encode for a quoted literal. When the literal's
+// source contains surrogate escapes, the source is re-decoded so lone surrogates are kept as
+// WTF-8; otherwise, or when the source cannot be decoded, text is returned unchanged.
+func encodeLiteralTextForJS(text string, node *ast.Node, strs *stringTable) string {
+	raw, ok := rawQuotedLiteralText(node, strs)
+	if !ok {
+		return text
+	}
+	decoded, hasSurrogate, ok := decodeQuotedLiteralText(raw)
+	if !ok || !hasSurrogate {
+		return text
+	}
+	return decoded
+}
+
+// rawQuotedLiteralText returns the node's source text, including both delimiters, when it
+// starts and ends with the same quote character.
+func rawQuotedLiteralText(node *ast.Node, strs *stringTable) (string, bool) {
+	if node.End() <= 0 || node.End() > len(strs.fileText) {
+		return "", false
+	}
+	start := scanner.SkipTrivia(strs.fileText, node.Pos())
+	if start >= node.End() {
+		return "", false
+	}
+	switch strs.fileText[start] {
+	case '\'', '"', '`':
+		if node.End()-start < 2 || strs.fileText[node.End()-1] != strs.fileText[start] {
+			return "", false
+		}
+		return strs.fileText[start:node.End()], true
+	default:
+		return "", false
+	}
+}
+
+// decodeQuotedLiteralText decodes the escapes between the delimiters of a quoted literal.
+func decodeQuotedLiteralText(raw string) (text string, hasSurrogate bool, ok bool) {
+	if len(raw) < 2 {
+		return "", false, false
+	}
+	// A backtick-delimited literal is a template, so its CR and CRLF line endings are
+	// normalized to LF, as encodeTemplateTextForJS does for template parts.
+	return decodeEscapedLiteralText(raw[1:len(raw)-1], raw[0] == '`')
+}
+
+// encodeTemplateTextForJS is the template-part counterpart of encodeLiteralTextForJS; it
+// decodes rawText instead of slicing the file text.
+func encodeTemplateTextForJS(text string, rawText string) string {
+	decoded, hasSurrogate, ok := decodeEscapedLiteralText(rawText, true)
+	if !ok || !hasSurrogate {
+		return text
+	}
+	return decoded
+}
+
+// decodeEscapedLiteralText decodes the escape sequences in raw. Escaped surrogate pairs are
+// combined into one code point, lone surrogates are written as WTF-8, and hasSurrogate reports
+// whether any surrogate escape was seen. ok is false when an escape cannot be decoded.
+func decodeEscapedLiteralText(raw string, normalizeTemplateLineEndings bool) (text string, hasSurrogate bool, ok bool) {
+	var out strings.Builder
+	for i := 0; i < len(raw); {
+		if raw[i] != '\\' {
+			if normalizeTemplateLineEndings && raw[i] == '\r' {
+				out.WriteByte('\n')
+				i++
+				if i < len(raw) && raw[i] == '\n' {
+					i++
+				}
+				continue
+			}
+			out.WriteByte(raw[i])
+			i++
+			continue
+		}
+		ch, next, ok := decodeEscape(raw, i, len(raw))
+		if !ok {
+			return "", false, false
+		}
+		if codePointIsHighSurrogate(ch) {
+			hasSurrogate = true
+			if nextCh, nextNext, ok := decodeUnicodeEscape(raw, next, len(raw)); ok && codePointIsLowSurrogate(nextCh) {
+				out.WriteRune(surrogatePairToCodepoint(ch, nextCh))
+				i = nextNext
+				continue
+			}
+		} else if codePointIsLowSurrogate(ch) {
+			hasSurrogate = true
+		}
+		out.WriteString(encodeCodePointForJS(ch))
+		i = next
+	}
+	return out.String(), hasSurrogate, true
+}
+
+// decodeEscape decodes the escape starting at the backslash raw[start]. It returns the decoded
+// code point (-1 for a line continuation, which produces no text) and the index after it.
+func decodeEscape(raw string, start int, end int) (rune, int, bool) {
+	if start+1 >= end {
+		return 0, 0, false
+	}
+	switch raw[start+1] {
+	case '0':
+		if start+2 >= end || !isDigit(raw[start+2]) {
+			return 0, start + 2, true
+		}
+		return decodeOctalEscape(raw, start, end, 3)
+	case '1', '2', '3':
+		return decodeOctalEscape(raw, start, end, 3)
+	case '4', '5', '6', '7':
+		return decodeOctalEscape(raw, start, end, 2)
+	case 'u':
+		return decodeUnicodeEscape(raw, start, end)
+	case 'x':
+		if start+4 > end {
+			return 0, 0, false
+		}
+		hi, ok := hexValue(raw[start+2])
+		if !ok {
+			return 0, 0, false
+		}
+		lo, ok := hexValue(raw[start+3])
+		if !ok {
+			return 0, 0, false
+		}
+		return rune(hi<<4 | lo), start + 4, true
+	case 'b':
+		return '\b', start + 2, true
+	case 't':
+		return '\t', start + 2, true
+	case 'n':
+		return '\n', start + 2, true
+	case 'v':
+		return '\v', start + 2, true
+	case 'f':
+		return '\f', start + 2, true
+	case 'r':
+		return '\r', start + 2, true
+	case '\r':
+		next := start + 2
+		if next < end && raw[next] == '\n' {
+			next++
+		}
+		return -1, next, true
+	case '\n':
+		return -1, start + 2, true
+	default:
+		ch, size := utf8.DecodeRuneInString(raw[start+1 : end])
+		if ch == utf8.RuneError && size == 0 {
+			return 0, 0, false
+		}
+		// A backslash before U+2028 or U+2029 is a line continuation too.
+		if ch == '\u2028' || ch == '\u2029' {
+			return -1, start + 1 + size, true
+		}
+		return ch, start + 1 + size, true
+	}
+}
+
+func decodeOctalEscape(raw string, start int, end int, maxDigits int) (rune, int, bool) {
+	next := start + 2
+	for digits := 1; digits < maxDigits && next < end && isOctalDigit(raw[next]); digits++ {
+		next++
+	}
+	return parseOctalEscape(raw[start+1 : next]), next, true
+}
+
+func parseOctalEscape(text string) rune {
+	value := rune(0)
+	for i := range len(text) {
+		value = value*8 + rune(text[i]-'0')
+	}
+	return value
+}
+
+func isDigit(b byte) bool {
+	return '0' <= b && b <= '9'
+}
+
+func isOctalDigit(b byte) bool {
+	return '0' <= b && b <= '7'
+}
+
+// decodeUnicodeEscape decodes a \uXXXX or \u{X...} escape starting at raw[start].
+func decodeUnicodeEscape(raw string, start int, end int) (rune, int, bool) {
+	if start+1 >= end || raw[start] != '\\' || raw[start+1] != 'u' {
+		return 0, 0, false
+	}
+	if start+2 < end && raw[start+2] == '{' {
+		value := 0
+		i := start + 3
+		for ; i < end && raw[i] != '}'; i++ {
+			digit, ok := hexValue(raw[i])
+			if !ok {
+				return 0, 0, false
+			}
+			value = value*16 + digit
+			// Reject as soon as the value is out of range so long digit runs cannot overflow.
+			if value > 0x10FFFF {
+				return 0, 0, false
+			}
+		}
+		if i >= end || raw[i] != '}' {
+			return 0, 0, false
+		}
+		return rune(value), i + 1, true
+	}
+	if start+6 > end {
+		return 0, 0, false
+	}
+	value := 0
+	for i := start + 2; i < start+6; i++ {
+		digit, ok := hexValue(raw[i])
+		if !ok {
+			return 0, 0, false
+		}
+		value = value*16 + digit
+	}
+	return rune(value), start + 6, true
+}
+
+func hexValue(b byte) (int, bool) {
+	switch {
+	case '0' <= b && b <= '9':
+		return int(b - '0'), true
+	case 'a' <= b && b <= 'f':
+		return int(b-'a') + 10, true
+	case 'A' <= b && b <= 'F':
+		return int(b-'A') + 10, true
+	default:
+		return 0, false
+	}
+}
+
+func codePointIsHighSurrogate(r rune) bool {
+	return surr1 <= r && r < surr2
+}
+
+func codePointIsLowSurrogate(r rune) bool {
+	return surr2 <= r && r < surr3
+}
+
+func surrogatePairToCodepoint(r1, r2 rune) rune {
+	return ((r1 - surr1) << 10) + (r2 - surr2) + surrSelf
+}
+
+// encodeCodePointForJS encodes r as UTF-8, using the three-byte WTF-8 form for surrogates.
+// Negative values (line continuations) encode to nothing.
+func encodeCodePointForJS(r rune) string {
+	if r < 0 {
+		return ""
+	}
+	if codePointIsHighSurrogate(r) || codePointIsLowSurrogate(r) {
+		return string([]byte{
+			0xed,
+			byte(0x80 | ((r >> 6) & 0x3f)),
+			byte(0x80 | (r & 0x3f)),
+		})
+	}
+	return string(r)
+}