Skip to content

Commit 8ef685e

Browse files
committed
address vacuum issue 863.
daveshanley/vacuum#863 added a pre-parser that is efficient and only activated in this edge case.
1 parent ff053d3 commit 8ef685e

3 files changed

Lines changed: 292 additions & 43 deletions

File tree

datamodel/spec_info.go

Lines changed: 121 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
"fmt"
1111
"strings"
1212
"time"
13+
"unicode/utf16"
14+
"unicode/utf8"
1315

1416
"github.com/pb33f/libopenapi/utils"
1517
"go.yaml.in/yaml/v4"
@@ -97,12 +99,11 @@ func extractSpecInfoInternal(spec []byte, bypass bool, skipJSON bool) (*SpecInfo
9799

98100
specInfo.NumLines = bytes.Count(spec, []byte{'\n'}) + 1
99101

100-
// Pre-process JSON to handle \/ escape sequences that YAML parser doesn't recognize.
101-
// JSON (RFC 8259) allows \/ as an optional escape for forward slash, but YAML does not.
102-
// See: https://github.com/pb33f/libopenapi/issues/479
102+
// Pre-process JSON escapes that YAML parsers do not accept even though
103+
// they are valid JSON, while preserving the existing YAML-node parse path.
103104
parseBytes := spec
104105
if specInfo.SpecFileType == JSONFileType {
105-
parseBytes = unescapeJSONSlashes(spec)
106+
parseBytes = normalizeJSONForYAMLParser(spec)
106107
}
107108

108109
err := yaml.Unmarshal(parseBytes, &parsedSpec)
@@ -324,39 +325,126 @@ func parseVersionTypeData(d interface{}) (string, int, error) {
324325
return string(r), int(r[0]) - '0', nil
325326
}
326327

327-
// unescapeJSONSlashes replaces the optional \/ escape sequence in JSON with /
328-
// JSON (RFC 8259) allows \/ as an optional escape for forward slash, but YAML
329-
// parsers (including go.yaml.in/yaml/v4) do not recognize it.
330-
// This handles escaped backslashes correctly: \\/ becomes \/ not //
331-
// Returns the original slice if no transformation is needed (zero allocation).
332-
func unescapeJSONSlashes(jsonBytes []byte) []byte {
333-
// fast path: check if transformation is needed
334-
if !bytes.Contains(jsonBytes, []byte(`\/`)) {
328+
// normalizeJSONForYAMLParser rewrites the small set of JSON escapes accepted by
329+
// RFC 8259 but rejected by go.yaml.in/yaml/v4. It returns the original slice
330+
// without allocation unless a rewrite is required.
331+
func normalizeJSONForYAMLParser(jsonBytes []byte) []byte {
332+
if bytes.IndexByte(jsonBytes, '\\') < 0 {
335333
return jsonBytes
336334
}
337335

338-
result := make([]byte, 0, len(jsonBytes))
339-
i := 0
340-
for i < len(jsonBytes) {
341-
if jsonBytes[i] == '\\' && i+1 < len(jsonBytes) {
342-
switch jsonBytes[i+1] {
343-
case '/':
344-
// \/ -> / (json optional escape for solidus)
345-
result = append(result, '/')
346-
i += 2
347-
case '\\':
348-
// preserve escaped backslash to prevent \\/ becoming //
349-
result = append(result, '\\', '\\')
350-
i += 2
351-
default:
352-
// preserve other escape sequences (\n, \t, \", etc.)
353-
result = append(result, jsonBytes[i])
354-
i++
355-
}
356-
} else {
357-
result = append(result, jsonBytes[i])
358-
i++
336+
var result []byte
337+
var runeBytes [utf8.UTFMax]byte
338+
last := 0
339+
scan := 0
340+
341+
for scan < len(jsonBytes) {
342+
rel := bytes.IndexByte(jsonBytes[scan:], '\\')
343+
if rel < 0 {
344+
break
345+
}
346+
347+
escape := scan + rel
348+
replacement, consumed, ok := jsonEscapeReplacement(jsonBytes, escape, &runeBytes)
349+
if !ok {
350+
scan = nextJSONEscapeScanOffset(jsonBytes, escape)
351+
continue
359352
}
353+
354+
if result == nil {
355+
result = make([]byte, 0, len(jsonBytes))
356+
}
357+
result = append(result, jsonBytes[last:escape]...)
358+
result = append(result, replacement...)
359+
scan = escape + consumed
360+
last = scan
361+
}
362+
363+
if result == nil {
364+
return jsonBytes
360365
}
366+
367+
result = append(result, jsonBytes[last:]...)
361368
return result
362369
}
370+
371+
func jsonEscapeReplacement(jsonBytes []byte, escape int, runeBytes *[utf8.UTFMax]byte) ([]byte, int, bool) {
372+
if escape+1 >= len(jsonBytes) {
373+
return nil, 0, false
374+
}
375+
376+
switch jsonBytes[escape+1] {
377+
case '/':
378+
runeBytes[0] = '/'
379+
return runeBytes[:1], 2, true
380+
case 'u':
381+
if escape+12 > len(jsonBytes) {
382+
return nil, 0, false
383+
}
384+
385+
high, ok := decodeJSONUnicodeEscape(jsonBytes[escape+2 : escape+6])
386+
if !ok || !isHighSurrogate(high) {
387+
return nil, 0, false
388+
}
389+
390+
lowEscape := escape + 6
391+
if jsonBytes[lowEscape] != '\\' || jsonBytes[lowEscape+1] != 'u' {
392+
return nil, 0, false
393+
}
394+
395+
low, ok := decodeJSONUnicodeEscape(jsonBytes[lowEscape+2 : lowEscape+6])
396+
if !ok || !isLowSurrogate(low) {
397+
return nil, 0, false
398+
}
399+
400+
r := utf16.DecodeRune(rune(high), rune(low))
401+
n := utf8.EncodeRune(runeBytes[:], r)
402+
return runeBytes[:n], 12, true
403+
default:
404+
return nil, 0, false
405+
}
406+
}
407+
408+
func nextJSONEscapeScanOffset(jsonBytes []byte, escape int) int {
409+
if escape+1 >= len(jsonBytes) {
410+
return escape + 1
411+
}
412+
return escape + 2
413+
}
414+
415+
func decodeJSONUnicodeEscape(hexBytes []byte) (uint16, bool) {
416+
if len(hexBytes) != 4 {
417+
return 0, false
418+
}
419+
420+
var value uint16
421+
for _, b := range hexBytes {
422+
hex, ok := jsonHexValue(b)
423+
if !ok {
424+
return 0, false
425+
}
426+
value = value<<4 | uint16(hex)
427+
}
428+
return value, true
429+
}
430+
431+
func jsonHexValue(b byte) (byte, bool) {
432+
switch {
433+
case b >= '0' && b <= '9':
434+
return b - '0', true
435+
case b >= 'a' && b <= 'f':
436+
return b - 'a' + 10, true
437+
case b >= 'A' && b <= 'F':
438+
return b - 'A' + 10, true
439+
default:
440+
return 0, false
441+
}
442+
}
443+
444+
func isHighSurrogate(value uint16) bool {
445+
return value >= 0xD800 && value <= 0xDBFF
446+
}
447+
448+
func isLowSurrogate(value uint16) bool {
449+
return value >= 0xDC00 && value <= 0xDFFF
450+
}

datamodel/spec_info_test.go

Lines changed: 137 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -454,9 +454,12 @@ $self: something`
454454
assert.Equal(t, "something", r.Self)
455455
}
456456

457-
// TestUnescapeJSONSlashes tests the unescapeJSONSlashes helper function
458-
// This addresses issue #479 where JSON files with \/ escape sequences fail to parse
459-
func TestUnescapeJSONSlashes(t *testing.T) {
457+
// TestNormalizeJSONForYAMLParser tests JSON escapes that are valid JSON but
458+
// rejected by the YAML parser used for YAML-node construction.
459+
func TestNormalizeJSONForYAMLParser(t *testing.T) {
460+
thumbsUp := string(rune(0x1F44D))
461+
rocket := string(rune(0x1F680))
462+
460463
tests := []struct {
461464
name string
462465
input string
@@ -475,22 +478,61 @@ func TestUnescapeJSONSlashes(t *testing.T) {
475478
{"other escapes preserved", `\n\t\/`, `\n\t/`},
476479
{"multiple escaped slashes", `\/one\/two\/three`, `/one/two/three`},
477480
{"mixed content", `{"path":"\/test","url":"https:\/\/example.com"}`, `{"path":"/test","url":"https://example.com"}`},
481+
{"valid surrogate pair", `\ud83d\udc4d`, thumbsUp},
482+
{"valid uppercase surrogate pair", `\uD83D\uDC4D`, thumbsUp},
483+
{"multiple surrogate pairs", `\ud83d\udc4d \ud83d\ude80`, thumbsUp + " " + rocket},
484+
{"surrogate pair with escaped slash", `https:\/\/example.com\/\ud83d\udc4d`, `https://example.com/` + thumbsUp},
485+
{"double escaped surrogate pair", `\\ud83d\\udc4d`, `\\ud83d\\udc4d`},
486+
{"trailing backslash", `\`, `\`},
487+
{"lone high surrogate", `\ud83d`, `\ud83d`},
488+
{"high surrogate without low escape", `\ud83dxxxxxx`, `\ud83dxxxxxx`},
489+
{"high surrogate followed by non-low surrogate", `\ud83d\u0041`, `\ud83d\u0041`},
490+
{"lone low surrogate", `\udc4d`, `\udc4d`},
491+
{"invalid high surrogate hex", `\ud83x\udc4d`, `\ud83x\udc4d`},
492+
{"invalid low surrogate hex", `\ud83d\udc4x`, `\ud83d\udc4x`},
493+
{"truncated unicode escape", `\u12`, `\u12`},
494+
{"non surrogate unicode escape", `\u003c`, `\u003c`},
478495
}
479496

480497
for _, tt := range tests {
481498
t.Run(tt.name, func(t *testing.T) {
482-
result := unescapeJSONSlashes([]byte(tt.input))
499+
result := normalizeJSONForYAMLParser([]byte(tt.input))
483500
assert.Equal(t, tt.expected, string(result))
484501
})
485502
}
486503
}
487504

488-
// TestUnescapeJSONSlashes_NoAllocation tests that the fast path returns original slice
489-
func TestUnescapeJSONSlashes_NoAllocation(t *testing.T) {
490-
input := []byte(`{"path":"/test"}`)
491-
result := unescapeJSONSlashes(input)
492-
// Should return same slice when no \/ present
493-
assert.Equal(t, &input[0], &result[0], "Should return original slice when no \\/ present")
505+
func TestDecodeJSONUnicodeEscape(t *testing.T) {
506+
value, ok := decodeJSONUnicodeEscape([]byte("D83D"))
507+
assert.True(t, ok)
508+
assert.Equal(t, uint16(0xD83D), value)
509+
510+
_, ok = decodeJSONUnicodeEscape([]byte("123"))
511+
assert.False(t, ok)
512+
513+
_, ok = decodeJSONUnicodeEscape([]byte("12xz"))
514+
assert.False(t, ok)
515+
}
516+
517+
// TestNormalizeJSONForYAMLParser_NoAllocation tests that unchanged inputs
518+
// return the original slice.
519+
func TestNormalizeJSONForYAMLParser_NoAllocation(t *testing.T) {
520+
tests := []string{
521+
`{"path":"/test"}`,
522+
`{"text":"line\nquoted\"tab\tunicode\u003c"}`,
523+
`{"text":"\\ud83d\\udc4d"}`,
524+
`{"text":"\ud83d"}`,
525+
`{"text":"\udc4d"}`,
526+
}
527+
528+
for _, tt := range tests {
529+
t.Run(tt, func(t *testing.T) {
530+
input := []byte(tt)
531+
result := normalizeJSONForYAMLParser(input)
532+
assert.Equal(t, tt, string(result))
533+
assert.Equal(t, &input[0], &result[0], "Should return original slice when no rewrite is needed")
534+
})
535+
}
494536
}
495537

496538
// TestExtractSpecInfo_JSON_EscapedSlashes tests issue #479
@@ -506,6 +548,49 @@ func TestExtractSpecInfo_JSON_EscapedSlashes(t *testing.T) {
506548
assert.Equal(t, utils.OpenApi3, r.SpecType)
507549
}
508550

551+
func TestExtractSpecInfo_JSON_SurrogatePairInExample(t *testing.T) {
552+
jsonWithSurrogatePair := `{
553+
"openapi": "3.0.1",
554+
"info": {"title": "r", "version": "1"},
555+
"paths": {
556+
"/t": {
557+
"post": {
558+
"operationId": "t",
559+
"responses": {
560+
"201": {
561+
"description": "ok",
562+
"content": {
563+
"application/json": {
564+
"schema": {"type": "object", "properties": {"x": {"type": "string"}}},
565+
"examples": {
566+
"e": {"value": {"x": "Hello \ud83d\udc4d"}}
567+
}
568+
}
569+
}
570+
}
571+
}
572+
}
573+
}
574+
}
575+
}`
576+
577+
r, e := ExtractSpecInfo([]byte(jsonWithSurrogatePair))
578+
assert.NoError(t, e)
579+
assert.Equal(t, "3.0.1", r.Version)
580+
assert.Equal(t, JSONFileType, r.SpecFileType)
581+
assert.Equal(t, utils.OpenApi3, r.SpecType)
582+
}
583+
584+
func TestExtractSpecInfo_JSON_SurrogatePairInDescription(t *testing.T) {
585+
jsonWithSurrogatePair := `{"openapi":"3.0.1","info":{"title":"r","version":"1","description":"Hello \ud83d\udc4d"},"paths":{}}`
586+
587+
r, e := ExtractSpecInfo([]byte(jsonWithSurrogatePair))
588+
assert.NoError(t, e)
589+
assert.Equal(t, "3.0.1", r.Version)
590+
assert.Equal(t, JSONFileType, r.SpecFileType)
591+
assert.Equal(t, utils.OpenApi3, r.SpecType)
592+
}
593+
509594
// TestExtractSpecInfo_JSON_EscapedSlashes_URL tests URL paths with escaped slashes
510595
func TestExtractSpecInfo_JSON_EscapedSlashes_URL(t *testing.T) {
511596
jsonWithURL := `{"openapi":"3.0.0","info":{"title":"Test","version":"1.0.0"},"servers":[{"url":"https:\/\/api.example.com\/v1"}],"paths":{}}`
@@ -619,6 +704,48 @@ func TestSpecInfo_Release(t *testing.T) {
619704
assert.Equal(t, "3.1.0", s.Version)
620705
}
621706

707+
var normalizeJSONForYAMLParserSink []byte
708+
709+
func BenchmarkNormalizeJSONForYAMLParser_NoEscapes(b *testing.B) {
710+
input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1"},"paths":{}}`)
711+
b.ReportAllocs()
712+
b.SetBytes(int64(len(input)))
713+
714+
for i := 0; i < b.N; i++ {
715+
normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input)
716+
}
717+
}
718+
719+
func BenchmarkNormalizeJSONForYAMLParser_CommonEscapesNoRewrite(b *testing.B) {
720+
input := []byte(`{"openapi":"3.0.1","info":{"title":"line\nquoted\"tab\tunicode\u003c","version":"1"},"paths":{}}`)
721+
b.ReportAllocs()
722+
b.SetBytes(int64(len(input)))
723+
724+
for i := 0; i < b.N; i++ {
725+
normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input)
726+
}
727+
}
728+
729+
func BenchmarkNormalizeJSONForYAMLParser_EscapedSlashes(b *testing.B) {
730+
input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1"},"servers":[{"url":"https:\/\/api.example.com\/v1"}],"paths":{"\/test":{}}}`)
731+
b.ReportAllocs()
732+
b.SetBytes(int64(len(input)))
733+
734+
for i := 0; i < b.N; i++ {
735+
normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input)
736+
}
737+
}
738+
739+
func BenchmarkNormalizeJSONForYAMLParser_SurrogatePair(b *testing.B) {
740+
input := []byte(`{"openapi":"3.0.1","info":{"title":"r","version":"1","description":"Hello \ud83d\udc4d"},"paths":{}}`)
741+
b.ReportAllocs()
742+
b.SetBytes(int64(len(input)))
743+
744+
for i := 0; i < b.N; i++ {
745+
normalizeJSONForYAMLParserSink = normalizeJSONForYAMLParser(input)
746+
}
747+
}
748+
622749
func TestSpecInfo_Release_Nil(t *testing.T) {
623750
var s *SpecInfo
624751
s.Release() // must not panic

0 commit comments

Comments
 (0)