From a5cbeef497234dcfc24627ca1ef40a61a1ab2c32 Mon Sep 17 00:00:00 2001 From: Kevin McDonald Date: Wed, 20 May 2026 08:22:35 +0200 Subject: [PATCH] Add protoscope language, assembler, disassembler Adds `protoscope`, a human-readable text format for raw Protobuf binary wire payloads, to `protocompile`. It enables writing, parsing, compiling (assembly), and decompiling (disassembly) without requiring a schema. It supports multi-frame documents (separated by `---`) and transport framing (gRPC, ConnectRPC, and varint-delimited). - **Parser & AST (`internal/protoscope/parser`, `internal/protoscope/ast`)**: Parses tokens into an AST of fields, blocks (groups/length-delimited), values, and options. - **Assembler (`internal/protoscope/assembler`)**: Compiles AST nodes into raw Protobuf wire format. - **Disassembler (`internal/protoscope/disassembler`)**: Decompiles binary Protobuf payloads back to protoscope text format using heuristics. - **Public API (`protoscope`)**: Exposes public endpoints (`Assemble`, `Disassemble`, `Diagnostics`, `Hover`, and `Possibilities`) for integrations (e.g. Buf LSP). Handles frame splitting/joining, flags parsing, and context-aware hover info. - **Fuzz tests**: Adds fuzz tests and a few related fixes found as a result of the fuzz testing. The changes in `experimental/internal/lexer/number.go` are an example of changes made to avoid a DoS vector that was revealed by this testing. 
--- .golangci.yml | 3 + Makefile | 8 + experimental/internal/lexer/lexer.go | 2 + experimental/internal/lexer/loop.go | 4 +- experimental/internal/lexer/number.go | 10 +- .../protoscope/assembler/assembler.go | 262 ++++++ .../protoscope/assembler/assembler_test.go | 74 ++ .../protoscope/assembler/fuzz_test.go | 58 ++ .../protoscope/assembler/integration_test.go | 276 +++++++ experimental/internal/protoscope/ast/ast.go | 254 ++++++ .../protoscope/disassembler/disassembler.go | 418 ++++++++++ .../disassembler/disassembler_test.go | 43 + .../protoscope/disassembler/fuzz_test.go | 67 ++ .../protoscope/disassembler/heuristic_test.go | 100 +++ .../protoscope/disassembler/possibilities.go | 352 ++++++++ .../disassembler/possibilities_test.go | 132 +++ .../internal/protoscope/parser/fuzz_test.go | 36 + .../internal/protoscope/parser/parser.go | 201 +++++ .../internal/protoscope/parser/parser_test.go | 113 +++ .../testdata/fuzz/FuzzParse/21e7c81290725f5b | 2 + .../protoscope/testdata/all_types.proto | 40 + .../testdata/all_types_proto2.proto | 12 + .../internal/protoscope/testdata/all_wires.pb | Bin 0 -> 247 bytes .../protoscope/testdata/all_wires.protoscope | 84 ++ .../internal/protoscope/testdata/fixed.pb | 1 + .../protoscope/testdata/fixed.protoscope | 2 + .../internal/protoscope/testdata/group.pb | 1 + .../protoscope/testdata/group.protoscope | 3 + .../internal/protoscope/testdata/nested.pb | 1 + .../protoscope/testdata/nested.protoscope | 2 + .../internal/protoscope/testdata/normal.pb | 2 + .../protoscope/testdata/normal.protoscope | 4 + .../internal/protoscope/testdata/packed.pb | 2 + .../protoscope/testdata/packed.protoscope | 1 + .../protoscope/testdata/recursive.protoscope | 3 + .../internal/protoscope/testdata/simple.pb | 1 + .../protoscope/testdata/simple.protoscope | 1 + .../option/cel_literals.proto.stderr.txt | 18 +- experimental/protoscope/fuzz_test.go | 74 ++ experimental/protoscope/protoscope.go | 774 ++++++++++++++++++ 
experimental/protoscope/protoscope_test.go | 446 ++++++++++ .../fuzz/FuzzDisassemble/5955458d53084c9e | 3 + experimental/report/diagnostic.go | 12 + experimental/report/report.go | 10 + experimental/token/keyword/keyword.go | 14 - experimental/token/number.go | 5 +- internal/decimal/decimal.go | 5 + 47 files changed, 3914 insertions(+), 22 deletions(-) create mode 100644 experimental/internal/protoscope/assembler/assembler.go create mode 100644 experimental/internal/protoscope/assembler/assembler_test.go create mode 100644 experimental/internal/protoscope/assembler/fuzz_test.go create mode 100644 experimental/internal/protoscope/assembler/integration_test.go create mode 100644 experimental/internal/protoscope/ast/ast.go create mode 100644 experimental/internal/protoscope/disassembler/disassembler.go create mode 100644 experimental/internal/protoscope/disassembler/disassembler_test.go create mode 100644 experimental/internal/protoscope/disassembler/fuzz_test.go create mode 100644 experimental/internal/protoscope/disassembler/heuristic_test.go create mode 100644 experimental/internal/protoscope/disassembler/possibilities.go create mode 100644 experimental/internal/protoscope/disassembler/possibilities_test.go create mode 100644 experimental/internal/protoscope/parser/fuzz_test.go create mode 100644 experimental/internal/protoscope/parser/parser.go create mode 100644 experimental/internal/protoscope/parser/parser_test.go create mode 100644 experimental/internal/protoscope/parser/testdata/fuzz/FuzzParse/21e7c81290725f5b create mode 100644 experimental/internal/protoscope/testdata/all_types.proto create mode 100644 experimental/internal/protoscope/testdata/all_types_proto2.proto create mode 100644 experimental/internal/protoscope/testdata/all_wires.pb create mode 100644 experimental/internal/protoscope/testdata/all_wires.protoscope create mode 100644 experimental/internal/protoscope/testdata/fixed.pb create mode 100644 
experimental/internal/protoscope/testdata/fixed.protoscope create mode 100644 experimental/internal/protoscope/testdata/group.pb create mode 100644 experimental/internal/protoscope/testdata/group.protoscope create mode 100644 experimental/internal/protoscope/testdata/nested.pb create mode 100644 experimental/internal/protoscope/testdata/nested.protoscope create mode 100644 experimental/internal/protoscope/testdata/normal.pb create mode 100644 experimental/internal/protoscope/testdata/normal.protoscope create mode 100644 experimental/internal/protoscope/testdata/packed.pb create mode 100644 experimental/internal/protoscope/testdata/packed.protoscope create mode 100644 experimental/internal/protoscope/testdata/recursive.protoscope create mode 100644 experimental/internal/protoscope/testdata/simple.pb create mode 100644 experimental/internal/protoscope/testdata/simple.protoscope create mode 100644 experimental/protoscope/fuzz_test.go create mode 100644 experimental/protoscope/protoscope.go create mode 100644 experimental/protoscope/protoscope_test.go create mode 100644 experimental/protoscope/testdata/fuzz/FuzzDisassemble/5955458d53084c9e diff --git a/.golangci.yml b/.golangci.yml index 777bd5f3..ff5f2b17 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,6 +1,8 @@ version: "2" linters: default: all + enable: + - gomodguard_v2 disable: # TODO: TCN-350 - initial exclusions for failing linters. # Should enable all of these? @@ -20,6 +22,7 @@ linters: - goconst - gocyclo - godoclint + - gomodguard - interfacebloat - modernize - nestif diff --git a/Makefile b/Makefile index a28d368d..45c93439 100644 --- a/Makefile +++ b/Makefile @@ -60,6 +60,14 @@ test: $(PROTOC) ## Run unit tests $(GO) test $(if $(filter 386,$(GOARCH)),,-race) -cover ./... $(GO) test -tags protolegacy ./... 
+.PHONY: fuzz +fuzz: $(PROTOC) ## Run fuzz tests + $(GO) test -v -fuzz=FuzzRoundTrip -fuzztime=30s ./experimental/internal/protoscope/assembler + $(GO) test -v -fuzz=FuzzParse -fuzztime=30s ./experimental/internal/protoscope/parser + $(GO) test -v -fuzz=FuzzDisassemble -fuzztime=30s ./experimental/internal/protoscope/disassembler + $(GO) test -v -fuzz=FuzzAssemble -fuzztime=30s ./experimental/protoscope + $(GO) test -v -fuzz=FuzzDisassemble -fuzztime=30s ./experimental/protoscope + .PHONY: benchmarks benchmarks: $(PROTOC) ## Run benchmarks $(GO) test -bench=. -benchmem -v ./experimental/benchmark diff --git a/experimental/internal/lexer/lexer.go b/experimental/internal/lexer/lexer.go index 5b0ea24c..5f45abab 100644 --- a/experimental/internal/lexer/lexer.go +++ b/experimental/internal/lexer/lexer.go @@ -92,6 +92,8 @@ type Lexer struct { EscapePartialX bool // Partial \xN escapes. EscapeUppercaseX bool // The unusual \XNN escape. EscapeOldStyleUnicode bool // Old-style Unicode escapes \uXXXX and \UXXXXXXXX. + + AllowBacktickStrings bool // If true, backticks ` can enclose string/hex literals. } // Lex runs lexical analysis on file and returns a new token stream as a result. diff --git a/experimental/internal/lexer/loop.go b/experimental/internal/lexer/loop.go index d3cf9016..62c8d2fb 100644 --- a/experimental/internal/lexer/loop.go +++ b/experimental/internal/lexer/loop.go @@ -187,14 +187,14 @@ func loop(l *lexer) { r := l.pop() switch { - case r == '"', r == '\'': + case r == '"', r == '\'', (r == '`' && l.AllowBacktickStrings): l.cursor-- // Back up to behind the quote before resuming. lexString(l, "") case l.NumberCanStartWithDot && r == '.', unicode.IsDigit(r): // Back up behind the rune we just popped. l.cursor -= utf8.RuneLen(r) - lexNumber(l) + _ = lexNumber(l) case unicodex.IsXIDStart(r): // Back up behind the rune we just popped. 
diff --git a/experimental/internal/lexer/number.go b/experimental/internal/lexer/number.go index 100a734c..0fc1f078 100644 --- a/experimental/internal/lexer/number.go +++ b/experimental/internal/lexer/number.go @@ -81,7 +81,7 @@ func lexNumber(l *lexer) token.Token { token.MutateMeta[tokenmeta.Number](tok).Base = base } - isFloat := taxa.IsFloatText(digits) + isFloat := taxa.IsFloatText(tok.Text()) expBase := 1 expIdx := -1 if isFloat { @@ -190,6 +190,12 @@ func lexNumber(l *lexer) token.Token { goto fail } + // Cap the exponent to prevent extremely large values from causing hangs in v.Int() + // or other operations. 1,000,000 is still quite large but manageable. + if v.IsInf() || v.IsNaN() || v.Exp() > 1000000 || v.Exp() < -1000000 { + goto fail + } + // We want this to overflow to Infinity as needed, which Float64 // will do for us. Otherwise it will ties-to-even as the // protobuf.com spec requires. @@ -219,7 +225,7 @@ func lexNumber(l *lexer) token.Token { case result.big != nil: token.MutateMeta[tokenmeta.Number](tok).Big = new(decimal.Decimal).ReuseInt(result.big) - case base == 10 && !result.hasThousands: + case base == 10 && !result.hasThousands && suffix == "" && prefix == "": // We explicitly do not call SetValue for the most common case of base // 10 integers, because that is handled for us on-demand in AsInt. This // is a memory consumption optimization. diff --git a/experimental/internal/protoscope/assembler/assembler.go b/experimental/internal/protoscope/assembler/assembler.go new file mode 100644 index 00000000..d53154d1 --- /dev/null +++ b/experimental/internal/protoscope/assembler/assembler.go @@ -0,0 +1,262 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package assembler + +import ( + "encoding/binary" + "encoding/hex" + "math" + "strings" + "unicode" + + "github.com/bufbuild/protocompile/experimental/id" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/ast" + "github.com/bufbuild/protocompile/experimental/seq" + "github.com/bufbuild/protocompile/experimental/token" + "github.com/bufbuild/protocompile/experimental/token/keyword" +) + +// Assemble translates a protoscope AST into Protobuf wire format. +func Assemble(file *ast.File) []byte { + a := &assembler{} + for decl := range seq.Values(file.Decls()) { + a.assembleDecl(decl, false) + } + return a.buf +} + +type assembler struct { + buf []byte +} + +func (a *assembler) assembleDecl(decl ast.DeclAny, inBlock bool) { + switch decl.Kind() { + case ast.DeclKindField: + a.assembleField(id.Wrap(decl.Context(), id.ID[ast.Field](decl.ID().Value()))) + case ast.DeclKindLiteral: + a.assembleLiteral(id.Wrap(decl.Context(), id.ID[ast.Literal](decl.ID().Value())), inBlock) + case ast.DeclKindBlock: + a.assembleBlock(id.Wrap(decl.Context(), id.ID[ast.Block](decl.ID().Value())), inBlock) + } +} + +func (a *assembler) assembleField(f ast.Field) { + tag, _ := f.Tag().AsNumber().Int() + wireType := uint64(0) + val := f.Value() + if !val.IsZero() { + switch val.Kind() { + case ast.DeclKindBlock: + block := id.Wrap(val.Context(), id.ID[ast.Block](val.ID().Value())) + if block.Token().Keyword() == keyword.Bang { + wireType = 3 // SGROUP + } else { + wireType = 2 // LEN + } + case ast.DeclKindLiteral: + lit := id.Wrap(val.Context(), 
id.ID[ast.Literal](val.ID().Value())) + if lit.Token().Kind() == token.String { + wireType = 2 // LEN + } else if lit.Token().Kind() == token.Number { + num := lit.Token().AsNumber() + suffix := num.Suffix().Text() + switch suffix { + case "i64", "f64": + wireType = 1 // I64 + case "i32", "f32": + wireType = 5 // I32 + default: + if num.IsFloat() { + wireType = 1 // I64 (double) + } + } + } + } + } + + // Override with explicit wire type hint. + switch f.WireType().Text() { + case "VARINT": + wireType = 0 + case "I64": + wireType = 1 + case "LEN": + wireType = 2 + case "SGROUP": + wireType = 3 + case "EGROUP": + wireType = 4 + case "I32": + wireType = 5 + } + + a.writeVarint(tag<<3 | wireType) + + if wireType == 4 { // EGROUP + return + } + + if wireType == 3 { // SGROUP + a.assembleDecl(val, false) + a.writeVarint(tag<<3 | 4) // Emit matching EGROUP + return + } + + if wireType == 2 { // LEN + // Length-delimited fields must be prefixed by their length. + // Some values (blocks and strings) are self-length-delimited. + isSelfDelimited := false + if !val.IsZero() && val.Kind() == ast.DeclKindBlock { + isSelfDelimited = true + } + if !val.IsZero() && val.Kind() == ast.DeclKindLiteral { + lit := id.Wrap(val.Context(), id.ID[ast.Literal](val.ID().Value())) + if lit.Token().Kind() == token.String { + isSelfDelimited = true + } + } + + if isSelfDelimited { + a.assembleDecl(val, false) + } else { + sub := &assembler{} + sub.assembleDecl(val, false) + a.writeVarint(uint64(len(sub.buf))) + a.buf = append(a.buf, sub.buf...) 
+ } + return + } + + a.assembleDecl(val, false) +} + +func (a *assembler) assembleLiteral(l ast.Literal, inBlock bool) { + tok := l.Token() + switch tok.Keyword() { + case keyword.True: + a.writeVarint(1) + return + case keyword.False: + a.writeVarint(0) + return + } + + switch tok.Kind() { + case token.Number: + // Check for suffix hints + num := tok.AsNumber() + suffix := num.Suffix().Text() + switch suffix { + case "i32": + var buf [4]byte + if num.IsFloat() { + f, _ := num.Float() + binary.LittleEndian.PutUint32(buf[:], math.Float32bits(float32(f))) + } else { + v, _ := num.Int() + binary.LittleEndian.PutUint32(buf[:], uint32(v)) + } + a.buf = append(a.buf, buf[:]...) + case "f32": + f, _ := num.Float() + var buf [4]byte + binary.LittleEndian.PutUint32(buf[:], math.Float32bits(float32(f))) + a.buf = append(a.buf, buf[:]...) + case "i64": + var buf [8]byte + if num.IsFloat() { + f, _ := num.Float() + binary.LittleEndian.PutUint64(buf[:], math.Float64bits(f)) + } else { + v, _ := num.Int() + binary.LittleEndian.PutUint64(buf[:], v) + } + a.buf = append(a.buf, buf[:]...) + case "f64": + f, _ := num.Float() + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], math.Float64bits(f)) + a.buf = append(a.buf, buf[:]...) + case "z": + v := num.Value().Int(nil).Int64() + // Zigzag encoding: (n << 1) ^ (n >> 63) + zigzag := uint64((v << 1) ^ (v >> 63)) + a.writeVarint(zigzag) + default: + if num.IsFloat() { + f, _ := num.Float() + var buf [8]byte + binary.LittleEndian.PutUint64(buf[:], math.Float64bits(f)) + a.buf = append(a.buf, buf[:]...) 
+ } else { + v, _ := num.Int() + a.writeVarint(v) + } + } + case token.String: + open, _ := tok.AsString().Quotes() + isHex := open.Text() == "`" + var contentBytes []byte + if isHex { + // Decode hex string + var sb strings.Builder + for _, r := range tok.AsString().Text() { + if !unicode.IsSpace(r) { + sb.WriteRune(r) + } + } + var err error + contentBytes, err = hex.DecodeString(sb.String()) + if err != nil { + // Fallback to raw string text if decoding fails + contentBytes = []byte(tok.AsString().Text()) + } + } else { + contentBytes = []byte(tok.AsString().Text()) + } + + if inBlock { + a.buf = append(a.buf, contentBytes...) + } else { + a.writeVarint(uint64(len(contentBytes))) + a.buf = append(a.buf, contentBytes...) + } + } +} + +func (a *assembler) assembleBlock(b ast.Block, _ bool) { + tok := b.Token() + switch tok.Keyword() { + case keyword.LBracket, keyword.Brackets, keyword.LBrace, keyword.Braces: + // Length-prefixed block + sub := &assembler{} + for decl := range seq.Values(b.Decls()) { + sub.assembleDecl(decl, true) + } + a.writeVarint(uint64(len(sub.buf))) + a.buf = append(a.buf, sub.buf...) + case keyword.Bang: + // Group content (no length prefix) + for decl := range seq.Values(b.Decls()) { + a.assembleDecl(decl, false) + } + } +} + +func (a *assembler) writeVarint(v uint64) { + var buf [10]byte + n := binary.PutUvarint(buf[:], v) + a.buf = append(a.buf, buf[:n]...) +} diff --git a/experimental/internal/protoscope/assembler/assembler_test.go b/experimental/internal/protoscope/assembler/assembler_test.go new file mode 100644 index 00000000..736bcfd8 --- /dev/null +++ b/experimental/internal/protoscope/assembler/assembler_test.go @@ -0,0 +1,74 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package assembler + +import ( + "bytes" + "testing" + + "github.com/bufbuild/protocompile/experimental/internal/protoscope/parser" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/source" +) + +func TestAssemble(t *testing.T) { + tests := []struct { + name string + input string + expected []byte + }{ + { + name: "simple varint", + input: "1: 150", + expected: []byte{0x08, 0x96, 0x01}, + }, + { + name: "boolean true", + input: "1: true", + expected: []byte{0x08, 0x01}, + }, + { + name: "string literal", + input: `1: "testing"`, + expected: []byte{0x0a, 0x07, 't', 'e', 's', 't', 'i', 'n', 'g'}, + }, + { + name: "nested block", + input: "4: [ 1: 42 ]", + expected: []byte{0x22, 0x02, 0x08, 0x2a}, + }, + { + name: "nested group", + input: "5: !{ 1: 42 }", + expected: []byte{0x2b, 0x08, 0x2a, 0x2c}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + src := source.NewFile("test.protoscope", tt.input) + r := &report.Report{} + file, ok := parser.Parse("test.protoscope", src, r) + if !ok { + t.Fatalf("failed to parse: %v", r.Diagnostics) + } + + got := Assemble(file) + if !bytes.Equal(got, tt.expected) { + t.Errorf("Assemble() = %x, want %x", got, tt.expected) + } + }) + } +} diff --git a/experimental/internal/protoscope/assembler/fuzz_test.go b/experimental/internal/protoscope/assembler/fuzz_test.go new file mode 100644 index 00000000..b761a2f1 --- /dev/null +++ b/experimental/internal/protoscope/assembler/fuzz_test.go @@ -0,0 +1,58 @@ +// Copyright 2020-2026 Buf 
Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package assembler + +import ( + "bytes" + "testing" + + "github.com/bufbuild/protocompile/experimental/internal/protoscope/disassembler" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/parser" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/source" +) + +func FuzzRoundTrip(f *testing.F) { + f.Add([]byte{0x08, 0x96, 0x01}) + f.Add([]byte{0x0a, 0x08, 'J', 'o', 'h', 'n', ' ', 'D', 'o', 'e'}) + + f.Fuzz(func(_ *testing.T, data []byte) { + // 1. Disassemble bytes to text + var buf bytes.Buffer + if err := disassembler.Disassemble(data, &buf); err != nil { + return + } + text := buf.String() + + // 2. Parse text back to AST + src := source.NewFile("fuzz.protoscope", text) + r := &report.Report{} + file, ok := parser.Parse("fuzz.protoscope", src, r) + if !ok { + // This might happen if the disassembler outputs something that the parser + // doesn't support yet, or if the input was invalid and the disassembler + // produced "invalid" text (like unsupported wire types). + // For now we just return, but in a mature implementation this should be rare. + return + } + + // 3. Assemble AST back to bytes + _ = Assemble(file) + + // Note: we don't strictly require that re-assembled bytes match the original + // because of heuristics in disassembly and potential ambiguity in wire types + // if not explicitly tagged. 
The goal here is stability (no panics). + }) +} diff --git a/experimental/internal/protoscope/assembler/integration_test.go b/experimental/internal/protoscope/assembler/integration_test.go new file mode 100644 index 00000000..8bf22827 --- /dev/null +++ b/experimental/internal/protoscope/assembler/integration_test.go @@ -0,0 +1,276 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package assembler + +import ( + "bytes" + "context" + "os" + "path/filepath" + "strings" + "testing" + + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/reflect/protoreflect" + "google.golang.org/protobuf/types/dynamicpb" + + "github.com/bufbuild/protocompile" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/disassembler" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/parser" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/source" +) + +func TestTestDataIntegration(t *testing.T) { + files, err := filepath.Glob("../testdata/*.protoscope") + if err != nil { + t.Fatal(err) + } + + for _, file := range files { + name := filepath.Base(file) + t.Run(name, func(t *testing.T) { + content, err := os.ReadFile(file) + if err != nil { + t.Fatal(err) + } + + src := source.NewFile(name, string(content)) + r := &report.Report{} + parsed, ok := parser.Parse(name, src, r) + if !ok { + t.Fatalf("failed to parse: %v", r.Diagnostics) + 
} + + gotBytes := Assemble(parsed) + + pbFile := strings.TrimSuffix(file, ".protoscope") + ".pb" + if _, err := os.Stat(pbFile); err == nil { + expectedBytes, err := os.ReadFile(pbFile) + if err != nil { + t.Fatal(err) + } + if !bytes.Equal(gotBytes, expectedBytes) { + t.Errorf("assembled bytes mismatch for %s\ngot: %x\nwant: %x", name, gotBytes, expectedBytes) + } + } else { + t.Logf("no corresponding .pb file for %s", name) + } + }) + } +} + +func TestDisassemblerRoundTrip(t *testing.T) { + files, err := filepath.Glob("../testdata/*.pb") + if err != nil { + t.Fatal(err) + } + + for _, pbFile := range files { + name := filepath.Base(pbFile) + t.Run(name, func(t *testing.T) { + originalBytes, err := os.ReadFile(pbFile) + if err != nil { + t.Fatal(err) + } + + // 1. Disassemble + var buf bytes.Buffer + if err := disassembler.Disassemble(originalBytes, &buf); err != nil { + t.Fatalf("Disassemble failed: %v", err) + } + disassembledText := buf.String() + t.Logf("Disassembled text:\n%s", disassembledText) + + // 2. Parse + src := source.NewFile(name+".protoscope", disassembledText) + r := &report.Report{} + parsed, ok := parser.Parse(name+".protoscope", src, r) + if !ok { + t.Fatalf("failed to parse disassembled text: %v", r.Diagnostics) + } + + // 3. Assemble + gotBytes := Assemble(parsed) + + // 4. 
Compare + if !bytes.Equal(gotBytes, originalBytes) { + t.Errorf("round-trip failed for %s\ngot: %x\nwant: %x", name, gotBytes, originalBytes) + } + }) + } +} + +func TestAllTypesDynamic(t *testing.T) { + // Compile the two proto files dynamically + compiler := &protocompile.Compiler{ + Resolver: &protocompile.SourceResolver{ + ImportPaths: []string{"../testdata"}, + }, + } + ctx := context.Background() + fds, err := compiler.Compile(ctx, "all_types.proto", "all_types_proto2.proto") + if err != nil { + t.Fatalf("failed to compile proto files: %v", err) + } + + var allTypesFD, allTypesProto2FD protoreflect.FileDescriptor + for _, fd := range fds { + if fd.Path() == "all_types.proto" { + allTypesFD = fd + } else if fd.Path() == "all_types_proto2.proto" { + allTypesProto2FD = fd + } + } + + if allTypesFD == nil || allTypesProto2FD == nil { + t.Fatal("failed to find compiled file descriptors") + } + + // 1. Fill and test AllTypes (proto3) + t.Run("AllTypes (proto3)", func(t *testing.T) { + md := allTypesFD.Messages().ByName("AllTypes") + msg := dynamicpb.NewMessage(md) + + // Set values for every single field type + msg.Set(md.Fields().ByNumber(1), protoreflect.ValueOfFloat64(123.456)) + msg.Set(md.Fields().ByNumber(2), protoreflect.ValueOfFloat32(78.9)) + msg.Set(md.Fields().ByNumber(3), protoreflect.ValueOfInt32(-42)) + msg.Set(md.Fields().ByNumber(4), protoreflect.ValueOfInt64(-123456789)) + msg.Set(md.Fields().ByNumber(5), protoreflect.ValueOfUint32(42)) + msg.Set(md.Fields().ByNumber(6), protoreflect.ValueOfUint64(123456789)) + msg.Set(md.Fields().ByNumber(7), protoreflect.ValueOfInt32(-10)) + msg.Set(md.Fields().ByNumber(8), protoreflect.ValueOfInt64(-1000)) + msg.Set(md.Fields().ByNumber(9), protoreflect.ValueOfUint32(55)) + msg.Set(md.Fields().ByNumber(10), protoreflect.ValueOfUint64(66)) + msg.Set(md.Fields().ByNumber(11), protoreflect.ValueOfInt32(-77)) + msg.Set(md.Fields().ByNumber(12), protoreflect.ValueOfInt64(-88)) + msg.Set(md.Fields().ByNumber(13), 
protoreflect.ValueOfBool(true)) + msg.Set(md.Fields().ByNumber(14), protoreflect.ValueOfString("hello world")) + msg.Set(md.Fields().ByNumber(15), protoreflect.ValueOfBytes([]byte{0x01, 0x02, 0x03})) + + nestedMd := md.Fields().ByNumber(16).Message() + nestedMsg := dynamicpb.NewMessage(nestedMd) + nestedMsg.Set(nestedMd.Fields().ByNumber(1), protoreflect.ValueOfInt32(999)) + msg.Set(md.Fields().ByNumber(16), protoreflect.ValueOfMessage(nestedMsg)) + + msg.Set(md.Fields().ByNumber(17), protoreflect.ValueOfEnum(1)) + + repeatedField := msg.Mutable(md.Fields().ByNumber(18)).List() + repeatedField.Append(protoreflect.ValueOfInt32(11)) + repeatedField.Append(protoreflect.ValueOfInt32(22)) + repeatedField.Append(protoreflect.ValueOfInt32(33)) + + mapField := msg.Mutable(md.Fields().ByNumber(19)).Map() + mapField.Set(protoreflect.MapKey(protoreflect.ValueOfString("key1")), protoreflect.ValueOfString("val1")) + mapField.Set(protoreflect.MapKey(protoreflect.ValueOfString("key2")), protoreflect.ValueOfString("val2")) + + originalBytes, err := proto.Marshal(msg) + if err != nil { + t.Fatalf("Marshal failed: %v", err) + } + + verifyRoundTrip(t, "all_types_proto3", originalBytes) + }) + + // 2. 
Fill and test AllTypesProto2 (proto2) for groups and unpacked fields + t.Run("AllTypesProto2 (proto2)", func(t *testing.T) { + md2 := allTypesProto2FD.Messages().ByName("AllTypesProto2") + msg2 := dynamicpb.NewMessage(md2) + + groupMd := md2.Fields().ByNumber(1).Message() + groupList := msg2.Mutable(md2.Fields().ByNumber(1)).List() + + groupMsg1 := dynamicpb.NewMessage(groupMd) + groupMsg1.Set(groupMd.Fields().ByNumber(2), protoreflect.ValueOfInt32(50)) + groupList.Append(protoreflect.ValueOfMessage(groupMsg1)) + + groupMsg2 := dynamicpb.NewMessage(groupMd) + groupMsg2.Set(groupMd.Fields().ByNumber(2), protoreflect.ValueOfInt32(60)) + groupList.Append(protoreflect.ValueOfMessage(groupMsg2)) + + unpackedList := msg2.Mutable(md2.Fields().ByNumber(3)).List() + unpackedList.Append(protoreflect.ValueOfInt32(70)) + unpackedList.Append(protoreflect.ValueOfInt32(80)) + + originalBytes, err := proto.Marshal(msg2) + if err != nil { + t.Fatalf("Marshal failed: %v", err) + } + + verifyRoundTrip(t, "all_types_proto2", originalBytes) + }) +} + +func TestHeuristicEdgeCases(t *testing.T) { + // Edge Case 1: Printable string looking like 4 fields (false positive for isMessage) + // ASCII string: " 0(10283" + // Parsed as fields: + // - tag 4, wire 0, value 48 (0x20 0x30) + // - tag 5, wire 0, value 49 (0x28 0x31) + // - tag 6, wire 0, value 50 (0x30 0x32) + // - tag 7, wire 0, value 51 (0x38 0x33) + // Wrap as length-delimited field 1. Total bytes: 0a 08 20 30 28 31 30 32 38 33 + t.Run("printable string false positive message", func(t *testing.T) { + bytesVal := []byte{0x0a, 0x08, 0x20, 0x30, 0x28, 0x31, 0x30, 0x32, 0x38, 0x33} + verifyRoundTrip(t, "printable_str_looks_like_msg", bytesVal) + }) + + // Edge Case 2: Submessage looking like printable string (false negative for isMessage) + // Bytes: 0a 02 30 32 + // (Field 1, length 2, content: 0x30 0x32) + // Inside submessage: tag 6, wire 0, value 50 (ASCII "02"). 
+ // Classified as string because length <= 3 fields and 100% printable. + t.Run("submessage false negative string", func(t *testing.T) { + bytesVal := []byte{0x0a, 0x02, 0x30, 0x32} + verifyRoundTrip(t, "submsg_looks_like_str", bytesVal) + }) + + // Edge Case 3: Submessage containing a group (false negative for isMessage) + // Bytes: 0a 04 0b 10 03 0c + // (Field 1, length 4, content: 0x0b 0x10 0x03 0x0c) + // Inside submessage: tag 1, wireType 3 (SGROUP), tag 2, wireType 0, value 3, tag 1, wireType 4 (EGROUP) + // Classified as hex string because contains groups. + t.Run("submessage containing group", func(t *testing.T) { + bytesVal := []byte{0x0a, 0x04, 0x0b, 0x10, 0x03, 0x0c} + verifyRoundTrip(t, "submsg_with_group", bytesVal) + }) +} + +func verifyRoundTrip(t *testing.T, name string, originalBytes []byte) { + // 1. Disassemble + var buf bytes.Buffer + if err := disassembler.Disassemble(originalBytes, &buf); err != nil { + t.Fatalf("Disassemble failed: %v", err) + } + disassembledText := buf.String() + t.Logf("[%s] Disassembled:\n%s", name, disassembledText) + + // 2. Parse + src := source.NewFile(name+".protoscope", disassembledText) + r := &report.Report{} + parsed, ok := parser.Parse(name+".protoscope", src, r) + if !ok { + t.Fatalf("failed to parse disassembled text: %v", r.Diagnostics) + } + + // 3. Assemble + gotBytes := Assemble(parsed) + + // 4. Compare + if !bytes.Equal(gotBytes, originalBytes) { + t.Errorf("round-trip failed for %s\ngot: %x\nwant: %x", name, gotBytes, originalBytes) + } +} diff --git a/experimental/internal/protoscope/ast/ast.go b/experimental/internal/protoscope/ast/ast.go new file mode 100644 index 00000000..f1f32a96 --- /dev/null +++ b/experimental/internal/protoscope/ast/ast.go @@ -0,0 +1,254 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
package ast

import (
	"github.com/bufbuild/protocompile/experimental/id"
	"github.com/bufbuild/protocompile/experimental/seq"
	"github.com/bufbuild/protocompile/experimental/source"
	"github.com/bufbuild/protocompile/experimental/token"
	"github.com/bufbuild/protocompile/internal/arena"
)

// Compile-time assertions that the AST types implement the interfaces they
// are used through.
var _ id.Context = (*File)(nil)
var _ source.Spanner = DeclAny{}
var _ source.Spanner = Field{}
var _ source.Spanner = Literal{}
var _ source.Spanner = Block{}

// File is the root of a protoscope AST.
//
// It records the path and token stream it was created with, owns the storage
// for every node (see [Nodes]), and holds the sequence of top-level
// declarations.
type File struct {
	path   string
	stream *token.Stream
	nodes  Nodes

	decls id.DynSeq[DeclAny, DeclKind, *File]
}

// New creates a new File with the given path and token stream.
//
// The returned file's node storage is linked back to the file so that nodes
// created through [File.Nodes] are wrapped with this file as their context.
func New(path string, stream *token.Stream) *File {
	f := &File{
		path:   path,
		stream: stream,
	}
	f.nodes.file = f
	return f
}

// FromID implements id.Context.
//
// It dispatches on the dynamic type of want (a pointer-to-pointer to one of
// the raw node types) and dereferences id in the matching arena. Any other
// type yields nil.
func (f *File) FromID(id uint64, want any) any {
	switch want.(type) {
	case **rawField:
		return f.nodes.fields.Deref(arena.Pointer[rawField](id))
	case **rawLiteral:
		return f.nodes.literals.Deref(arena.Pointer[rawLiteral](id))
	case **rawBlock:
		return f.nodes.blocks.Deref(arena.Pointer[rawBlock](id))
	default:
		return nil
	}
}

// Path returns the path to this file.
func (f *File) Path() string {
	return f.path
}

// Stream returns the token stream for this file.
func (f *File) Stream() *token.Stream {
	return f.stream
}

// Nodes returns a [Nodes] that can be used to construct new nodes in this file.
func (f *File) Nodes() *Nodes {
	return &f.nodes
}

// Decls returns the top-level declarations in this file.
func (f *File) Decls() seq.Inserter[DeclAny] {
	return f.decls.Inserter(f)
}

// DeclAny represents any protoscope declaration.
//
// Use Kind together with AsField, AsLiteral or AsBlock to recover the
// concrete node.
type DeclAny id.DynNode[DeclAny, DeclKind, *File]

// AsField converts a DeclAny into a Field, if that is the declaration it contains.
//
// Otherwise the zero Field is returned.
func (d DeclAny) AsField() Field {
	if d.Kind() != DeclKindField {
		return Field{}
	}
	return id.Wrap(d.Context(), id.ID[Field](d.ID().Value()))
}

// AsLiteral converts a DeclAny into a Literal, if that is the declaration it contains.
//
// Otherwise the zero Literal is returned.
func (d DeclAny) AsLiteral() Literal {
	if d.Kind() != DeclKindLiteral {
		return Literal{}
	}
	return id.Wrap(d.Context(), id.ID[Literal](d.ID().Value()))
}

// AsBlock converts a DeclAny into a Block, if that is the declaration it contains.
//
// Otherwise the zero Block is returned.
func (d DeclAny) AsBlock() Block {
	if d.Kind() != DeclKindBlock {
		return Block{}
	}
	return id.Wrap(d.Context(), id.ID[Block](d.ID().Value()))
}

// Span implements source.Spanner by delegating to the concrete node selected
// by Kind. A declaration of unknown kind yields the zero span.
func (d DeclAny) Span() source.Span {
	switch d.Kind() {
	case DeclKindField:
		return d.AsField().Span()
	case DeclKindLiteral:
		return d.AsLiteral().Span()
	case DeclKindBlock:
		return d.AsBlock().Span()
	default:
		return source.Span{}
	}
}

// DeclKind identifies which concrete node type a [DeclAny] holds.
type DeclKind byte

// The kinds of declaration a [DeclAny] can hold. DeclKindUnknown is the zero
// value.
const (
	DeclKindUnknown DeclKind = iota
	DeclKindField
	DeclKindLiteral
	DeclKindBlock
)

// DecodeDynID recovers the kind from the low half of a dynamic ID; the high
// half is ignored here.
func (k DeclKind) DecodeDynID(lo, _ int32) DeclKind {
	return DeclKind(lo)
}

// EncodeDynID packs this kind into the low half of a dynamic ID and value
// into the high half. It always reports ok.
func (k DeclKind) EncodeDynID(value int32) (lo, hi int32, ok bool) {
	return int32(k), value, true
}

// Nodes provides storage for the various AST node types, and can be used
// to construct new ones.
type Nodes struct {
	file *File // owning file; used as the context of every node created here

	fields   arena.Arena[rawField]
	literals arena.Arena[rawLiteral]
	blocks   arena.Arena[rawBlock]
}

// NewField creates a new Field node.
//
// The tag, wire type and value are stored by ID; args.WireType may be the
// zero token when no wire type was written.
func (n *Nodes) NewField(args FieldArgs) Field {
	idField := n.fields.NewCompressed(rawField{
		tag:      args.Tag.ID(),
		wireType: args.WireType.ID(),
		value:    args.Value.ID(),
	})
	return id.Wrap(n.file, id.ID[Field](idField))
}

// NewLiteral creates a new Literal node.
//
// The literal refers to t by ID.
func (n *Nodes) NewLiteral(t token.Token) Literal {
	idLiteral := n.literals.NewCompressed(rawLiteral{
		token: t.ID(),
	})
	return id.Wrap(n.file, id.ID[Literal](idLiteral))
}

// NewBlock creates a new Block node.
//
// The block refers to t by ID and starts with no declarations; add them
// through [Block.Decls].
func (n *Nodes) NewBlock(t token.Token) Block {
	idBlock := n.blocks.NewCompressed(rawBlock{
		token: t.ID(),
	})
	return id.Wrap(n.file, id.ID[Block](idBlock))
}

// Field represents a tag expression: `Tag:WireType Value` or `Tag: Value`.
type Field id.Node[Field, *File, *rawField]

// rawField is the arena-stored representation of a [Field].
type rawField struct {
	tag      token.ID
	wireType token.ID // Optional, may be zero
	value    id.Dyn[DeclAny, DeclKind]
}

// FieldArgs is the argument struct for [Nodes.NewField].
type FieldArgs struct {
	Tag      token.Token
	WireType token.Token // Optional
	Value    DeclAny
}

// Tag returns the token holding the field's tag.
func (f Field) Tag() token.Token {
	return id.Wrap(f.Context().Stream(), f.Raw().tag)
}

// WireType returns the token holding the field's wire type. The stored ID is
// zero when no wire type was given.
func (f Field) WireType() token.Token {
	return id.Wrap(f.Context().Stream(), f.Raw().wireType)
}

// Value returns the declaration that forms the field's value.
func (f Field) Value() DeclAny {
	return id.WrapDyn(f.Context(), f.Raw().value)
}

// AsAny type-erases this field into a [DeclAny] of kind DeclKindField.
func (f Field) AsAny() DeclAny {
	return id.WrapDyn(f.Context(), id.NewDyn(DeclKindField, id.ID[DeclAny](f.ID())))
}

// Span implements source.Spanner by joining the spans of the tag, the wire
// type and the value.
func (f Field) Span() source.Span {
	return source.Join(f.Tag(), f.WireType(), f.Value())
}

// Literal represents a single literal value (string, number, boolean, or hex).
type Literal id.Node[Literal, *File, *rawLiteral]

// rawLiteral is the arena-stored representation of a [Literal].
type rawLiteral struct {
	token token.ID
}

// Token returns the token this literal was created from.
func (l Literal) Token() token.Token {
	return id.Wrap(l.Context().Stream(), l.Raw().token)
}

// AsAny type-erases this literal into a [DeclAny] of kind DeclKindLiteral.
func (l Literal) AsAny() DeclAny {
	return id.WrapDyn(l.Context(), id.NewDyn(DeclKindLiteral, id.ID[DeclAny](l.ID())))
}

// Span implements source.Spanner; it is the span of the literal's token.
func (l Literal) Span() source.Span {
	return l.Token().Span()
}

// Block represents a sequence of declarations enclosed in brackets or braces.
// For example: `[...]` (length-prefixed) or `!{...}` (group).
type Block id.Node[Block, *File, *rawBlock]

// rawBlock is the arena-stored representation of a [Block].
type rawBlock struct {
	token token.ID
	decls id.DynSeq[DeclAny, DeclKind, *File]
}

// Token returns the token this block was created from.
func (b Block) Token() token.Token {
	return id.Wrap(b.Context().Stream(), b.Raw().token)
}

// Decls returns the declarations contained in this block.
func (b Block) Decls() seq.Inserter[DeclAny] {
	return b.Raw().decls.Inserter(b.Context())
}

// AsAny type-erases this block into a [DeclAny] of kind DeclKindBlock.
func (b Block) AsAny() DeclAny {
	return id.WrapDyn(b.Context(), id.NewDyn(DeclKindBlock, id.ID[DeclAny](b.ID())))
}

// Span implements source.Spanner; it is the span of the block's token.
func (b Block) Span() source.Span {
	return b.Token().Span()
}
+ +package disassembler + +import ( + "encoding/binary" + "errors" + "fmt" + "io" + "strconv" + "strings" + "unicode" + "unicode/utf8" +) + +// Options contains disassembly options. +type Options struct { + ExplicitWireTypes bool + ExplicitLengthPrefixes bool + NoGroups bool + MaxDepth int +} + +// Disassemble translates Protobuf wire format into protoscope text. +func Disassemble(data []byte, out io.Writer) error { + return DisassembleWithOptions(data, out, Options{}) +} + +// DisassembleWithOptions translates Protobuf wire format into protoscope text with options. +func DisassembleWithOptions(data []byte, out io.Writer, opts Options) error { + d := &disassembler{data: data, opts: opts} + return d.disassemble(out, 0, 0, 0) +} + +type disassembler struct { + data []byte + off int + opts Options +} + +const ( + wireVarint = 0 + wireI64 = 1 + wireLen = 2 + wireSGroup = 3 + wireEGroup = 4 + wireI32 = 5 + + defaultMaxDepth = 10 +) + +var wireTypeNames = [...]string{ + wireVarint: "VARINT ", + wireI64: "I64 ", + wireLen: "LEN ", + wireSGroup: "SGROUP ", + wireEGroup: "EGROUP ", + wireI32: "I32 ", +} + +func (d *disassembler) disassemble(out io.Writer, indent int, groupTag uint64, depth int) error { + limit := d.opts.MaxDepth + if limit <= 0 { + limit = defaultMaxDepth + } + if depth > limit { + return errors.New("max depth exceeded") + } + + for d.off < len(d.data) { + u, n := binary.Uvarint(d.data[d.off:]) + if n <= 0 { + // Not a valid varint, dump remaining as hex + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprintln(out, "# Error: invalid varint tag") + return d.dumpHex(out, indent) + } + + tag := u >> 3 + wireType := u & 0x7 + + // If we're in a group and see an EGroup with the same tag, we're done. + if groupTag != 0 && wireType == wireEGroup && tag == groupTag { + if !d.opts.NoGroups { + d.off += n + return nil + } + } + + if wireType > 5 || tag == 0 { + // Invalid wire type, this isn't a protobuf stream or it's corrupted. 
+ fmt.Fprint(out, strings.Repeat(" ", indent)) + if tag == 0 { + fmt.Fprintln(out, "# Error: invalid tag 0; this might be using a different framing (e.g. gRPC)") + } else { + fmt.Fprintf(out, "# Error: invalid wire type %d; this might be corrupted or using a different framing (e.g. gRPC)\n", wireType) + } + return d.dumpHex(out, indent) + } + + d.off += n + + tagPart := fmt.Sprintf("%d:", tag) + if d.opts.ExplicitWireTypes { + tagPart += wireTypeNames[wireType] + } else { + tagPart += " " + } + + var valStr string + var comment string + var err error + + switch wireType { + case wireVarint: + v, vn := binary.Uvarint(d.data[d.off:]) + if vn <= 0 { + err = fmt.Errorf("invalid varint at offset %d", d.off) + } else { + d.off += vn + valStr = strconv.FormatUint(v, 10) + } + case wireI64: + valStr, comment, err = d.disassembleI64() + case wireLen: + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprint(out, tagPart) + err = d.disassembleLen(out, indent, depth) + if err == nil { + continue // Already handled line/block + } + case wireSGroup: + if d.opts.NoGroups { + fmt.Fprintln(out) + continue + } + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprint(out, tagPart) + err = d.disassembleSGroup(out, indent, tag, depth) + if err == nil { + continue + } + case wireEGroup: + if d.opts.NoGroups { + fmt.Fprintln(out) + continue + } + // Should have been handled above if matching. 
+ valStr = "(unmatched EGroup)" + case wireI32: + valStr, comment, err = d.disassembleI32() + default: + valStr = fmt.Sprintf("(unsupported wire type %d)", wireType) + } + + if err != nil { + return err + } + + if valStr != "" { + fmt.Fprint(out, strings.Repeat(" ", indent)) + line := tagPart + valStr + fmt.Fprint(out, line) + if comment != "" { + // Align to column 30 (including indentation) + currentPos := indent*2 + len(line) + padding := 30 - currentPos + if padding < 1 { + padding = 1 + } + fmt.Fprint(out, strings.Repeat(" ", padding)) + fmt.Fprint(out, "# ") + fmt.Fprint(out, comment) + } + fmt.Fprintln(out) + } + } + return nil +} + +func (d *disassembler) disassembleI64() (string, string, error) { + if d.off+8 > len(d.data) { + return "", "", errors.New("unexpected EOF reading I64") + } + payload := d.data[d.off : d.off+8] + v := binary.LittleEndian.Uint64(payload) + d.off += 8 + + reps := possibilitiesI64(payload) + var floatRep *Representation + for _, r := range reps { + if r.Type == "float64" { + floatRep = &r + break + } + } + + hexVal := fmt.Sprintf("0x%016xi64", v) + if floatRep != nil && floatRep.Likelihood >= 0.5 { + return floatRep.Text, hexVal, nil + } else if floatRep != nil { + return hexVal, floatRep.Text, nil + } + return hexVal, "", nil +} + +func (d *disassembler) disassembleI32() (string, string, error) { + if d.off+4 > len(d.data) { + return "", "", errors.New("unexpected EOF reading I32") + } + payload := d.data[d.off : d.off+4] + v := binary.LittleEndian.Uint32(payload) + d.off += 4 + + reps := possibilitiesI32(payload) + var floatRep *Representation + for _, r := range reps { + if r.Type == "float32" { + floatRep = &r + break + } + } + + hexVal := fmt.Sprintf("0x%08xi32", v) + if floatRep != nil && floatRep.Likelihood >= 0.5 { + return floatRep.Text, hexVal, nil + } else if floatRep != nil { + return hexVal, floatRep.Text, nil + } + return hexVal, "", nil +} + +func (d *disassembler) disassembleLen(out io.Writer, indent, depth int) 
error { + l, n := binary.Uvarint(d.data[d.off:]) + if n <= 0 { + return fmt.Errorf("invalid length at offset %d", d.off) + } + d.off += n + if l > uint64(len(d.data)-d.off) { + return fmt.Errorf("length %d out of bounds", l) + } + payload := d.data[d.off : d.off+int(l)] + d.off += int(l) + + if d.opts.ExplicitLengthPrefixes { + fmt.Fprintf(out, "%d ", l) + } + + // Heuristic: Prefer string if it's cleanly printable and not obviously a message. + switch { + case isPrintable(payload) && !isMessage(payload): + fmt.Fprintf(out, "{%q}\n", string(payload)) + case isMessage(payload): + fmt.Fprint(out, "{\n") + sub := &disassembler{data: payload, opts: d.opts} + if err := sub.disassemble(out, indent+1, 0, depth+1); err != nil { + if err.Error() == "max depth exceeded" { + return err + } + // If recursion fails, fall back to hex for this payload + fmt.Fprintf(out, " (fallback) `%s`", toHexSpace(payload)) + } + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprint(out, "}\n") + default: + fmt.Fprintf(out, "{`%s`}\n", toHexSpace(payload)) + } + return nil +} + +func (d *disassembler) disassembleSGroup(out io.Writer, indent int, tag uint64, depth int) error { + fmt.Fprint(out, "!{\n") + if err := d.disassemble(out, indent+1, tag, depth+1); err != nil { + return err + } + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprint(out, "}\n") + return nil +} + +func (d *disassembler) dumpHex(out io.Writer, indent int) error { + if d.off >= len(d.data) { + return nil + } + fmt.Fprint(out, strings.Repeat(" ", indent)) + fmt.Fprintf(out, "`%s`\n", toHexSpace(d.data[d.off:])) + d.off = len(d.data) + return nil +} + +func checkMessageStructure(data []byte) (ok bool, fields int) { + if len(data) == 0 { + return false, 0 + } + off := 0 + for off < len(data) { + u, n := binary.Uvarint(data[off:]) + if n <= 0 { + return false, 0 + } + off += n + wireType := u & 0x7 + tag := u >> 3 + if wireType > 5 || tag == 0 { + return false, 0 + } + fields++ + switch wireType { + case wireVarint: + 
_, n = binary.Uvarint(data[off:]) + if n <= 0 { + return false, 0 + } + off += n + case wireI64: + if off+8 > len(data) { + return false, 0 + } + off += 8 + case wireLen: + l, n := binary.Uvarint(data[off:]) + if n <= 0 { + return false, 0 + } + off += n + if l > uint64(len(data)-off) { + return false, 0 + } + off += int(l) + case wireSGroup, wireEGroup: + // Groups are not supported for simple structural check here + return false, 0 + case wireI32: + if off+4 > len(data) { + return false, 0 + } + off += 4 + default: + return false, 0 + } + } + return off == len(data), fields +} + +func isMessage(data []byte) bool { + ok, fields := checkMessageStructure(data) + if !ok || fields == 0 { + return false + } + // Heuristic: if it has many fields, it's likely a message even if it looks like a string. + if fields > 3 { + return true + } + // If it's short and mostly printable, it's likely a string. + if isMostlyPrintable(data) { + return false + } + return true +} + +func isPrintable(data []byte) bool { + if len(data) == 0 { + return false + } + if !utf8.Valid(data) { + return false + } + for _, r := range string(data) { + if !unicode.IsPrint(r) && !unicode.IsSpace(r) { + return false + } + } + return true +} + +func isMostlyPrintable(data []byte) bool { + if len(data) == 0 { + return false + } + if !utf8.Valid(data) { + return false + } + printable := 0 + total := 0 + for _, r := range string(data) { + total++ + if unicode.IsPrint(r) || unicode.IsSpace(r) { + printable++ + } + } + if total == 0 { + return false + } + return printable*10 > total*8 +} + +func toHexSpace(data []byte) string { + var sb strings.Builder + for i, b := range data { + if i > 0 { + sb.WriteByte(' ') + } + fmt.Fprintf(&sb, "%02x", b) + } + return sb.String() +} diff --git a/experimental/internal/protoscope/disassembler/disassembler_test.go b/experimental/internal/protoscope/disassembler/disassembler_test.go new file mode 100644 index 00000000..9275507e --- /dev/null +++ 
b/experimental/internal/protoscope/disassembler/disassembler_test.go @@ -0,0 +1,43 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disassembler + +import ( + "bytes" + "testing" +) + +func TestMaxDepth(t *testing.T) { + // Nested groups: 0x0b = SGROUP tag 1, 0x0c = EGROUP tag 1 + // 3 levels of nesting + data := []byte{0x0b, 0x0b, 0x0b, 0x0c, 0x0c, 0x0c} + + var buf bytes.Buffer + // With MaxDepth: 2, should exceed the limit and error + err := DisassembleWithOptions(data, &buf, Options{MaxDepth: 2}) + if err == nil { + t.Error("expected error with MaxDepth: 2, got nil") + } else if err.Error() != "max depth exceeded" { + t.Errorf("expected 'max depth exceeded', got %v", err) + } + + // With MaxDepth: 1, should also exceed and error + err = DisassembleWithOptions(data, &buf, Options{MaxDepth: 1}) + if err == nil { + t.Error("expected error with MaxDepth: 1, got nil") + } else if err.Error() != "max depth exceeded" { + t.Errorf("expected 'max depth exceeded', got %v", err) + } +} diff --git a/experimental/internal/protoscope/disassembler/fuzz_test.go b/experimental/internal/protoscope/disassembler/fuzz_test.go new file mode 100644 index 00000000..a79db6f2 --- /dev/null +++ b/experimental/internal/protoscope/disassembler/fuzz_test.go @@ -0,0 +1,67 @@ +// Copyright 2020-2026 Buf Technologies, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disassembler + +import ( + "io" + "os" + "path/filepath" + "testing" +) + +func FuzzDisassemble(f *testing.F) { + // Add baseline corpus with option permutations + baselineSeeds := [][]byte{ + {0x08, 0x96, 0x01}, // simple varint + {0x0a, 0x03, 0x01, 0x02, 0x03}, // packed + {0x0b, 0x10, 0x03, 0x0c}, // group + } + for _, seed := range baselineSeeds { + for _, explicitWireTypes := range []bool{false, true} { + for _, explicitLengthPrefixes := range []bool{false, true} { + for _, noGroups := range []bool{false, true} { + f.Add(seed, explicitWireTypes, explicitLengthPrefixes, noGroups) + } + } + } + } + + // Dynamically find and load complex .pb files from testdata directory + files, err := filepath.Glob("../testdata/*.pb") + if err == nil { + for _, file := range files { + data, err := os.ReadFile(file) + if err != nil { + continue + } + for _, explicitWireTypes := range []bool{false, true} { + for _, explicitLengthPrefixes := range []bool{false, true} { + for _, noGroups := range []bool{false, true} { + f.Add(data, explicitWireTypes, explicitLengthPrefixes, noGroups) + } + } + } + } + } + + f.Fuzz(func(_ *testing.T, data []byte, explicitWireTypes, explicitLengthPrefixes, noGroups bool) { + opts := Options{ + ExplicitWireTypes: explicitWireTypes, + ExplicitLengthPrefixes: explicitLengthPrefixes, + NoGroups: noGroups, + } + _ = DisassembleWithOptions(data, io.Discard, opts) + }) +} diff --git 
a/experimental/internal/protoscope/disassembler/heuristic_test.go b/experimental/internal/protoscope/disassembler/heuristic_test.go new file mode 100644 index 00000000..9fed2800 --- /dev/null +++ b/experimental/internal/protoscope/disassembler/heuristic_test.go @@ -0,0 +1,100 @@ +package disassembler + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDisassembleHeuristicComments(t *testing.T) { + tests := []struct { + name string + data []byte + expected []string // substrings we expect to find + }{ + { + name: "Likely Float64", + data: func() []byte { + return []byte{ + // Tag 1, I64 + 0x09, + // 0x405edd2f1a9fbe77 -> 123.456 + 0x77, 0xbe, 0x9f, 0x1a, 0x2f, 0xdd, 0x5e, 0x40, + } + }(), + expected: []string{ + "1: 123.456 # 0x405edd2f1a9fbe77i64", + }, + }, + { + name: "Likely Float32", + data: func() []byte { + return []byte{ + // Tag 2, I32 + 0x15, + // 78.9f32 -> 0x429dcccd + 0xcd, 0xcc, 0x9d, 0x42, + } + }(), + expected: []string{ + "2: 78.9i32 # 0x429dcccdi32", + }, + }, + { + name: "Ambiguous I64 (Text)", + data: func() []byte { + return []byte{ + // Tag 12, I64 + 0x61, + // "ram@nibl" -> 72 61 6d 40 6e 69 62 6c + 0x72, 0x61, 0x6d, 0x40, 0x6e, 0x69, 0x62, 0x6c, + } + }(), + expected: []string{ + "12: 1.239664294489405e214 # 0x6c62696e406d6172i64", + }, + }, + { + name: "Ambiguous I32 (Text)", + data: func() []byte { + return []byte{ + // Tag 12, I32 + 0x65, + // "ting" -> 74 69 6e 67 + 0x67, 0x6e, 0x69, 0x74, + } + }(), + expected: []string{ + "12: 7.397732e31i32 # 0x74696e67i32", + }, + }, + { + name: "NaN Float64", + data: func() []byte { + return []byte{ + // Tag 1, I64 + 0x09, + // NaN -> 0xffffffffffffffa8 + 0xa8, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + } + }(), + expected: []string{ + "1: 0xffffffffffffffa8i64 # NaN", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var buf bytes.Buffer + err := DisassembleWithOptions(tt.data, &buf, 
Options{}) + require.NoError(t, err) + output := buf.String() + for _, exp := range tt.expected { + assert.Contains(t, output, exp) + } + }) + } +} diff --git a/experimental/internal/protoscope/disassembler/possibilities.go b/experimental/internal/protoscope/disassembler/possibilities.go new file mode 100644 index 00000000..6aace4d1 --- /dev/null +++ b/experimental/internal/protoscope/disassembler/possibilities.go @@ -0,0 +1,352 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package disassembler + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "sort" + "strconv" + "strings" + "unicode/utf8" +) + +// Representation represents a possible translation/formatting of a protobuf value. +type Representation struct { + Type string // E.g., "message", "string", "bytes", "varint", "zigzag", "bool", "fixed32", "float32", "fixed64", "float64", "packed_varint", "packed_fixed32", "packed_fixed64" + Text string // The protoscope textual value representation + Description string // Human-readable description + Likelihood float64 // Likelihood score, between 0.0 and 1.0 (higher is more likely) +} + +// Possibilities analyzes the raw payload bytes for a given wire type and returns +// all valid alternative representations sorted by likelihood. 
+func Possibilities(wireType int, payload []byte) []Representation { + var reps []Representation + + switch wireType { + case wireVarint: + reps = possibilitiesVarint(payload) + case wireI32: + reps = possibilitiesI32(payload) + case wireI64: + reps = possibilitiesI64(payload) + case wireLen: + reps = possibilitiesLen(payload) + } + + sort.Slice(reps, func(i, j int) bool { + if reps[i].Likelihood == reps[j].Likelihood { + return reps[i].Description < reps[j].Description + } + return reps[i].Likelihood > reps[j].Likelihood + }) + + return reps +} + +func possibilitiesVarint(payload []byte) []Representation { + val, n := binary.Uvarint(payload) + if n <= 0 || n < len(payload) { + return nil + } + + var reps []Representation + + // 1. Unsigned Varint (Decimal) + reps = append(reps, Representation{ + Type: "varint", + Text: strconv.FormatUint(val, 10), + Description: "Varint", + Likelihood: 0.9, + }) + + // 2. Zigzag Varint + zz := int64(val>>1) ^ -int64(val&1) + reps = append(reps, Representation{ + Type: "zigzag", + Text: strconv.FormatInt(zz, 10), + Description: "Zigzag Varint", + Likelihood: 0.7, + }) + + // 3. Boolean + switch val { + case 0: + reps = append(reps, Representation{ + Type: "bool", + Text: "false", + Description: "Boolean", + Likelihood: 0.8, + }) + case 1: + reps = append(reps, Representation{ + Type: "bool", + Text: "true", + Description: "Boolean", + Likelihood: 0.8, + }) + } + + return reps +} + +func possibilitiesI32(payload []byte) []Representation { + if len(payload) != 4 { + return nil + } + val := binary.LittleEndian.Uint32(payload) + + reps := make([]Representation, 0, 3) + + // 1. Fixed32 (Hex) + reps = append(reps, Representation{ + Type: "fixed32", + Text: fmt.Sprintf("0x%08xi32", val), + Description: "Fixed32 (Hex)", + Likelihood: 0.9, + }) + + // 2. Fixed32 (Decimal) + reps = append(reps, Representation{ + Type: "fixed32", + Text: fmt.Sprintf("%di32", int32(val)), + Description: "Fixed32 (Decimal)", + Likelihood: 0.8, + }) + + // 3. 
Float32 + fval := math.Float32frombits(val) + f64 := float64(fval) + text := fmt.Sprintf("%g", fval) + text = strings.Replace(text, "e+", "e", 1) + text += "i32" + if !strings.Contains(text, ".") && !strings.Contains(text, "e") && !math.IsNaN(f64) && !math.IsInf(f64, 0) { + text = fmt.Sprintf("%.1fi32", fval) + } + var likelihood float64 + switch { + case math.IsNaN(f64) || math.IsInf(f64, 0): + likelihood = 0.2 + case fval == 0.0 || (math.Abs(f64) > 1e-6 && math.Abs(f64) < 1e6): + likelihood = 0.7 + default: + likelihood = 0.5 + } + reps = append(reps, Representation{ + Type: "float32", + Text: text, + Description: "Float32", + Likelihood: likelihood, + }) + + return reps +} + +func possibilitiesI64(payload []byte) []Representation { + if len(payload) != 8 { + return nil + } + val := binary.LittleEndian.Uint64(payload) + + reps := make([]Representation, 0, 3) + + // 1. Fixed64 (Hex) + reps = append(reps, Representation{ + Type: "fixed64", + Text: fmt.Sprintf("0x%016xi64", val), + Description: "Fixed64 (Hex)", + Likelihood: 0.9, + }) + + // 2. Fixed64 (Decimal) + reps = append(reps, Representation{ + Type: "fixed64", + Text: fmt.Sprintf("%di64", int64(val)), + Description: "Fixed64 (Decimal)", + Likelihood: 0.8, + }) + + // 3. 
Float64 + fValActual := math.Float64frombits(val) + text := fmt.Sprintf("%g", fValActual) + text = strings.Replace(text, "e+", "e", 1) + if !strings.Contains(text, ".") && !strings.Contains(text, "e") && !math.IsNaN(fValActual) && !math.IsInf(fValActual, 0) { + text = fmt.Sprintf("%.1f", fValActual) + } + var likelihood float64 + switch { + case math.IsNaN(fValActual) || math.IsInf(fValActual, 0): + likelihood = 0.2 + case fValActual == 0.0 || (math.Abs(fValActual) > 1e-6 && math.Abs(fValActual) < 1e6): + likelihood = 0.7 + default: + likelihood = 0.5 + } + reps = append(reps, Representation{ + Type: "float64", + Text: text, + Description: "Float64", + Likelihood: likelihood, + }) + + return reps +} + +func possibilitiesLen(payload []byte) []Representation { + var reps []Representation + + // 1. Fallback Hex Bytes (always valid) + reps = append(reps, Representation{ + Type: "bytes", + Text: fmt.Sprintf("{`%s`}", toHexSpace(payload)), + Description: "Bytes", + Likelihood: 0.1, + }) + + // 2. String + if utf8.Valid(payload) { + isMsg := isMessage(payload) + likelihood := 0.4 + if isPrintable(payload) { + if !isMsg { + likelihood = 0.9 + } else { + likelihood = 0.6 + } + } + reps = append(reps, Representation{ + Type: "string", + Text: fmt.Sprintf("{%q}", string(payload)), + Description: "String", + Likelihood: likelihood, + }) + } + + // 3. Message + if isStructMessage(payload) { + var buf bytes.Buffer + if err := Disassemble(payload, &buf); err == nil { + text := formatSingleLine(buf.String()) + likelihood := 0.6 + if isMessage(payload) { + likelihood = 0.9 + } + reps = append(reps, Representation{ + Type: "message", + Text: text, + Description: "Embedded Message", + Likelihood: likelihood, + }) + } + } + + // 4. 
Packed Varints + if len(payload) > 0 { + var ints []uint64 + off := 0 + ok := true + for off < len(payload) { + v, n := binary.Uvarint(payload[off:]) + if n <= 0 { + ok = false + break + } + off += n + ints = append(ints, v) + } + if ok && len(ints) > 0 { + var sb strings.Builder + sb.WriteString("[") + for _, val := range ints { + fmt.Fprintf(&sb, " %d", val) + } + sb.WriteString(" ]") + allSmall := true + for _, val := range ints { + if val > 1000 { + allSmall = false + break + } + } + likelihood := 0.3 + if allSmall { + likelihood = 0.5 + } + reps = append(reps, Representation{ + Type: "packed_varint", + Text: sb.String(), + Description: "Packed Varints", + Likelihood: likelihood, + }) + } + } + + // 5. Packed Fixed32 + if len(payload) > 0 && len(payload)%4 == 0 { + var sb strings.Builder + sb.WriteString("[") + for i := 0; i < len(payload); i += 4 { + v := binary.LittleEndian.Uint32(payload[i:]) + fmt.Fprintf(&sb, " 0x%08xi32", v) + } + sb.WriteString(" ]") + reps = append(reps, Representation{ + Type: "packed_fixed32", + Text: sb.String(), + Description: "Packed Fixed32", + Likelihood: 0.4, + }) + } + + // 6. 
Packed Fixed64 + if len(payload) > 0 && len(payload)%8 == 0 { + var sb strings.Builder + sb.WriteString("[") + for i := 0; i < len(payload); i += 8 { + v := binary.LittleEndian.Uint64(payload[i:]) + fmt.Fprintf(&sb, " 0x%016xi64", v) + } + sb.WriteString(" ]") + reps = append(reps, Representation{ + Type: "packed_fixed64", + Text: sb.String(), + Description: "Packed Fixed64", + Likelihood: 0.4, + }) + } + + return reps +} + +func isStructMessage(data []byte) bool { + ok, fields := checkMessageStructure(data) + return ok && fields > 0 +} + +func formatSingleLine(text string) string { + text = strings.TrimSpace(text) + lines := strings.Split(text, "\n") + var cleaned []string + for _, line := range lines { + line = strings.TrimSpace(line) + if line != "" { + cleaned = append(cleaned, line) + } + } + return "{ " + strings.Join(cleaned, " ") + " }" +} diff --git a/experimental/internal/protoscope/disassembler/possibilities_test.go b/experimental/internal/protoscope/disassembler/possibilities_test.go new file mode 100644 index 00000000..fa516d6e --- /dev/null +++ b/experimental/internal/protoscope/disassembler/possibilities_test.go @@ -0,0 +1,132 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package disassembler + +import ( + "reflect" + "testing" +) + +func TestPossibilitiesVarint(t *testing.T) { + // Value 150 (varint encoded is 0x96, 0x01) + payload := []byte{0x96, 0x01} + res := Possibilities(wireVarint, payload) + + expected := []Representation{ + {Type: "varint", Text: "150", Description: "Varint", Likelihood: 0.9}, + {Type: "zigzag", Text: "75", Description: "Zigzag Varint", Likelihood: 0.7}, + } + if !reflect.DeepEqual(res, expected) { + t.Errorf("expected %#v, got %#v", expected, res) + } + + // Value 1 (varint encoded is 0x01) + payload = []byte{0x01} + res = Possibilities(wireVarint, payload) + expected = []Representation{ + {Type: "varint", Text: "1", Description: "Varint", Likelihood: 0.9}, + {Type: "bool", Text: "true", Description: "Boolean", Likelihood: 0.8}, + {Type: "zigzag", Text: "-1", Description: "Zigzag Varint", Likelihood: 0.7}, + } + if !reflect.DeepEqual(res, expected) { + t.Errorf("expected %#v, got %#v", expected, res) + } +} + +func TestPossibilitiesI32(t *testing.T) { + // Little-endian float 1.0 (binary 0x3f800000) + payload := []byte{0x00, 0x00, 0x80, 0x3f} + res := Possibilities(wireI32, payload) + + foundFloat := false + for _, r := range res { + if r.Type == "float32" { + foundFloat = true + if r.Text != "1.0i32" && r.Text != "1i32" { + t.Errorf("unexpected float32 format: %s", r.Text) + } + } + } + if !foundFloat { + t.Errorf("float32 possibility not found in %#v", res) + } +} + +func TestPossibilitiesI64(t *testing.T) { + // Little-endian float64 1.0 (binary 0x3ff0000000000000) + payload := []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f} + res := Possibilities(wireI64, payload) + + foundFloat := false + for _, r := range res { + if r.Type == "float64" { + foundFloat = true + if r.Text != "1.0" && r.Text != "1" { + t.Errorf("unexpected float64 format: %s", r.Text) + } + } + } + if !foundFloat { + t.Errorf("float64 possibility not found in %#v", res) + } +} + +func TestPossibilitiesLen(t *testing.T) { + // 
String "hello" (binary 0x68, 0x65, 0x6c, 0x6c, 0x6f) + payload := []byte("hello") + res := Possibilities(wireLen, payload) + + var hasStr, hasBytes, hasPackedVarint bool + for _, r := range res { + switch r.Type { + case "string": + hasStr = true + if r.Text != `{"hello"}` { + t.Errorf("unexpected string text: %q", r.Text) + } + case "bytes": + hasBytes = true + if r.Text != "{`68 65 6c 6c 6f`}" { + t.Errorf("unexpected bytes text: %q", r.Text) + } + case "packed_varint": + hasPackedVarint = true + if r.Text != "[ 104 101 108 108 111 ]" { + t.Errorf("unexpected packed varints text: %q", r.Text) + } + } + } + + if !hasStr || !hasBytes || !hasPackedVarint { + t.Errorf("missing possibilities for 'hello': string=%v, bytes=%v, packed_varint=%v", hasStr, hasBytes, hasPackedVarint) + } + + // Message {1: 150} (binary 0x08, 0x96, 0x01) + payload = []byte{0x08, 0x96, 0x01} + res = Possibilities(wireLen, payload) + + var hasMsg bool + for _, r := range res { + if r.Type == "message" { + hasMsg = true + if r.Text != "{ 1: 150 }" { + t.Errorf("unexpected message text: %q", r.Text) + } + } + } + if !hasMsg { + t.Errorf("missing message possibility for {1: 150}") + } +} diff --git a/experimental/internal/protoscope/parser/fuzz_test.go b/experimental/internal/protoscope/parser/fuzz_test.go new file mode 100644 index 00000000..783743a7 --- /dev/null +++ b/experimental/internal/protoscope/parser/fuzz_test.go @@ -0,0 +1,36 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package parser + +import ( + "testing" + + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/source" +) + +func FuzzParse(f *testing.F) { + f.Add("1: 150") + f.Add("2: \"testing\"") + f.Add("3: [ 1 2 3 ]") + f.Add("4: !{ 1: 42 }") + f.Add("5:I32 100i32") + + f.Fuzz(func(_ *testing.T, input string) { + src := source.NewFile("fuzz.protoscope", input) + r := &report.Report{} + _, _ = Parse("fuzz.protoscope", src, r) + }) +} diff --git a/experimental/internal/protoscope/parser/parser.go b/experimental/internal/protoscope/parser/parser.go new file mode 100644 index 00000000..93045f48 --- /dev/null +++ b/experimental/internal/protoscope/parser/parser.go @@ -0,0 +1,201 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package parser + +import ( + "github.com/bufbuild/protocompile/experimental/internal/lexer" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/ast" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/seq" + "github.com/bufbuild/protocompile/experimental/source" + "github.com/bufbuild/protocompile/experimental/token" + "github.com/bufbuild/protocompile/experimental/token/keyword" + "github.com/bufbuild/protocompile/internal/ext/slicesx" +) + +var lex = lexer.Lexer{ + OnKeyword: func(k keyword.Keyword) lexer.OnKeyword { + switch k { + case keyword.Hash: + return lexer.LineComment + case keyword.LBracket, keyword.LBrace: + return lexer.BracketKeyword + case keyword.RParen, keyword.RBracket, keyword.RBrace: + return lexer.BracketKeyword + default: + return lexer.SoftKeyword + } + }, + + IsAffix: func(affix string, kind token.Kind, suffix bool) bool { + switch kind { + case token.Number: + res := suffix && slicesx.Among(affix, "z", "i32", "i64", "f32", "f64") + return res + default: + return false + } + }, + AllowBacktickStrings: true, +} + +// Parse lexes and parses a protoscope file. 
+func Parse(path string, source *source.File, r *report.Report) (file *ast.File, ok bool) { + prior := len(r.Diagnostics) + + file = ast.New(path, lex.Lex(source, r)) + p := &parser{ + file: file, + report: r, + } + p.parse(file.Decls(), file.Stream().Cursor()) + + ok = true + for _, d := range r.Diagnostics[prior:] { + if d.Level() >= report.Error { + ok = false + break + } + } + + return file, ok +} + +type parser struct { + file *ast.File + report *report.Report +} + +func (p *parser) parse(inserter seq.Inserter[ast.DeclAny], c *token.Cursor) { + for !c.Done() { + m := c.Mark() + node := p.parseDecl(c) + if !node.IsZero() { + seq.Append(inserter, node) + } else { + _ = c.Next() + } + ensureProgress(c, &m) + } +} + +func (p *parser) parseDecl(c *token.Cursor) ast.DeclAny { + tok := c.Peek() + if tok.IsZero() { + return ast.DeclAny{} + } + + // Group: !{ ... } + if tok.Keyword() == keyword.Bang { + clone := c.Clone() + _ = clone.Next() + next := clone.Peek() + if next.Keyword() == keyword.LBrace || next.Keyword() == keyword.Braces { + return p.parseGroup(c).AsAny() + } + } + + // Heuristic: if it's a number followed by a colon, it's a field. + if tok.Kind() == token.Number { + clone := c.Clone() + _ = clone.Next() + if clone.Peek().Keyword() == keyword.Colon { + return p.parseField(c).AsAny() + } + } + + // Otherwise, it's a literal or a block. 
+ if !tok.IsLeaf() { + return p.parseBlock(c).AsAny() + } + return p.parseLiteral(c).AsAny() +} + +func (p *parser) parseField(c *token.Cursor) ast.Field { + tag := c.Next() + _ = c.Next() // consume colon + + var wireType token.Token + // Optional wire type + if c.Peek().Kind() == token.Ident { + switch c.Peek().Text() { + case "VARINT", "I64", "LEN", "SGROUP", "EGROUP", "I32": + wireType = c.Next() + } + } + + value := p.parseDecl(c) + + return p.file.Nodes().NewField(ast.FieldArgs{ + Tag: tag, + WireType: wireType, + Value: value, + }) +} + +func (p *parser) parseLiteral(c *token.Cursor) ast.Literal { + return p.file.Nodes().NewLiteral(c.Next()) +} + +func (p *parser) parseBlock(c *token.Cursor) ast.Block { + tok := c.Next() + b := p.file.Nodes().NewBlock(tok) + + // Recurse into children + if children := tok.Children(); children != nil { + p.parse(b.Decls(), children) + } + + return b +} + +func (p *parser) parseGroup(c *token.Cursor) ast.Block { + bang := c.Next() + b := p.file.Nodes().NewBlock(bang) // Use bang as the anchor token for the group + + next := c.Next() + if children := next.Children(); children != nil { + p.parse(b.Decls(), children) + return b + } + + for !c.Done() && c.Peek().Keyword() != keyword.RBrace { + m := c.Mark() + node := p.parseDecl(c) + if !node.IsZero() { + seq.Append(b.Decls(), node) + } else { + _ = c.Next() + } + ensureProgress(c, &m) + } + + if !c.Done() && c.Peek().Keyword() == keyword.RBrace { + _ = c.Next() // consume } + } + + return b +} + +// ensureProgress is a helper to ensure that the parser is making progress. +// This is used to catch bugs in the parser that would cause it to loop +// forever. 
+func ensureProgress(c *token.Cursor, m *token.CursorMark) { + next := c.Mark() + if *m == next { + panic("protocompile/parser: parser failed to make progress; this is a bug in protocompile") + } + *m = next +} diff --git a/experimental/internal/protoscope/parser/parser_test.go b/experimental/internal/protoscope/parser/parser_test.go new file mode 100644 index 00000000..a5fcb2b8 --- /dev/null +++ b/experimental/internal/protoscope/parser/parser_test.go @@ -0,0 +1,113 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package parser + +import ( + "bytes" + "os" + "path/filepath" + "testing" + + "github.com/bufbuild/protocompile/experimental/id" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/ast" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/seq" + "github.com/bufbuild/protocompile/experimental/source" +) + +func TestParse(t *testing.T) { + files, err := filepath.Glob("../testdata/*.protoscope") + if err != nil { + t.Fatal(err) + } + if len(files) == 0 { + t.Fatal("no test files found in ../testdata") + } + + for _, file := range files { + name := filepath.Base(file) + t.Run(name, func(t *testing.T) { + content, err := os.ReadFile(file) + if err != nil { + t.Fatal(err) + } + + src := source.NewFile(name, string(content)) + r := &report.Report{} + parsed, ok := Parse(name, src, r) + if !ok { + t.Fatalf("Parse failed: %v", r.Diagnostics) + } + + for decl := range seq.Values(parsed.Decls()) { + span := decl.Span() + t.Logf("decl span: %v", span) + } + + if parsed.Decls().Len() == 0 { + t.Errorf("expected at least one declaration, got 0") + t.Logf("token stream: %v", parsed.Stream()) + c := parsed.Stream().Cursor() + for !c.Done() { + t.Logf("token: %v (%q)", c.Peek(), c.Peek().Text()) + _ = c.Next() + } + } + }) + } +} + +func TestSliceReallocation(t *testing.T) { + var buf bytes.Buffer + for i := 1; i <= 1000; i++ { + buf.WriteString("1: 42\n") + } + + src := source.NewFile("large.protoscope", buf.String()) + r := &report.Report{} + file, ok := Parse("large.protoscope", src, r) + if !ok { + t.Fatalf("Parse failed: %v", r.Diagnostics) + } + + count := 0 + for decl := range seq.Values(file.Decls()) { + count++ + field := id.Wrap(file, id.ID[ast.Field](decl.ID().Value())) + tag, _ := field.Tag().AsNumber().Int() + if tag != 1 { + t.Errorf("decl %d: expected tag 1, got %d", count, tag) + } + + val := field.Value() + lit := id.Wrap(file, id.ID[ast.Literal](val.ID().Value())) + num, _ := 
// AllTypes declares one field of every scalar kind plus nested, enum, packed
// repeated and map fields, so the fixtures exercise each wire type.
message AllTypes {
  double f_double = 1;
  float f_float = 2;
  int32 f_int32 = 3;
  int64 f_int64 = 4;
  uint32 f_uint32 = 5;
  uint64 f_uint64 = 6;
  sint32 f_sint32 = 7;
  sint64 f_sint64 = 8;
  fixed32 f_fixed32 = 9;
  fixed64 f_fixed64 = 10;
  sfixed32 f_sfixed32 = 11;
  sfixed64 f_sfixed64 = 12;
  bool f_bool = 13;
  string f_string = 14;
  bytes f_bytes = 15;
  NestedMessage f_nested = 16;
  NestedEnum f_enum = 17;
  repeated int32 f_repeated_int32 = 18; // packed
  // A map field must spell out its key and value types. Both are strings,
  // matching the `1: "key1"` / `2: "val1"` entries that all_wires.protoscope
  // writes for field 19.
  map<string, string> f_map = 19;

  message NestedMessage {
    int32 val = 1;
  }

  enum NestedEnum {
    ZERO = 0;
    ONE = 1;
  }
}
a/experimental/internal/protoscope/testdata/all_types_proto2.proto b/experimental/internal/protoscope/testdata/all_types_proto2.proto new file mode 100644 index 00000000..eebfb9d4 --- /dev/null +++ b/experimental/internal/protoscope/testdata/all_types_proto2.proto @@ -0,0 +1,12 @@ +syntax = "proto2"; + +package test; + +option go_package = "github.com/bufbuild/protocompile/experimental/internal/protoscope/testdata/alltypesproto2"; + +message AllTypesProto2 { + repeated group OptionalGroup = 1 { + optional int32 val = 2; + } + repeated int32 f_unpacked_int32 = 3 [packed = false]; +} diff --git a/experimental/internal/protoscope/testdata/all_wires.pb b/experimental/internal/protoscope/testdata/all_wires.pb new file mode 100644 index 0000000000000000000000000000000000000000..2d834b176ae4076e51e85fc1970431449ac24aae GIT binary patch literal 247 zcmd-q-#1@M|8AUv=-D%Kog}6)N?iL71&j()XT7)5&@$L~@#i&W3t@-z{J!Q43=Dxz zKmr6}Hv?5Bu7E1fU@X$}NX^N~*HH)!anrR>C`qj-QB-&~ed)6~YhJ9Hr=##;LT?9< zoIhzdh@90>#dB}%zV{6O8x9DWgjH-})Ced_P0Or6(yjnfpb%V|hAP~|$jtGay@Qc) z5+gIWnBpu(9xj&b)Jj7kma@bgLm1Zx%r#ofC}6aN5lGrBVO+uJwt_KW2crl8HW^|A literal 0 HcmV?d00001 diff --git a/experimental/internal/protoscope/testdata/all_wires.protoscope b/experimental/internal/protoscope/testdata/all_wires.protoscope new file mode 100644 index 00000000..eaf21300 --- /dev/null +++ b/experimental/internal/protoscope/testdata/all_wires.protoscope @@ -0,0 +1,84 @@ +# Wire Type 1 (I64): double +1: 0x405edd2f1a9fbe77i64 + +# Wire Type 5 (I32): float +2: 0x429dcccdi32 + +# Wire Type 0 (VARINT): int32 +3: 150 +3: 18446744073709551574 + +# Wire Type 0 (VARINT): int64 +4: 123456789 + +# Wire Type 0 (VARINT): uint32 +5: 42 + +# Wire Type 0 (VARINT): uint64 +6: 987654321 + +# Wire Type 0 (VARINT): sint32 (zigzag varint) +7: 19 + +# Wire Type 0 (VARINT): sint64 (zigzag varint) +8: 1999 + +# Wire Type 5 (I32): fixed32 +9: 0x00000037i32 + +# Wire Type 1 (I64): fixed64 +10: 0x0000000000000042i64 + +# Wire Type 5 (I32): sfixed32 +11: 
0xffffffb3i32 + +# Wire Type 1 (I64): sfixed64 +12: 0xffffffffffffffa8i64 + +# Wire Type 0 (VARINT): bool +13: true + +# Wire Type 2 (LEN): string - Pure UTF-8 text +14: "Hello, UTF-8 text! 日本語, 𐍈, 💻, 🚀" + +# Wire Type 2 (LEN): bytes - Pure random binary bytes +15: `de ad be ef 00 ff 80 c0 12 34 56 78` + +# Wire Type 2 (LEN): bytes - Mixed text and binary +22: "Prefix UTF-8 text \xde\xad\xbe\xef Suffix UTF-8 text" + +# Wire Type 2 (LEN): NestedMessage +16: [ + 1: 999 +] + +# Wire Type 0 (VARINT): NestedEnum +17: 1 + +# Wire Type 2 (LEN): repeated int32 (packed) +18: [ 11 22 33 ] + +# Wire Type 2 (LEN): map +19: [ + 1: "key1" + 2: "val1" +] +19: [ + 1: "key2" + 2: "val2" +] + +# Wire Type 3 / 4 (Group): OptionalGroup +20: !{ + 2: 50 +} +20: !{ + 2: 60 +} + +# Unpacked repeated int32 +21: 70 +21: 80 + +# Extra: Zigzag suffix test +23: 10z diff --git a/experimental/internal/protoscope/testdata/fixed.pb b/experimental/internal/protoscope/testdata/fixed.pb new file mode 100644 index 00000000..dd79adda --- /dev/null +++ b/experimental/internal/protoscope/testdata/fixed.pb @@ -0,0 +1 @@ + wfUD3"D3" \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/fixed.protoscope b/experimental/internal/protoscope/testdata/fixed.protoscope new file mode 100644 index 00000000..ce1f0838 --- /dev/null +++ b/experimental/internal/protoscope/testdata/fixed.protoscope @@ -0,0 +1,2 @@ +1:I64 0x1122334455667788i64 +2:I32 0x11223344i32 diff --git a/experimental/internal/protoscope/testdata/group.pb b/experimental/internal/protoscope/testdata/group.pb new file mode 100644 index 00000000..7d4829fa --- /dev/null +++ b/experimental/internal/protoscope/testdata/group.pb @@ -0,0 +1 @@ +  \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/group.protoscope b/experimental/internal/protoscope/testdata/group.protoscope new file mode 100644 index 00000000..3841b58c --- /dev/null +++ b/experimental/internal/protoscope/testdata/group.protoscope 
@@ -0,0 +1,3 @@ +1: !{ + 2: 3 +} diff --git a/experimental/internal/protoscope/testdata/nested.pb b/experimental/internal/protoscope/testdata/nested.pb new file mode 100644 index 00000000..7472446f --- /dev/null +++ b/experimental/internal/protoscope/testdata/nested.pb @@ -0,0 +1 @@ +"* \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/nested.protoscope b/experimental/internal/protoscope/testdata/nested.protoscope new file mode 100644 index 00000000..e1553a57 --- /dev/null +++ b/experimental/internal/protoscope/testdata/nested.protoscope @@ -0,0 +1,2 @@ +4: [ 1: 42 ] + diff --git a/experimental/internal/protoscope/testdata/normal.pb b/experimental/internal/protoscope/testdata/normal.pb new file mode 100644 index 00000000..381345bd --- /dev/null +++ b/experimental/internal/protoscope/testdata/normal.pb @@ -0,0 +1,2 @@ + +John Doejohn@example.comdoe@example.com \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/normal.protoscope b/experimental/internal/protoscope/testdata/normal.protoscope new file mode 100644 index 00000000..2a867ccb --- /dev/null +++ b/experimental/internal/protoscope/testdata/normal.protoscope @@ -0,0 +1,4 @@ +1: "John Doe" +2: 30 +3: "john@example.com" +3: "doe@example.com" diff --git a/experimental/internal/protoscope/testdata/packed.pb b/experimental/internal/protoscope/testdata/packed.pb new file mode 100644 index 00000000..f6aaef78 --- /dev/null +++ b/experimental/internal/protoscope/testdata/packed.pb @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/packed.protoscope b/experimental/internal/protoscope/testdata/packed.protoscope new file mode 100644 index 00000000..e525dee7 --- /dev/null +++ b/experimental/internal/protoscope/testdata/packed.protoscope @@ -0,0 +1 @@ +1: [ 1 2 3 ] diff --git a/experimental/internal/protoscope/testdata/recursive.protoscope b/experimental/internal/protoscope/testdata/recursive.protoscope new 
file mode 100644 index 00000000..a6be9368 --- /dev/null +++ b/experimental/internal/protoscope/testdata/recursive.protoscope @@ -0,0 +1,3 @@ +4: [ + 1: 42 +] diff --git a/experimental/internal/protoscope/testdata/simple.pb b/experimental/internal/protoscope/testdata/simple.pb new file mode 100644 index 00000000..9ee38217 --- /dev/null +++ b/experimental/internal/protoscope/testdata/simple.pb @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/experimental/internal/protoscope/testdata/simple.protoscope b/experimental/internal/protoscope/testdata/simple.protoscope new file mode 100644 index 00000000..9763b7a1 --- /dev/null +++ b/experimental/internal/protoscope/testdata/simple.protoscope @@ -0,0 +1 @@ +1: 150 diff --git a/experimental/parser/testdata/parser/option/cel_literals.proto.stderr.txt b/experimental/parser/testdata/parser/option/cel_literals.proto.stderr.txt index c18f5ca7..eea3f717 100644 --- a/experimental/parser/testdata/parser/option/cel_literals.proto.stderr.txt +++ b/experimental/parser/testdata/parser/option/cel_literals.proto.stderr.txt @@ -1,3 +1,19 @@ +error: unrecognized suffix for integer literal + --> testdata/parser/option/cel_literals.proto:5:13 + help: delete it + | + 5 | - option x = 0u; + 5 | + option x = 0; + | + +error: unrecognized suffix for integer literal + --> testdata/parser/option/cel_literals.proto:6:13 + help: delete it + | + 6 | - option x = 0U; + 6 | + option x = 0; + | + error: invalid digit in decimal integer literal --> testdata/parser/option/cel_literals.proto:7:14 | @@ -58,4 +74,4 @@ error: implicitly-concatenated string has incompatible prefix | | | must match this prefix -encountered 8 errors +encountered 10 errors diff --git a/experimental/protoscope/fuzz_test.go b/experimental/protoscope/fuzz_test.go new file mode 100644 index 00000000..72856aa0 --- /dev/null +++ b/experimental/protoscope/fuzz_test.go @@ -0,0 +1,74 @@ +// Copyright 2020-2026 Buf Technologies, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package protoscope_test + +import ( + "testing" + + "github.com/bufbuild/protocompile/experimental/protoscope" +) + +func FuzzAssemble(f *testing.F) { + f.Add("1: 150", 0) + f.Add("1: 150\n---\n2: \"hello\"", 3) + f.Add("# flags: 5\n1: 150\n", 1) + + f.Fuzz(func(_ *testing.T, text string, framingVal int) { + framings := []protoscope.Framing{ + protoscope.FramingNone, + protoscope.FramingGRPC, + protoscope.FramingConnect, + protoscope.FramingVarint, + } + framing := framings[uint(framingVal)%uint(len(framings))] + + // Assemble + binary, diags := protoscope.AssembleWithOptions("fuzz.protoscope", []byte(text), protoscope.AssembleOptions{Framing: framing}) + if len(diags) > 0 || len(binary) == 0 { + return + } + + // Round-trip disassemble + _, _ = protoscope.Disassemble(binary, protoscope.DisassembleOptions{Framing: framing}) + }) +} + +func FuzzDisassemble(f *testing.F) { + // Add some typical payloads + f.Add([]byte{0x08, 0x96, 0x01}, 0) + f.Add([]byte{0x00, 0x00, 0x00, 0x00, 0x03, 0x08, 0x96, 0x01}, 1) + f.Add([]byte{0x03, 0x08, 0x96, 0x01}, 3) + + f.Fuzz(func(_ *testing.T, binary []byte, framingVal int) { + framings := []protoscope.Framing{ + protoscope.FramingNone, + protoscope.FramingGRPC, + protoscope.FramingConnect, + protoscope.FramingVarint, + } + framing := framings[uint(framingVal)%uint(len(framings))] + + // Disassemble + text, err := protoscope.Disassemble(binary, protoscope.DisassembleOptions{ 
+ Framing: framing, + }) + if err != nil || len(text) == 0 { + return + } + + // Round-trip assemble + _, _ = protoscope.AssembleWithOptions("fuzz.protoscope", []byte(text), protoscope.AssembleOptions{Framing: framing}) + }) +} diff --git a/experimental/protoscope/protoscope.go b/experimental/protoscope/protoscope.go new file mode 100644 index 00000000..e01afb23 --- /dev/null +++ b/experimental/protoscope/protoscope.go @@ -0,0 +1,774 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package protoscope + +import ( + "encoding/binary" + "encoding/hex" + "fmt" + "strconv" + "strings" + "unicode" + "unicode/utf8" + + "github.com/bufbuild/protocompile/experimental/internal/protoscope/assembler" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/ast" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/disassembler" + "github.com/bufbuild/protocompile/experimental/internal/protoscope/parser" + "github.com/bufbuild/protocompile/experimental/report" + "github.com/bufbuild/protocompile/experimental/seq" + "github.com/bufbuild/protocompile/experimental/source" + "github.com/bufbuild/protocompile/experimental/source/length" + "github.com/bufbuild/protocompile/experimental/token" +) + +// Severity represents diagnostic severity levels. 
// Severity represents diagnostic severity levels.
type Severity int

// Severity levels in increasing order of importance.
const (
	SeverityInfo Severity = iota
	SeverityWarning
	SeverityError
)

// Position represents a 1-indexed line and column position.
type Position struct {
	Line, Column int
}

// Range represents a span between two positions.
type Range struct {
	Start, End Position
}

// Diagnostic represents a syntax or validation diagnostic.
type Diagnostic struct {
	Range   Range
	Message string
	Level   Severity
}

// Framing represents the message framing format.
type Framing int

const (
	// FramingNone represents no framing.
	FramingNone Framing = iota
	// FramingGRPC represents gRPC framing format.
	FramingGRPC
	// FramingConnect represents ConnectRPC framing format.
	FramingConnect
	// FramingVarint represents Varint delimited framing format.
	FramingVarint
)

// String returns the string representation of the framing: "none", "grpc",
// "connect" or "varint" for the declared constants, and "unknown(N)" for any
// other value N.
func (f Framing) String() string {
	names := [...]string{
		FramingNone:    "none",
		FramingGRPC:    "grpc",
		FramingConnect: "connect",
		FramingVarint:  "varint",
	}
	if f >= 0 && int(f) < len(names) {
		return names[f]
	}
	return fmt.Sprintf("unknown(%d)", f)
}

// ParseFraming parses a framing string to its enum value.
//
// Matching ignores letter case as well as any spaces and hyphens in s, so
// "Connect-RPC" and "varint delimited" are accepted. The empty string and
// "raw" both mean FramingNone. An unrecognised name returns FramingNone
// together with an error that quotes the original input.
func ParseFraming(s string) (Framing, error) {
	normalized := strings.ToLower(strings.NewReplacer(" ", "", "-", "").Replace(s))

	switch normalized {
	case "", "none", "raw":
		return FramingNone, nil
	case "grpc":
		return FramingGRPC, nil
	case "connect", "connectrpc":
		return FramingConnect, nil
	case "varint", "varintdelimited":
		return FramingVarint, nil
	}
	return FramingNone, fmt.Errorf("unknown framing: %q", s)
}

// DisassembleOptions matches the internal disassembler options.
type DisassembleOptions struct {
	ExplicitWireTypes      bool
	ExplicitLengthPrefixes bool
	NoGroups               bool
	MaxDepth               int
	Framing                Framing
}
+type AssembleOptions struct { + Framing Framing +} + +// Assemble parses and compiles protoscope text directly to protobuf wire binary. +func Assemble(path string, text []byte) ([]byte, []Diagnostic) { + return AssembleWithOptions(path, text, AssembleOptions{}) +} + +// AssembleWithOptions compiles protoscope text directly to protobuf wire binary with options. +func AssembleWithOptions(path string, text []byte, opts AssembleOptions) ([]byte, []Diagnostic) { + frames := splitFrames(text) + parentFile := source.NewFile(path, string(text)) + allDiags := make([]Diagnostic, 0, len(frames)) + var payloads [][]byte + var flags []byte + + if len(frames) > 1 && opts.Framing == FramingNone { + line := frames[1].lineOffset + allDiags = append(allDiags, Diagnostic{ + Range: Range{ + Start: Position{Line: line, Column: 1}, + End: Position{Line: line, Column: 4}, + }, + Message: "multiple frames are not supported for no framing", + Level: SeverityError, + }) + return nil, allDiags + } + + hasError := false + for _, frame := range frames { + // Extract flags comment from this frame if present. 
+ var frameFlags byte + frameLines := strings.Split(frame.text, "\n") + for _, line := range frameLines { + trimmed := strings.TrimSpace(line) + if trimmed == "" { + continue + } + if strings.HasPrefix(trimmed, "#") { + comment := strings.TrimSpace(strings.TrimPrefix(trimmed, "#")) + if strings.HasPrefix(comment, "flags:") { + valStr := strings.TrimSpace(strings.TrimPrefix(comment, "flags:")) + if val, err := strconv.ParseUint(valStr, 10, 8); err == nil { + frameFlags = byte(val) + } + } else if strings.HasPrefix(comment, "flag:") { + valStr := strings.TrimSpace(strings.TrimPrefix(comment, "flag:")) + if val, err := strconv.ParseUint(valStr, 10, 8); err == nil { + frameFlags = byte(val) + } + } + } else { + break + } + } + flags = append(flags, frameFlags) + + src := source.NewFile(path, frame.text) + r := &report.Report{} + file, ok := parser.Parse(path, src, r) + + report.ShiftReportSpans(r, parentFile, frame.byteOffset) + + diags := convertDiagnostics(r) + allDiags = append(allDiags, diags...) + if !ok || file == nil { + hasError = true + continue + } + + out := assembler.Assemble(file) + payloads = append(payloads, out) + } + + if hasError { + return nil, allDiags + } + + // Apply the framing + var result []byte + switch opts.Framing { + case FramingGRPC, FramingConnect: + for i, payload := range payloads { + header := make([]byte, 5) + header[0] = flags[i] + binary.BigEndian.PutUint32(header[1:5], uint32(len(payload))) + result = append(result, header...) + result = append(result, payload...) + } + case FramingVarint: + for _, payload := range payloads { + var lengthBuf [10]byte + n := binary.PutUvarint(lengthBuf[:], uint64(len(payload))) + result = append(result, lengthBuf[:n]...) + result = append(result, payload...) + } + default: + // raw / default: concatenate all payloads + for _, payload := range payloads { + result = append(result, payload...) 
+ } + } + + return result, allDiags +} + +// Disassemble converts protobuf wire binary back to protoscope text. +func Disassemble(data []byte, opts DisassembleOptions) (string, error) { + var buf strings.Builder + disOpts := disassembler.Options{ + ExplicitWireTypes: opts.ExplicitWireTypes, + ExplicitLengthPrefixes: opts.ExplicitLengthPrefixes, + NoGroups: opts.NoGroups, + MaxDepth: opts.MaxDepth, + } + + switch opts.Framing { + case FramingGRPC, FramingConnect: + off := 0 + first := true + for off < len(data) { + if off+5 > len(data) { + return "", fmt.Errorf("unexpected EOF reading header at offset %d", off) + } + flags := data[off] + length := binary.BigEndian.Uint32(data[off+1 : off+5]) + off += 5 + + if uint64(length) > uint64(len(data)-off) { + return "", fmt.Errorf("length %d out of bounds at offset %d", length, off) + } + + payload := data[off : off+int(length)] + off += int(length) + + if !first { + buf.WriteString("---\n") + } + first = false + + if flags != 0 { + fmt.Fprintf(&buf, "# flags: %d\n", flags) + } + + err := disassembler.DisassembleWithOptions(payload, &buf, disOpts) + if err != nil { + return "", err + } + } + return buf.String(), nil + + case FramingVarint: + off := 0 + first := true + for off < len(data) { + l, n := binary.Uvarint(data[off:]) + if n <= 0 { + return "", fmt.Errorf("invalid varint length prefix at offset %d", off) + } + off += n + + if l > uint64(len(data)-off) { + return "", fmt.Errorf("length %d out of bounds at offset %d", l, off) + } + + payload := data[off : off+int(l)] + off += int(l) + + if !first { + buf.WriteString("---\n") + } + first = false + + err := disassembler.DisassembleWithOptions(payload, &buf, disOpts) + if err != nil { + return "", err + } + } + return buf.String(), nil + + default: + err := disassembler.DisassembleWithOptions(data, &buf, disOpts) + if err != nil { + return "", err + } + return buf.String(), nil + } +} + +// Diagnostics parses the text and returns any syntactic or structural diagnostics. 
+func Diagnostics(path string, text []byte) []Diagnostic { + frames := splitFrames(text) + parentFile := source.NewFile(path, string(text)) + allDiags := make([]Diagnostic, 0, len(frames)) + for _, frame := range frames { + src := source.NewFile(path, frame.text) + r := &report.Report{} + _, _ = parser.Parse(path, src, r) + report.ShiftReportSpans(r, parentFile, frame.byteOffset) + allDiags = append(allDiags, convertDiagnostics(r)...) + } + return allDiags +} + +// DocumentSymbol represents a simplified symbol hierarchy (e.g. fields, groups, blocks). +type DocumentSymbol struct { + Name string + Detail string + Kind string // e.g., "field", "group", "block", "literal" + Range Range + Children []DocumentSymbol +} + +// DocumentSymbols returns a hierarchy of symbols within the protoscope file. +func DocumentSymbols(path string, text []byte) ([]DocumentSymbol, []Diagnostic) { + frames := splitFrames(text) + parentFile := source.NewFile(path, string(text)) + allSymbols := make([]DocumentSymbol, 0, len(frames)) + allDiags := make([]Diagnostic, 0, len(frames)) + + for _, frame := range frames { + src := source.NewFile(path, frame.text) + r := &report.Report{} + file, ok := parser.Parse(path, src, r) + report.ShiftReportSpans(r, parentFile, frame.byteOffset) + diags := convertDiagnostics(r) + allDiags = append(allDiags, diags...) + + if ok && file != nil { + var symbols []DocumentSymbol + for decl := range seq.Values(file.Decls()) { + symbols = append(symbols, collectSymbols(decl)...) + } + if frame.lineOffset > 0 { + for i := range symbols { + shiftSymbolRange(&symbols[i], frame.lineOffset) + } + } + allSymbols = append(allSymbols, symbols...) 
+ } + } + return allSymbols, allDiags +} + +type InspectKind int + +const ( + InspectKindField InspectKind = iota + 1 + InspectKindLiteral + InspectKindBlock +) + +type FieldInspectInfo struct { + Tag string + WireType string +} + +type LiteralInspectInfo struct { + RawText string + Type string // "Number" or "String" + Suffix string + VarintBytes string + DecodedText string + IntValue uint64 + FloatValue float64 + Zigzag uint64 + HexLength int + ByteLength int + CharLength int + HasInt bool + HasFloat bool + IsHexHexQuote bool // true if backtick quote `...` +} + +type BlockInspectInfo struct { + Name string // "{" or "!{" +} + +// InspectInfo holds structured information to display on inspect (hover). +type InspectInfo struct { + Range Range + Kind InspectKind + Field *FieldInspectInfo + Literal *LiteralInspectInfo + Block *BlockInspectInfo +} + +// Inspect returns structured documentation for the token/node at the given line/column. +func Inspect(path string, text []byte, line, col int) (*InspectInfo, error) { + frames := splitFrames(text) + + var targetFrame *frameInfo + for i := len(frames) - 1; i >= 0; i-- { + if line > frames[i].lineOffset { + targetFrame = &frames[i] + break + } + } + if targetFrame == nil { + return nil, nil + } + + localLine := line - targetFrame.lineOffset + src := source.NewFile(path, targetFrame.text) + r := &report.Report{} + file, _ := parser.Parse(path, src, r) + if file == nil { + return nil, nil + } + + loc := src.InverseLocation(localLine, col, length.UTF16) + offset := loc.Offset + + node := findNode(file, offset) + if node.IsZero() { + return nil, nil + } + + inspectRange := convertSpan(node.Span()) + inspectRange.Start.Line += targetFrame.lineOffset + inspectRange.End.Line += targetFrame.lineOffset + + inspect := &InspectInfo{ + Range: inspectRange, + } + + switch node.Kind() { + case ast.DeclKindField: + f := node.AsField() + inspect.Kind = InspectKindField + inspect.Field = &FieldInspectInfo{ + Tag: f.Tag().Text(), + } + if 
wt := f.WireType(); !wt.IsZero() && wt.Text() != "" { + inspect.Field.WireType = wt.Text() + } + + case ast.DeclKindLiteral: + l := node.AsLiteral() + tok := l.Token() + inspect.Kind = InspectKindLiteral + inspect.Literal = &LiteralInspectInfo{ + RawText: tok.Text(), + } + + if tok.Kind() == token.Number { + inspect.Literal.Type = "Number" + num := tok.AsNumber() + inspect.Literal.Suffix = num.Suffix().Text() + if v, exact := num.Int(); exact { + inspect.Literal.HasInt = true + inspect.Literal.IntValue = v + inspect.Literal.VarintBytes = varintBytes(v) + + // Interpret as signed 64-bit to show zigzag encoding if applicable + sval := int64(v) + inspect.Literal.Zigzag = uint64((sval << 1) ^ (sval >> 63)) + } else if fval, exactf := num.Float(); exactf { + inspect.Literal.HasFloat = true + inspect.Literal.FloatValue = fval + } + } else if tok.Kind() == token.String { + inspect.Literal.Type = "String" + sToken := tok.AsString() + open, _ := sToken.Quotes() + if open.Text() == "`" { + inspect.Literal.IsHexHexQuote = true + // Hex string literal + decoded, err := hexDecode(tok.Text()) + if err == nil { + inspect.Literal.HexLength = len(decoded) + if isPrintable(decoded) { + inspect.Literal.DecodedText = string(decoded) + } + } + } else { + // Standard string literal + strVal := sToken.Text() + inspect.Literal.ByteLength = len(strVal) + inspect.Literal.CharLength = utf8.RuneCountInString(strVal) + } + } + + case ast.DeclKindBlock: + b := node.AsBlock() + inspect.Kind = InspectKindBlock + inspect.Block = &BlockInspectInfo{ + Name: b.Token().Text(), + } + } + + return inspect, nil +} + +func collectSymbols(decl ast.DeclAny) []DocumentSymbol { + if decl.IsZero() { + return nil + } + switch decl.Kind() { + case ast.DeclKindField: + f := decl.AsField() + tagText := f.Tag().Text() + + var children []DocumentSymbol + val := f.Value() + if !val.IsZero() { + children = collectSymbols(val) + } + + detail := "" + if wt := f.WireType(); !wt.IsZero() && wt.Text() != "" { + detail = 
":" + wt.Text() + } + + return []DocumentSymbol{{ + Name: tagText + ":", + Detail: detail, + Kind: "field", + Range: convertSpan(f.Span()), + Children: children, + }} + + case ast.DeclKindLiteral: + l := decl.AsLiteral() + return []DocumentSymbol{{ + Name: l.Token().Text(), + Kind: "literal", + Range: convertSpan(l.Span()), + }} + + case ast.DeclKindBlock: + b := decl.AsBlock() + var children []DocumentSymbol + for child := range seq.Values(b.Decls()) { + children = append(children, collectSymbols(child)...) + } + + name := b.Token().Text() + detail := "" + switch name { + case "!{": + name = "Group" + detail = "!{}" + case "{": + name = "Length-Prefixed" + detail = "{}" + } + + return []DocumentSymbol{{ + Name: name, + Detail: detail, + Kind: "block", + Range: convertSpan(b.Span()), + Children: children, + }} + } + return nil +} + +func convertSpan(span source.Span) Range { + if span.IsZero() { + return Range{} + } + startLoc := span.StartLoc() + endLoc := span.EndLoc() + return Range{ + Start: Position{Line: startLoc.Line, Column: startLoc.Column}, + End: Position{Line: endLoc.Line, Column: endLoc.Column}, + } +} + +func varintBytes(v uint64) string { + var buf []string + for v >= 0x80 { + buf = append(buf, fmt.Sprintf("%02X", byte(v|0x80))) + v >>= 7 + } + buf = append(buf, fmt.Sprintf("%02X", byte(v))) + return strings.Join(buf, " ") +} + +func isPrintable(data []byte) bool { + if !utf8.Valid(data) { + return false + } + for len(data) > 0 { + r, size := utf8.DecodeRune(data) + if r == utf8.RuneError { + return false + } + if !unicode.IsPrint(r) && !unicode.IsSpace(r) { + return false + } + data = data[size:] + } + return true +} + +func hexDecode(text string) ([]byte, error) { + // Strip backticks + text = strings.Trim(text, "`") + // Remove whitespace + var cleaned strings.Builder + for _, r := range text { + if !unicode.IsSpace(r) { + cleaned.WriteRune(r) + } + } + return hex.DecodeString(cleaned.String()) +} + +func findNode(file *ast.File, offset int) 
ast.DeclAny { + var best ast.DeclAny + var search func(decl ast.DeclAny) + search = func(decl ast.DeclAny) { + if decl.IsZero() { + return + } + span := decl.Span() + if span.IsZero() { + return + } + if offset >= span.Start && offset <= span.End { + best = decl + if decl.Kind() == ast.DeclKindBlock { + for child := range seq.Values(decl.AsBlock().Decls()) { + search(child) + } + } else if decl.Kind() == ast.DeclKindField { + search(decl.AsField().Value()) + } + } + } + for decl := range seq.Values(file.Decls()) { + search(decl) + } + return best +} + +func convertDiagnostics(r *report.Report) []Diagnostic { + diagnostics := make([]Diagnostic, 0, len(r.Diagnostics)) + for _, diag := range r.Diagnostics { + severity := SeverityError + switch diag.Level() { + case report.Warning: + severity = SeverityWarning + case report.Remark: + severity = SeverityInfo + } + + span := diag.Primary() + var rangeVal Range + if !span.IsZero() { + startLoc := span.StartLoc() + endLoc := span.EndLoc() + rangeVal = Range{ + Start: Position{Line: startLoc.Line, Column: startLoc.Column}, + End: Position{Line: endLoc.Line, Column: endLoc.Column}, + } + } + + diagnostics = append(diagnostics, Diagnostic{ + Range: rangeVal, + Message: diag.Message(), + Level: severity, + }) + } + return diagnostics +} + +// Representation represents a possible translation/formatting of a protobuf value. 
+type Representation struct { + Type string // E.g., "message", "string", "bytes", "varint", "zigzag", "bool", "fixed32", "float32", "fixed64", "float64", "packed_varint", "packed_fixed32", "packed_fixed64" + Text string // The protoscope textual value representation + Description string // Human-readable description + Likelihood float64 // Likelihood score, between 0.0 and 1.0 (higher is more likely) +} + +func mapRepresentations(internalReps []disassembler.Representation) []Representation { + if internalReps == nil { + return nil + } + reps := make([]Representation, len(internalReps)) + for i, r := range internalReps { + reps[i] = Representation{ + Type: r.Type, + Text: r.Text, + Description: r.Description, + Likelihood: r.Likelihood, + } + } + return reps +} + +// Possibilities analyzes the raw payload bytes for a given wire type and returns +// all valid alternative representations sorted by likelihood. +func Possibilities(wireType int, payload []byte) []Representation { + return mapRepresentations(disassembler.Possibilities(wireType, payload)) +} + +type frameInfo struct { + text string + byteOffset int + lineOffset int +} + +func splitFrames(text []byte) []frameInfo { + var frames []frameInfo + s := string(text) + lines := strings.Split(s, "\n") + var currentFrame strings.Builder + frameLineOffset := 0 + frameByteOffset := 0 + currentByteOffset := 0 + + for i, line := range lines { + lineLen := len(line) + if i < len(lines)-1 { + lineLen++ // add 1 for '\n' + } + + trimmed := strings.TrimSpace(line) + if trimmed == "---" { + frames = append(frames, frameInfo{ + text: currentFrame.String(), + byteOffset: frameByteOffset, + lineOffset: frameLineOffset, + }) + currentFrame.Reset() + frameLineOffset = i + 1 + frameByteOffset = currentByteOffset + lineLen + } else { + currentFrame.WriteString(line) + if i < len(lines)-1 { + currentFrame.WriteByte('\n') + } + } + currentByteOffset += lineLen + } + frames = append(frames, frameInfo{ + text: currentFrame.String(), + 
byteOffset: frameByteOffset, + lineOffset: frameLineOffset, + }) + return frames +} + +func shiftSymbolRange(s *DocumentSymbol, lineOffset int) { + s.Range.Start.Line += lineOffset + s.Range.End.Line += lineOffset + for i := range s.Children { + shiftSymbolRange(&s.Children[i], lineOffset) + } +} diff --git a/experimental/protoscope/protoscope_test.go b/experimental/protoscope/protoscope_test.go new file mode 100644 index 00000000..26eb41a0 --- /dev/null +++ b/experimental/protoscope/protoscope_test.go @@ -0,0 +1,446 @@ +// Copyright 2020-2026 Buf Technologies, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package protoscope + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAssembleAndDisassemble(t *testing.T) { + input := `1: 150 +2: { + 1: "hello" +} +` + binary, diags := Assemble("test.protoscope", []byte(input)) + require.Empty(t, diags) + require.NotEmpty(t, binary) + + // Verify disassembled output matches input + text, err := Disassemble(binary, DisassembleOptions{}) + require.NoError(t, err) + assert.Contains(t, text, "1: 150") + assert.Contains(t, text, `2: {`+"`"+`0a 05 68 65 6c 6c 6f`+"`"+`}`) + + // Test MaxDepth + nestedInput := `1: { + 2: { + 3: 150 + } +} +` + nestedBinary, nestedDiags := Assemble("nested.protoscope", []byte(nestedInput)) + require.Empty(t, nestedDiags) + require.NotEmpty(t, nestedBinary) + + // Disassembling with default options should work + _, err = Disassemble(nestedBinary, DisassembleOptions{}) + require.NoError(t, err) + + // Disassembling with MaxDepth = 1 should fail + _, err = Disassemble(nestedBinary, DisassembleOptions{MaxDepth: 1}) + require.Error(t, err) + assert.Equal(t, "max depth exceeded", err.Error()) +} + +func TestDiagnostics(t *testing.T) { + // Syntactically invalid input + invalidInput := `1: +2: { +` + diags := Diagnostics("invalid.protoscope", []byte(invalidInput)) + assert.NotEmpty(t, diags) + + var hasError bool + for _, diag := range diags { + if diag.Level == SeverityError { + hasError = true + } + assert.NotEmpty(t, diag.Message) + assert.Positive(t, diag.Range.Start.Line) + } + assert.True(t, hasError) +} + +func TestDocumentSymbols(t *testing.T) { + input := `1: 150 +2: { + 3: "hello" +} +` + symbols, diags := DocumentSymbols("test.protoscope", []byte(input)) + require.Empty(t, diags) + require.Len(t, symbols, 2) + + // First symbol: Field 1 + assert.Equal(t, "1:", symbols[0].Name) + assert.Equal(t, "field", symbols[0].Kind) + + // Second symbol: Field 2 containing Block + assert.Equal(t, "2:", symbols[1].Name) + assert.Equal(t, 
"field", symbols[1].Kind) + require.Len(t, symbols[1].Children, 1) + + // Block symbol + block := symbols[1].Children[0] + assert.Equal(t, "Length-Prefixed", block.Name) + assert.Equal(t, "block", block.Kind) + require.Len(t, block.Children, 1) + + // Inside Block: Field 3 + field3 := block.Children[0] + assert.Equal(t, "3:", field3.Name) + assert.Equal(t, "field", field3.Kind) +} + +func TestInspect(t *testing.T) { + input := `1: 150 +2: { + 3: ` + "`" + `01 02 03` + "`" + ` +} +` + // Test hover over "1:" (line 1, column 1) + h1, err := Inspect("test.protoscope", []byte(input), 1, 1) + require.NoError(t, err) + require.NotNil(t, h1) + assert.Equal(t, InspectKindField, h1.Kind) + require.NotNil(t, h1.Field) + assert.Equal(t, "1", h1.Field.Tag) + + // Test hover over "150" (line 1, column 4) + h2, err := Inspect("test.protoscope", []byte(input), 1, 4) + require.NoError(t, err) + require.NotNil(t, h2) + assert.Equal(t, InspectKindLiteral, h2.Kind) + require.NotNil(t, h2.Literal) + assert.Equal(t, "Number", h2.Literal.Type) + assert.Equal(t, "150", h2.Literal.RawText) + assert.True(t, h2.Literal.HasInt) + assert.Equal(t, uint64(150), h2.Literal.IntValue) + + // Test hover over Hex string literal (line 3, column 6) + h3, err := Inspect("test.protoscope", []byte(input), 3, 6) + require.NoError(t, err) + require.NotNil(t, h3) + assert.Equal(t, InspectKindLiteral, h3.Kind) + require.NotNil(t, h3.Literal) + assert.Equal(t, "String", h3.Literal.Type) + assert.True(t, h3.Literal.IsHexHexQuote) + assert.Equal(t, 3, h3.Literal.HexLength) + + // Test hover over Hex string literal with UTF-8 + utf8Input := "1: {`e6 97 a5 e6 9c ac e8 aa 9e`}\n" + h4, err := Inspect("utf8.protoscope", []byte(utf8Input), 1, 5) + require.NoError(t, err) + require.NotNil(t, h4) + assert.Equal(t, InspectKindLiteral, h4.Kind) + require.NotNil(t, h4.Literal) + assert.True(t, h4.Literal.IsHexHexQuote) + assert.Equal(t, "\u65e5\u672c\u8a9e", h4.Literal.DecodedText) + + // Test hover over standard string 
literal with multi-byte runes + stdStringInput := "1: \"Hello, UTF-8 text! \u65e5\u672c\u8a9e, \U00010348, \U0001f4bb, \U0001f680\"" + h5, err := Inspect("std.protoscope", []byte(stdStringInput), 1, 5) + require.NoError(t, err) + require.NotNil(t, h5) + assert.Equal(t, InspectKindLiteral, h5.Kind) + require.NotNil(t, h5.Literal) + assert.Equal(t, "String", h5.Literal.Type) + assert.False(t, h5.Literal.IsHexHexQuote) + assert.Equal(t, 46, h5.Literal.ByteLength) + assert.Equal(t, 31, h5.Literal.CharLength) +} + +func TestPossibilities(t *testing.T) { + // wireVarint = 0, payload = [0x96, 0x01] (varint for 150) + reps := Possibilities(0, []byte{0x96, 0x01}) + require.NotEmpty(t, reps) + + var foundVarint bool + for _, r := range reps { + if r.Type == "varint" { + foundVarint = true + assert.Equal(t, "150", r.Text) + assert.Equal(t, "Varint", r.Description) + } + } + assert.True(t, foundVarint, "Should have found varint representation") +} + +func TestMultiFrameAndVariants(t *testing.T) { + // 1. No framing with single frame + rawInput := `1: 150 +2: "hello" +` + binary, diags := AssembleWithOptions("raw.protoscope", []byte(rawInput), AssembleOptions{Framing: FramingNone}) + require.Empty(t, diags) + // Output should be concatenated binary: + // 1: 150 -> 08 96 01 + // 2: "hello" -> 12 05 68 65 6c 6c 6f + expectedRaw := []byte{0x08, 0x96, 0x01, 0x12, 0x05, 0x68, 0x65, 0x6c, 0x6c, 0x6f} + assert.Equal(t, expectedRaw, binary) + + // Since raw has no headers, disassemble raw treats the whole stream as 1 message. + disText, err := Disassemble(binary, DisassembleOptions{Framing: FramingNone}) + require.NoError(t, err) + assert.Contains(t, disText, "1: 150") + assert.Contains(t, disText, `2: {"hello"}`) // disassembled as tag 2 since it was concatenated + + // 2. 
Varint delimited framing with multiple frames + varintInput := `1: 150 +--- +2: "hello" +` + binaryV, diagsV := AssembleWithOptions("varint.protoscope", []byte(varintInput), AssembleOptions{Framing: FramingVarint}) + require.Empty(t, diagsV) + // Frame 1: len 3, Frame 2: len 7 + expectedV := []byte{ + 3, 0x08, 0x96, 0x01, + 7, 0x12, 0x05, 0x68, 0x65, 0x6c, 0x6c, 0x6f, + } + assert.Equal(t, expectedV, binaryV) + + disTextV, err := Disassemble(binaryV, DisassembleOptions{Framing: FramingVarint}) + require.NoError(t, err) + assert.Equal(t, "1: 150\n---\n2: {\"hello\"}\n", disTextV) + + // 3. gRPC / ConnectRPC framing with custom flags + grpcInput := `# flags: 1 +1: 150 +--- +# flag: 2 +2: "hello" +` + binaryG, diagsG := AssembleWithOptions("grpc.protoscope", []byte(grpcInput), AssembleOptions{Framing: FramingGRPC}) + require.Empty(t, diagsG) + // Frame 1: flags 1, len 3 -> 01, 00 00 00 03, 08 96 01 + // Frame 2: flags 2, len 7 -> 02, 00 00 00 07, 12 05 68 65 6c 6c 6f + expectedG := []byte{ + 1, 0x00, 0x00, 0x00, 0x03, 0x08, 0x96, 0x01, + 2, 0x00, 0x00, 0x00, 0x07, 0x12, 0x05, 0x68, 0x65, 0x6c, 0x6c, 0x6f, + } + assert.Equal(t, expectedG, binaryG) + + disTextG, err := Disassemble(binaryG, DisassembleOptions{Framing: FramingGRPC}) + require.NoError(t, err) + assert.Equal(t, "# flags: 1\n1: 150\n---\n# flags: 2\n2: {\"hello\"}\n", disTextG) + + // 4. Test diagnostics shifting across multiple frames + invalidInput := `1: 150 +--- +2: { +# syntax error in second frame +` + diagsErr := Diagnostics("test.protoscope", []byte(invalidInput)) + require.NotEmpty(t, diagsErr) + // The error should be in the second frame (after line 2) + assert.Greater(t, diagsErr[0].Range.Start.Line, 2) + + // 5. 
Test DocumentSymbols and Hover on multi-frame inputs + symbolInput := `1: 150 +--- +2: 30 +` + symbols, diagsSym := DocumentSymbols("symbols.protoscope", []byte(symbolInput)) + require.Empty(t, diagsSym) + require.Len(t, symbols, 2) + // Symbol 1 starts on line 1 + assert.Equal(t, 1, symbols[0].Range.Start.Line) + // Symbol 2 starts on line 3 (after ---) + assert.Equal(t, 3, symbols[1].Range.Start.Line) + + // Hover test + hover1, err := Inspect("symbols.protoscope", []byte(symbolInput), 1, 1) + require.NoError(t, err) + require.NotNil(t, hover1) + assert.Equal(t, 1, hover1.Range.Start.Line) + + hover2, err := Inspect("symbols.protoscope", []byte(symbolInput), 3, 1) + require.NoError(t, err) + require.NotNil(t, hover2) + assert.Equal(t, 3, hover2.Range.Start.Line) + + // 6. Test multi-frame raw framing error (no framing) + multiRawInput := "1: 150\n---\n2: \"hello\"\n" + _, rawDiags := AssembleWithOptions("raw.protoscope", []byte(multiRawInput), AssembleOptions{Framing: FramingNone}) + require.NotEmpty(t, rawDiags) + assert.Equal(t, "multiple frames are not supported for no framing", rawDiags[0].Message) + assert.Equal(t, 2, rawDiags[0].Range.Start.Line) +} + +func TestAllVariantsRoundtrip(t *testing.T) { + t.Parallel() + + input := `1: 150 +--- +2: {"hello"} +` + + framings := []Framing{ + FramingGRPC, + FramingConnect, + FramingVarint, + } + + for _, framing := range framings { + t.Run(framing.String(), func(t *testing.T) { + t.Parallel() + // Assemble + binary, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: framing}) + require.Empty(t, diags) + require.NotEmpty(t, binary) + + // Disassemble + disassembled, err := Disassemble(binary, DisassembleOptions{Framing: framing}) + require.NoError(t, err) + + // The output should contain our fields and be properly split by --- + assert.Contains(t, disassembled, "1: 150") + assert.Contains(t, disassembled, "---") + assert.Contains(t, disassembled, `2: {"hello"}`) + }) + } +} + +func 
TestDisassembleFallback(t *testing.T) { + t.Parallel() + + // gRPC message `1: 55` has bytes: + // 00 00 00 00 02 08 37 + grpcBytes := []byte{0x00, 0x00, 0x00, 0x00, 0x02, 0x08, 0x37} + + // Disassembling without framing should trigger invalid tag 0 error fallback comment + disassembled, err := Disassemble(grpcBytes, DisassembleOptions{}) + require.NoError(t, err) + assert.Contains(t, disassembled, "# Error: invalid tag 0; this might be using a different framing (e.g. gRPC)") + assert.Contains(t, disassembled, "`00 00 00 00 02 08 37`") +} + +func TestParseFraming(t *testing.T) { + t.Parallel() + + tests := []struct { + input string + expected Framing + hasErr bool + }{ + {input: "none", expected: FramingNone}, + {input: "raw", expected: FramingNone}, + {input: "", expected: FramingNone}, + {input: "grpc", expected: FramingGRPC}, + {input: "GRPC", expected: FramingGRPC}, + {input: "connect", expected: FramingConnect}, + {input: "connectrpc", expected: FramingConnect}, + {input: "Connect-RPC", expected: FramingConnect}, + {input: "varint", expected: FramingVarint}, + {input: "varintdelimited", expected: FramingVarint}, + {input: "varint-delimited", expected: FramingVarint}, + {input: "varint delimited", expected: FramingVarint}, + {input: "invalid", hasErr: true}, + } + + for _, tc := range tests { + t.Run(tc.input, func(t *testing.T) { + actual, err := ParseFraming(tc.input) + if tc.hasErr { + assert.Error(t, err) + } else { + assert.NoError(t, err) + assert.Equal(t, tc.expected, actual) + } + }) + } +} + +func TestFramingString(t *testing.T) { + t.Parallel() + + assert.Equal(t, "none", FramingNone.String()) + assert.Equal(t, "grpc", FramingGRPC.String()) + assert.Equal(t, "connect", FramingConnect.String()) + assert.Equal(t, "varint", FramingVarint.String()) + assert.Equal(t, "unknown(-1)", Framing(-1).String()) +} + +func TestFramingDeepValidation(t *testing.T) { + t.Parallel() + + t.Run("exact bytes mapping", func(t *testing.T) { + t.Parallel() + input := "1: 
150" + + // 1. None (raw wire format) + binNone, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: FramingNone}) + require.Empty(t, diags) + assert.Equal(t, []byte{0x08, 0x96, 0x01}, binNone) + + // 2. gRPC + binGRPC, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: FramingGRPC}) + require.Empty(t, diags) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00, 0x03, 0x08, 0x96, 0x01}, binGRPC) + + // 3. Connect + binConnect, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: FramingConnect}) + require.Empty(t, diags) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00, 0x03, 0x08, 0x96, 0x01}, binConnect) + + // 4. Varint delimited + binVarint, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: FramingVarint}) + require.Empty(t, diags) + assert.Equal(t, []byte{0x03, 0x08, 0x96, 0x01}, binVarint) + }) + + t.Run("flags parsing and roundtrip", func(t *testing.T) { + t.Parallel() + input := "# flags: 5\n1: 150" + + bin, diags := AssembleWithOptions("test.protoscope", []byte(input), AssembleOptions{Framing: FramingGRPC}) + require.Empty(t, diags) + // gRPC header should have flag byte = 5 + assert.Equal(t, []byte{0x05, 0x00, 0x00, 0x00, 0x03, 0x08, 0x96, 0x01}, bin) + + text, err := Disassemble(bin, DisassembleOptions{Framing: FramingGRPC}) + require.NoError(t, err) + assert.Contains(t, text, "# flags: 5") + assert.Contains(t, text, "1: 150") + }) + + t.Run("error handling on truncated/corrupted inputs", func(t *testing.T) { + t.Parallel() + + // 1. Truncated gRPC header + _, err := Disassemble([]byte{0x00, 0x00}, DisassembleOptions{Framing: FramingGRPC}) + require.Error(t, err) + assert.Contains(t, err.Error(), "unexpected EOF reading header") + + // 2. 
Truncated gRPC payload + _, err = Disassemble([]byte{0x00, 0x00, 0x00, 0x00, 0x05, 0x08, 0x96}, DisassembleOptions{Framing: FramingGRPC}) + require.Error(t, err) + assert.Contains(t, err.Error(), "length 5 out of bounds") + + // 3. Truncated/invalid Varint header + _, err = Disassemble([]byte{0x80}, DisassembleOptions{Framing: FramingVarint}) + require.Error(t, err) + assert.Contains(t, err.Error(), "invalid varint length prefix") + + // 4. Truncated Varint payload + _, err = Disassemble([]byte{0x05, 0x08, 0x96}, DisassembleOptions{Framing: FramingVarint}) + require.Error(t, err) + assert.Contains(t, err.Error(), "length 5 out of bounds") + }) +} diff --git a/experimental/protoscope/testdata/fuzz/FuzzDisassemble/5955458d53084c9e b/experimental/protoscope/testdata/fuzz/FuzzDisassemble/5955458d53084c9e new file mode 100644 index 00000000..7934adac --- /dev/null +++ b/experimental/protoscope/testdata/fuzz/FuzzDisassemble/5955458d53084c9e @@ -0,0 +1,3 @@ +go test fuzz v1 +[]byte("\xdf\xdf\xdf\xdf\xdfߒ\x92\xdf\x01") +int(63) diff --git a/experimental/report/diagnostic.go b/experimental/report/diagnostic.go index 11bf03c8..b3618f9c 100644 --- a/experimental/report/diagnostic.go +++ b/experimental/report/diagnostic.go @@ -380,3 +380,15 @@ func (pageBreak) apply(d *Diagnostic) { } d.snippets[len(d.snippets)-1].pageBreak = true } + +// ShiftSpan shifts all snippets in the diagnostic to be offsets within the new file by adding offset. 
+func (d *Diagnostic) ShiftSpan(file *source.File, offset int) { + if d == nil { + return + } + for i := range d.snippets { + d.snippets[i].Span.File = file + d.snippets[i].Span.Start += offset + d.snippets[i].Span.End += offset + } +} diff --git a/experimental/report/report.go b/experimental/report/report.go index 8b9f27a4..66b73236 100644 --- a/experimental/report/report.go +++ b/experimental/report/report.go @@ -491,3 +491,13 @@ func (r *Report) push(skip int, level Level) *Diagnostic { return d } + +// ShiftReportSpans shifts all diagnostics in the report to use the new file and add the given byte offset. +func ShiftReportSpans(r *Report, file *source.File, offset int) { + if r == nil { + return + } + for i := range r.Diagnostics { + r.Diagnostics[i].ShiftSpan(file, offset) + } +} diff --git a/experimental/token/keyword/keyword.go b/experimental/token/keyword/keyword.go index c32ca471..01b6bc46 100644 --- a/experimental/token/keyword/keyword.go +++ b/experimental/token/keyword/keyword.go @@ -1,17 +1,3 @@ -// Copyright 2020-2026 Buf Technologies, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - // Code generated by github.com/bufbuild/protocompile/internal/enum keyword.yaml. DO NOT EDIT. 
package keyword diff --git a/experimental/token/number.go b/experimental/token/number.go index b3e7686c..984a2ddf 100644 --- a/experimental/token/number.go +++ b/experimental/token/number.go @@ -79,13 +79,12 @@ func (n NumberToken) Prefix() source.Span { // Suffix returns an arbitrary suffix attached to this number (the suffix will // have no whitespace before the end of the digits). func (n NumberToken) Suffix() source.Span { - if n.Raw() == nil || n.Raw().Prefix == 0 { + if n.Raw() == nil || n.Raw().Suffix == 0 { return source.Span{} } span := n.Token().Span() - span.Start = span.End - int(n.Raw().Suffix) - return span + return span.Range(span.Len()-int(n.Raw().Suffix), span.Len()) } // Mantissa returns the mantissa digits for this literal, i.e., everything diff --git a/internal/decimal/decimal.go b/internal/decimal/decimal.go index bb5b0e39..8d5dd58c 100644 --- a/internal/decimal/decimal.go +++ b/internal/decimal/decimal.go @@ -249,3 +249,8 @@ func (z *Decimal) digits() int { func (z *Decimal) base2() bool { return z.flags&base2 != 0 } func (z *Decimal) base10() bool { return z.flags&base2 == 0 } + +// Exp returns this value's exponent. +func (z *Decimal) Exp() int { + return int(z.exp) +}