Skip to content

Commit 3c3822c

Browse files
git-hulk and claude authored
Add structured ParseError with O(log n) position lookup (#274)
Replace the stringly-typed parse error and its per-error full-input rescan with a typed *ParseError that carries the byte offset, 1-based line/column, the offending token, and the expected token kinds/keyword. - error.go: ParseError type; Error() renders the line:col message plus a caret, matching the previous CLI presentation. - source.go: lineStarts type maps a byte offset to a 1-based line/column via binary search over line-start offsets, built once on the first error instead of rescanning the input every time. - parser_common.go: expectTokenKind/expectKeyword capture position and expected tokens at the failure site; wrapError finalizes any error into a *ParseError, leaving the long tail of fmt.Errorf sites working via the Msg field. Note: line/column are now 1-based (previously 0-based) and error messages were reworded; callers can now match the error with errors.As using a *ParseError target, i.e. `var pe *ParseError; errors.As(err, &pe)`. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 5df9a6c commit 3c3822c

4 files changed

Lines changed: 213 additions & 36 deletions

File tree

parser/error.go

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package parser
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
)
7+
8+
// ParseError is a structured parse error. It carries the byte offset and the
9+
// 1-based line/column where parsing stopped, the offending token, and (when
10+
// known) the tokens the grammar expected at that point. Callers such as
11+
// editors or linters can inspect these fields programmatically; the CLI relies
12+
// on Error() to render a human-friendly message with a caret.
13+
type ParseError struct {
14+
Pos Pos // byte offset into the input where parsing stopped
15+
Line int // 1-based line number; 0 means "not resolved yet" and is filled in by wrapError
16+
Column int // 1-based column, counted in bytes from the start of the line
17+
Got *Token // the token we choked on; nil at end of input
18+
Expected []TokenKind // token kinds the grammar wanted here, if known
19+
Keyword string // a specific keyword that was expected, if any
20+
Msg string // free-form message for the long tail of error sites; when non-empty it wins over Keyword/Expected in the summary
21+
22+
input string // original input, for rendering the caret line
23+
starts lineStarts // shared line-start offsets, for extracting the offending line; nil disables caret rendering
24+
}
25+
26+
func (e *ParseError) Error() string {
27+
var b strings.Builder
28+
fmt.Fprintf(&b, "line %d:%d %s\n", e.Line, e.Column, e.summary())
29+
e.renderCaret(&b)
30+
return b.String()
31+
}
32+
33+
// summary returns the single-line description of the error, preferring the
34+
// most specific information available.
35+
func (e *ParseError) summary() string {
36+
switch {
37+
case e.Msg != "":
38+
return e.Msg
39+
case e.Keyword != "":
40+
return fmt.Sprintf("expected keyword <%q>, but got '%s'", e.Keyword, tokenDesc(e.Got))
41+
case len(e.Expected) == 1:
42+
return fmt.Sprintf("expected '%s', but got '%s'", e.Expected[0], tokenDesc(e.Got))
43+
case len(e.Expected) > 1:
44+
parts := make([]string, len(e.Expected))
45+
for i, k := range e.Expected {
46+
parts[i] = string(k)
47+
}
48+
return fmt.Sprintf("expected one of [%s], but got '%s'", strings.Join(parts, ", "), tokenDesc(e.Got))
49+
default:
50+
return "syntax error"
51+
}
52+
}
53+
54+
// renderCaret writes the offending source line followed by a caret pointing at
55+
// the error column.
56+
func (e *ParseError) renderCaret(b *strings.Builder) {
57+
if e.starts == nil {
58+
return
59+
}
60+
line := e.starts.lineText(e.input, e.Line)
61+
b.WriteString(line)
62+
b.WriteByte('\n')
63+
for i := 1; i < e.Column; i++ {
64+
b.WriteByte(' ')
65+
}
66+
width := 1
67+
if e.Got != nil && len(e.Got.String) > width {
68+
width = len(e.Got.String)
69+
}
70+
b.WriteString(strings.Repeat("^", width))
71+
b.WriteByte('\n')
72+
}
73+
74+
// tokenDesc describes a token for an error message, matching the kind-based
75+
// wording used throughout the parser. A nil token means end of input.
76+
func tokenDesc(t *Token) string {
77+
if t == nil {
78+
return string(TokenKindEOF)
79+
}
80+
return string(t.Kind)
81+
}

parser/error_test.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package parser
2+
3+
import (
4+
"errors"
5+
"strings"
6+
"testing"
7+
8+
"github.com/stretchr/testify/require"
9+
)
10+
11+
func TestParseError_Structured(t *testing.T) {
12+
// The error is on the second line, so line/column must reflect the
13+
// multi-line offset rather than a flat byte count.
14+
_, err := NewParser("SELECT 1\nFROM 123").ParseStmts()
15+
require.Error(t, err)
16+
17+
var pe *ParseError
18+
require.True(t, errors.As(err, &pe), "expected a *ParseError, got %T", err)
19+
require.Equal(t, 2, pe.Line)
20+
require.GreaterOrEqual(t, pe.Column, 1)
21+
require.Contains(t, pe.Error(), "line 2:")
22+
// The rendered message includes the offending source line and a caret.
23+
require.Contains(t, pe.Error(), "FROM 123")
24+
require.Contains(t, pe.Error(), "^")
25+
}
26+
27+
func TestParseError_ExpectedKeyword(t *testing.T) {
28+
// "IF" must be followed by "EXISTS" / "NOT EXISTS"; the failure carries the
29+
// expected keyword structurally.
30+
_, err := NewParser("DROP TABLE IF foo").ParseStmts()
31+
require.Error(t, err)
32+
33+
var pe *ParseError
34+
require.True(t, errors.As(err, &pe))
35+
require.Equal(t, "EXISTS", pe.Keyword)
36+
require.Equal(t, 1, pe.Line)
37+
}
38+
39+
func TestParseError_ExpectedTokenKind(t *testing.T) {
40+
// An unclosed function-call paren flows through expectTokenKind, so the
41+
// failure carries the expected token kind structurally.
42+
_, err := NewParser("SELECT count(a").ParseStmts()
43+
require.Error(t, err)
44+
45+
var pe *ParseError
46+
require.True(t, errors.As(err, &pe))
47+
require.Equal(t, []TokenKind{TokenKindRParen}, pe.Expected)
48+
require.True(t, strings.HasPrefix(pe.Error(), "line "))
49+
}

parser/parser_common.go

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@ import (
88

99
// Parser wraps a Lexer and caches the line-start table used to turn byte
// offsets into line/column positions when an error is reported.
type Parser struct {
1010
lexer *Lexer // token source; its input field holds the text being parsed
11+
lines lineStarts // lazily built on the first error, for position lookup; nil until then
12+
}
13+
14+
// lineStarts returns the line-start offsets for the input, building them on
15+
// first use. Errors are the cold path, so we avoid paying for this on success.
16+
func (p *Parser) lineStarts() lineStarts {
17+
if p.lines == nil {
18+
p.lines = newLineStarts(p.lexer.input)
19+
}
20+
return p.lines
1121
}
1222

1323
func NewParser(buffer string) *Parser {
@@ -62,7 +72,11 @@ func (p *Parser) expectTokenKind(kind TokenKind) error {
6272
if lastToken := p.tryConsumeTokenKind(kind); lastToken != nil {
6373
return nil
6474
}
65-
return fmt.Errorf("expected the last token kind is: %s, but got %s", kind, p.lastTokenKind())
75+
return &ParseError{
76+
Pos: p.Pos(),
77+
Got: p.last(),
78+
Expected: []TokenKind{kind},
79+
}
6680
}
6781

6882
func (p *Parser) tryConsumeTokenKind(kind TokenKind) *Token {
@@ -89,7 +103,11 @@ func (p *Parser) matchOneOfKeywords(keywords ...string) bool {
89103

90104
func (p *Parser) expectKeyword(keyword string) error {
91105
if !p.matchKeyword(keyword) {
92-
return fmt.Errorf("expected keyword: %s, but got %s", keyword, p.lastTokenKind())
106+
return &ParseError{
107+
Pos: p.Pos(),
108+
Got: p.last(),
109+
Keyword: keyword,
110+
}
93111
}
94112
_ = p.lexer.consumeToken()
95113
return nil
@@ -376,48 +394,30 @@ func (p *Parser) parseFormat(pos Pos) (*FormatClause, error) {
376394
}, nil
377395
}
378396

397+
// wrapError finalizes a parse error: it ensures the error is a *ParseError with
398+
// line/column resolved and the input attached for caret rendering. Errors that
399+
// already originate as *ParseError (from the expect* helpers) keep their
400+
// captured position and expected-token information; the long tail of
401+
// fmt.Errorf sites is wrapped here with the current position.
379402
func (p *Parser) wrapError(err error) error {
380403
if err == nil {
381404
return nil
382405
}
383406

384-
lineNo := 0
385-
column := 0
386-
387-
// p.Pos() can exceed the input length when the lexer is at EOF,
388-
// so clamp the upper bound to avoid an index-out-of-range panic.
389-
upperBound := int(p.Pos())
390-
if upperBound > len(p.lexer.input) {
391-
upperBound = len(p.lexer.input)
392-
}
393-
for i := 0; i < upperBound; i++ {
394-
if p.lexer.input[i] == '\n' {
395-
lineNo++
396-
column = 0
397-
} else {
398-
column++
407+
var pe *ParseError
408+
if !errors.As(err, &pe) {
409+
pe = &ParseError{
410+
Pos: p.Pos(),
411+
Got: p.last(),
412+
Msg: err.Error(),
399413
}
400414
}
401-
402-
lines := strings.Split(p.lexer.input, "\n")
403-
var buf strings.Builder
404-
buf.WriteString(fmt.Sprintf("line %d:%d %s\n", lineNo, column, err.Error()))
405-
for i, line := range lines {
406-
if i == lineNo {
407-
buf.WriteString(line)
408-
buf.WriteByte('\n')
409-
for j := 0; j < column; j++ {
410-
buf.WriteByte(' ')
411-
}
412-
if p.last() != nil {
413-
buf.WriteString(strings.Repeat("^", len(p.last().String)))
414-
} else {
415-
buf.WriteString("^")
416-
}
417-
buf.WriteByte('\n')
418-
}
415+
if pe.Line == 0 {
416+
pe.Line, pe.Column = p.lineStarts().position(int(pe.Pos))
419417
}
420-
return errors.New(buf.String())
418+
pe.input = p.lexer.input
419+
pe.starts = p.lineStarts()
420+
return pe
421421
}
422422

423423
func (p *Parser) parseRatioExpr(pos Pos) (*RatioExpr, error) {

parser/source.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package parser
2+
3+
import "sort"
4+
5+
// lineStarts holds the byte offset where each line of the input begins
6+
// (lineStarts[i] is the start of 0-based line i). It is built once so that
7+
// error reporting does not re-scan the whole buffer for every error.
8+
// A table built by newLineStarts starts with 0 and is in ascending order;
// position relies on that ordering for its binary search.
type lineStarts []int
9+
10+
func newLineStarts(input string) lineStarts {
11+
starts := lineStarts{0}
12+
for i := 0; i < len(input); i++ {
13+
if input[i] == '\n' {
14+
starts = append(starts, i+1)
15+
}
16+
}
17+
return starts
18+
}
19+
20+
// position returns the 1-based line and column for a byte offset.
21+
func (s lineStarts) position(offset int) (line, col int) {
22+
// Find the largest line start that is <= offset.
23+
i := sort.Search(len(s), func(i int) bool {
24+
return s[i] > offset
25+
}) - 1
26+
if i < 0 {
27+
i = 0
28+
}
29+
return i + 1, offset - s[i] + 1
30+
}
31+
32+
// lineText returns the text of the given 1-based line, without the trailing
33+
// line terminator. It returns an empty string for out-of-range lines.
34+
func (s lineStarts) lineText(input string, line int) string {
35+
if line < 1 || line > len(s) {
36+
return ""
37+
}
38+
start := s[line-1]
39+
end := len(input)
40+
if line < len(s) {
41+
end = s[line] - 1 // exclude the '\n'
42+
}
43+
if end > start && input[end-1] == '\r' {
44+
end-- // exclude a '\r' from a '\r\n' terminator
45+
}
46+
return input[start:end]
47+
}

0 commit comments

Comments
 (0)