Add structured ParseError with O(log n) position lookup (#274)

git-hulk · claude · web-flow · commit 3c3822c5e43a · 2026-06-10T21:23:42.000+08:00
Replace the stringly-typed parse error and its per-error full-input
rescan with a typed *ParseError that carries the byte offset, 1-based
line/column, the offending token, and the expected token kinds/keyword.

- error.go: ParseError type; Error() renders the line:col message plus a
  caret, matching the previous CLI presentation.
- source.go: lineStarts type maps a byte offset to a 1-based line/column
  via binary search over line-start offsets, built once on the first
  error instead of rescanning the input every time.
- parser_common.go: expectTokenKind/expectKeyword capture position and
  expected tokens at the failure site; wrapError finalizes any error
  into a *ParseError, leaving the long tail of fmt.Errorf sites working
  via the Msg field.

Note: line/column are now 1-based (previously 0-based) and error
messages were reworded; callers can now extract the error with
errors.As(err, &pe), where pe is declared as `var pe *ParseError`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/parser/error.go b/parser/error.go
@@ -0,0 +1,81 @@
+package parser
+
+import (
+	"fmt"
+	"strings"
+)
+
+// ParseError is a structured parse error. It carries the byte offset and the
+// 1-based line/column where parsing stopped, the offending token, and (when
+// known) the tokens the grammar expected at that point. Callers such as
+// editors or linters can inspect these fields programmatically; the CLI relies
+// on Error() to render a human-friendly message with a caret.
+type ParseError struct {
+	Pos      Pos         // byte offset where parsing stopped
+	Line     int         // 1-based line number; 0 means not yet resolved
+	Column   int         // 1-based column number, counted in bytes from the line start
+	Got      *Token      // the token we choked on; nil at end of input
+	Expected []TokenKind // token kinds the grammar wanted here, if known
+	Keyword  string      // a specific keyword that was expected; rendered in preference to Expected
+	Msg      string      // free-form message; when non-empty it is rendered instead of Keyword/Expected
+
+	input  string     // original input, for rendering the caret line
+	starts lineStarts // line-start offsets for input; when nil, Error() omits the source line and caret
+}
+
+// Error implements the error interface. The first line has the form
+// "line L:C <summary>"; when source context is attached, the offending source
+// line and a caret marker follow on the next two lines.
+func (e *ParseError) Error() string {
+	var out strings.Builder
+	out.WriteString(fmt.Sprintf("line %d:%d %s\n", e.Line, e.Column, e.summary()))
+	e.renderCaret(&out)
+	return out.String()
+}
+
+// summary builds the one-line description of the error. It uses the most
+// specific information present, in this order: the free-form Msg, the
+// expected Keyword, the expected token kind(s), and finally a generic
+// "syntax error".
+func (e *ParseError) summary() string {
+	if e.Msg != "" {
+		return e.Msg
+	}
+
+	got := tokenDesc(e.Got)
+	if e.Keyword != "" {
+		return fmt.Sprintf("expected keyword <%q>, but got '%s'", e.Keyword, got)
+	}
+	if len(e.Expected) == 0 {
+		return "syntax error"
+	}
+	if len(e.Expected) == 1 {
+		return fmt.Sprintf("expected '%s', but got '%s'", e.Expected[0], got)
+	}
+
+	// Several alternatives were acceptable: list them all.
+	kinds := make([]string, 0, len(e.Expected))
+	for _, kind := range e.Expected {
+		kinds = append(kinds, string(kind))
+	}
+	return fmt.Sprintf("expected one of [%s], but got '%s'", strings.Join(kinds, ", "), got)
+}
+
+// renderCaret appends two lines to b: the source line the error is on, then a
+// marker line with Column-1 spaces followed by carets. One caret is written
+// per byte of the offending token's text, with a minimum of one. Nothing is
+// written when no line-start table is attached to the error.
+func (e *ParseError) renderCaret(b *strings.Builder) {
+	if e.starts == nil {
+		return
+	}
+
+	b.WriteString(e.starts.lineText(e.input, e.Line))
+	b.WriteByte('\n')
+
+	// Column is 1-based, so the marker is indented by Column-1 spaces.
+	if pad := e.Column - 1; pad > 0 {
+		b.WriteString(strings.Repeat(" ", pad))
+	}
+
+	carets := "^"
+	if e.Got != nil && len(e.Got.String) > 1 {
+		carets = strings.Repeat("^", len(e.Got.String))
+	}
+	b.WriteString(carets)
+	b.WriteByte('\n')
+}
+
+// tokenDesc names a token by its kind for use in error messages. A nil token
+// stands for end of input and is reported as the EOF kind.
+func tokenDesc(t *Token) string {
+	if t != nil {
+		return string(t.Kind)
+	}
+	return string(TokenKindEOF)
+}
diff --git a/parser/error_test.go b/parser/error_test.go
@@ -0,0 +1,49 @@
+package parser
+
+import (
+	"errors"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestParseError_Structured(t *testing.T) {
+	// The failure sits on the second line, so the reported position has to
+	// account for the newline instead of being a flat byte count.
+	const sql = "SELECT 1\nFROM 123"
+	_, err := NewParser(sql).ParseStmts()
+	require.Error(t, err)
+
+	var parseErr *ParseError
+	require.True(t, errors.As(err, &parseErr), "expected a *ParseError, got %T", err)
+	require.Equal(t, 2, parseErr.Line)
+	require.GreaterOrEqual(t, parseErr.Column, 1)
+
+	// The rendered message carries the position, the offending source line,
+	// and a caret.
+	for _, want := range []string{"line 2:", "FROM 123", "^"} {
+		require.Contains(t, parseErr.Error(), want)
+	}
+}
+
+func TestParseError_ExpectedKeyword(t *testing.T) {
+	// After "IF" the grammar requires "EXISTS" / "NOT EXISTS"; the resulting
+	// error must expose the expected keyword as a field, not just as text.
+	const sql = "DROP TABLE IF foo"
+	_, err := NewParser(sql).ParseStmts()
+	require.Error(t, err)
+
+	var parseErr *ParseError
+	require.True(t, errors.As(err, &parseErr))
+	require.Equal(t, "EXISTS", parseErr.Keyword)
+	require.Equal(t, 1, parseErr.Line)
+}
+
+func TestParseError_ExpectedTokenKind(t *testing.T) {
+	// A function call whose paren is never closed fails in expectTokenKind,
+	// so the error must expose the expected token kind as a field.
+	const sql = "SELECT count(a"
+	_, err := NewParser(sql).ParseStmts()
+	require.Error(t, err)
+
+	var parseErr *ParseError
+	require.True(t, errors.As(err, &parseErr))
+	require.Equal(t, []TokenKind{TokenKindRParen}, parseErr.Expected)
+
+	rendered := parseErr.Error()
+	require.True(t, strings.HasPrefix(rendered, "line "))
+}
diff --git a/parser/parser_common.go b/parser/parser_common.go
@@ -8,6 +8,16 @@ import (
 
 type Parser struct {
 	lexer *Lexer
+	lines lineStarts // line-start offsets of lexer.input; nil until first requested
+}
+
+// lineStarts returns the line-start offsets for the input, computing and
+// caching them on first use so parses that never fail do not pay for the scan.
+func (p *Parser) lineStarts() lineStarts {
+	if p.lines == nil {
+		p.lines = newLineStarts(p.lexer.input)
+	}
+	return p.lines
 }
 
 func NewParser(buffer string) *Parser {
@@ -62,7 +72,11 @@ func (p *Parser) expectTokenKind(kind TokenKind) error {
 	if lastToken := p.tryConsumeTokenKind(kind); lastToken != nil {
 		return nil
 	}
-	return fmt.Errorf("expected the last token kind is: %s, but got %s", kind, p.lastTokenKind())
+	return &ParseError{
+		Pos:      p.Pos(),
+		Got:      p.last(),
+		Expected: []TokenKind{kind},
+	}
 }
 
 func (p *Parser) tryConsumeTokenKind(kind TokenKind) *Token {
@@ -89,7 +103,11 @@ func (p *Parser) matchOneOfKeywords(keywords ...string) bool {
 
 func (p *Parser) expectKeyword(keyword string) error {
 	if !p.matchKeyword(keyword) {
-		return fmt.Errorf("expected keyword: %s, but got %s", keyword, p.lastTokenKind())
+		return &ParseError{
+			Pos:     p.Pos(),
+			Got:     p.last(),
+			Keyword: keyword,
+		}
 	}
 	_ = p.lexer.consumeToken()
 	return nil
@@ -376,48 +394,30 @@ func (p *Parser) parseFormat(pos Pos) (*FormatClause, error) {
 	}, nil
 }
 
+// wrapError finalizes a parse error: it ensures the error is a *ParseError with
+// line/column resolved and the input attached for caret rendering. Errors that
+// already originate as *ParseError (from the expect* helpers) keep their
+// captured position and expected-token information; the long tail of
+// fmt.Errorf sites is wrapped here with the current position. A nil err is
+// returned as nil. The byte offset is clamped to the input length before the
+// line/column lookup so an error at end of input reports a column on the last
+// line rather than past it; the Pos field itself is left as captured.
 func (p *Parser) wrapError(err error) error {
 	if err == nil {
 		return nil
 	}
 
-	lineNo := 0
-	column := 0
-
-	// p.Pos() can exceed the input length when the lexer is at EOF,
-	// so clamp the upper bound to avoid an index-out-of-range panic.
-	upperBound := int(p.Pos())
-	if upperBound > len(p.lexer.input) {
-		upperBound = len(p.lexer.input)
-	}
-	for i := 0; i < upperBound; i++ {
-		if p.lexer.input[i] == '\n' {
-			lineNo++
-			column = 0
-		} else {
-			column++
+	var pe *ParseError
+	if !errors.As(err, &pe) {
+		pe = &ParseError{
+			Pos: p.Pos(),
+			Got: p.last(),
+			Msg: err.Error(),
 		}
 	}
-
-	lines := strings.Split(p.lexer.input, "\n")
-	var buf strings.Builder
-	buf.WriteString(fmt.Sprintf("line %d:%d %s\n", lineNo, column, err.Error()))
-	for i, line := range lines {
-		if i == lineNo {
-			buf.WriteString(line)
-			buf.WriteByte('\n')
-			for j := 0; j < column; j++ {
-				buf.WriteByte(' ')
-			}
-			if p.last() != nil {
-				buf.WriteString(strings.Repeat("^", len(p.last().String)))
-			} else {
-				buf.WriteString("^")
-			}
-			buf.WriteByte('\n')
-		}
+	if pe.Line == 0 {
+		// p.Pos() can exceed the input length when the lexer is at EOF, so
+		// clamp the offset before resolving it; otherwise the reported
+		// column would run past the end of the last line.
+		offset := int(pe.Pos)
+		if offset > len(p.lexer.input) {
+			offset = len(p.lexer.input)
+		}
+		pe.Line, pe.Column = p.lineStarts().position(offset)
 	}
-	return errors.New(buf.String())
+	pe.input = p.lexer.input
+	pe.starts = p.lineStarts()
+	return pe
 }
 
 func (p *Parser) parseRatioExpr(pos Pos) (*RatioExpr, error) {
diff --git a/parser/source.go b/parser/source.go
@@ -0,0 +1,47 @@
+package parser
+
+import "sort"
+
+// lineStarts records, for every line of the input, the byte offset at which
+// that line begins; index i corresponds to 0-based line i. Computing it once
+// lets error reporting avoid re-scanning the whole buffer for each error.
+type lineStarts []int
+
+// newLineStarts scans input once and returns its line-start table. Line 0
+// always starts at offset 0, and every '\n' opens a new line right after it.
+func newLineStarts(input string) lineStarts {
+	table := lineStarts{0}
+	for i, c := range []byte(input) {
+		if c != '\n' {
+			continue
+		}
+		table = append(table, i+1)
+	}
+	return table
+}
+
+// position returns the 1-based line and column for a byte offset. Columns
+// count bytes from the start of the line, not runes. Negative offsets are
+// treated as 0, and a receiver with no recorded line starts is treated as a
+// single line beginning at offset 0, so both results are always >= 1 and the
+// call never panics.
+func (s lineStarts) position(offset int) (line, col int) {
+	if offset < 0 {
+		offset = 0
+	}
+	// sort.Search yields the first line start beyond offset; the line that
+	// contains offset is the one just before it.
+	i := sort.Search(len(s), func(i int) bool {
+		return s[i] > offset
+	}) - 1
+	if i < 0 {
+		// Only reachable for an empty table: a table built by newLineStarts
+		// has s[0] == 0, which is <= any non-negative offset.
+		return 1, offset + 1
+	}
+	return i + 1, offset - s[i] + 1
+}
+
+// lineText extracts the text of a 1-based line from input, dropping the
+// trailing "\n" or "\r\n". Lines outside the table yield an empty string.
+func (s lineStarts) lineText(input string, line int) string {
+	idx := line - 1
+	if idx < 0 || idx >= len(s) {
+		return ""
+	}
+
+	// The last line runs to the end of input; any other line stops just
+	// before the '\n' that precedes the next line's start.
+	from, to := s[idx], len(input)
+	if next := idx + 1; next < len(s) {
+		to = s[next] - 1
+	}
+	// Drop the '\r' of a "\r\n" terminator as well.
+	if to > from && input[to-1] == '\r' {
+		to--
+	}
+	return input[from:to]
+}