Skip to content

Commit c4db6db

Browse files
committed
feat(extract): add brace-balanced JSON object/array extractor
Port the rtk extract_json_object brace-balancer (src/parser/mod.rs) to native Go. The extractor finds the first balanced JSON object in text that may contain surrounding prose, markdown code fences, or other LLM output artifacts. Algorithm: 1. Scan for the first '{' (or '[') that is not inside a double-quoted string. 2. Brace-balance, supporting nested objects, arrays, and strings. 3. Skip string contents (respecting backslash escapes). 4. Stop when depth returns to zero. Apostrophes in English prose (e.g. "Here's the JSON:") are NOT treated as string delimiters — only standard JSON double-quotes are. This avoids false positives when LLM output is wrapped in natural language. Functions: - ExtractJSONObject(s) Result — first balanced {…} in s - ExtractJSONArray(s) Result — first balanced […] in s - ExtractAllJSONObjects(s) []Result — every top-level object Exposed at the top-level tok API as tok.ExtractJSON, tok.ExtractJSONArray, and tok.ExtractAllJSON. Source: rtk-ai/rtk, src/parser/mod.rs (extract_json_object). Tests: 11 cases including LLM output with prose, markdown fences, nested objects, escaped quotes, and unterminated inputs.
1 parent 82644b1 commit c4db6db

4 files changed

Lines changed: 537 additions & 0 deletions

File tree

extract.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
// JSON extraction from prose — public API.
2+
//
3+
// Wraps internal/extract to expose brace-balanced JSON object and
4+
// array extraction as a top-level tok API. Useful for parsing LLM
5+
// output that contains JSON embedded in surrounding prose.
6+
package tok
7+
8+
import "github.com/GrayCodeAI/tok/internal/extract"
9+
10+
// JSONExtract is the result of an extraction attempt. It is an alias
// for extract.Result from the internal/extract package.
11+
type JSONExtract = extract.Result
12+
13+
// ExtractJSON returns the first JSON object found in text by
14+
// delegating to extract.ExtractJSONObject.
15+
//
// Found is false when text contains no '{' outside a double-quoted
// string. If an opening brace is found but never balanced, Found is
// still true and JSON holds everything from that brace to the end of
// text.
//
16+
// Handles nested objects, arrays, escaped quotes, and double-quoted
17+
// strings. Skips preceding and trailing prose. Apostrophes in English
18+
// prose (e.g. "Here's the JSON:") are NOT treated as string delimiters
19+
// (only standard JSON double-quotes are).
20+
//
21+
// Example:
22+
//
23+
//	out := tok.ExtractJSON(`Sure! Here's the data: {"name": "alice"}`)
24+
//	// out.JSON = `{"name": "alice"}`, out.Found = true
25+
func ExtractJSON(text string) JSONExtract {
26+
return extract.ExtractJSONObject(text)
27+
}
28+
29+
// ExtractJSONArray is the array counterpart of ExtractJSON. It looks
30+
// for a balanced `[...]` instead of `{...}` by delegating to
// extract.ExtractJSONArray.
31+
func ExtractJSONArray(text string) JSONExtract {
32+
return extract.ExtractJSONArray(text)
33+
}
34+
35+
// ExtractAllJSON returns every top-level JSON object found in text,
36+
// in source order, by delegating to extract.ExtractAllJSONObjects.
37+
// Useful for parsing LLM output that contains multiple JSON snippets.
// The returned slice is nil when text contains no object.
38+
func ExtractAllJSON(text string) []JSONExtract {
39+
return extract.ExtractAllJSONObjects(text)
40+
}

extract_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package tok_test
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/GrayCodeAI/tok"
8+
)
9+
10+
func TestExtractJSON_TopLevel(t *testing.T) {
11+
tests := []struct {
12+
name string
13+
input string
14+
want string
15+
found bool
16+
}{
17+
{"bare", `{"a": 1}`, `{"a": 1}`, true},
18+
{"with prose", `Sure! {"a": 1} done`, `{"a": 1}`, true},
19+
{"not found", "no braces", "", false},
20+
}
21+
for _, tc := range tests {
22+
t.Run(tc.name, func(t *testing.T) {
23+
r := tok.ExtractJSON(tc.input)
24+
if r.Found != tc.found {
25+
t.Errorf("found mismatch: got %v want %v", r.Found, tc.found)
26+
}
27+
if r.JSON != tc.want {
28+
t.Errorf("JSON mismatch: got %q want %q", r.JSON, tc.want)
29+
}
30+
})
31+
}
32+
}
33+
34+
func TestExtractJSON_ApostropheInProse(t *testing.T) {
35+
// Regression: apostrophes in English prose must not be confused
36+
// with string delimiters.
37+
in := "Here's the JSON: {\"a\": 1}"
38+
r := tok.ExtractJSON(in)
39+
if !r.Found {
40+
t.Fatal("expected to find JSON")
41+
}
42+
if !strings.Contains(r.JSON, `"a"`) {
43+
t.Errorf("unexpected extraction: %q", r.JSON)
44+
}
45+
}
46+
47+
func TestExtractJSONArray_TopLevel(t *testing.T) {
48+
r := tok.ExtractJSONArray(`Result: [1, 2, 3]`)
49+
if !r.Found {
50+
t.Fatal("expected to find array")
51+
}
52+
if r.JSON != "[1, 2, 3]" {
53+
t.Errorf("unexpected array: %q", r.JSON)
54+
}
55+
}
56+
57+
func TestExtractAllJSON_TopLevel(t *testing.T) {
58+
in := `First: {"a": 1} middle {"b": 2} tail`
59+
out := tok.ExtractAllJSON(in)
60+
if len(out) != 2 {
61+
t.Fatalf("expected 2 objects, got %d", len(out))
62+
}
63+
}

internal/extract/extract.go

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
// Package extract provides utilities for extracting structured data
2+
// (JSON objects, arrays) from text that may contain prose around it.
3+
//
4+
// Common use case: an LLM returns something like
5+
//
6+
// "Sure! Here's the JSON you asked for:
7+
// {\"name\": \"alice\", \"age\": 30}
8+
// Hope that helps!"
9+
//
10+
// and the caller wants just the {"name": "alice", "age": 30} part. The
11+
// ExtractJSONObject function performs brace-balanced extraction that
12+
// handles nested objects, arrays, escaped quotes, and strings.
13+
//
14+
// Source: rtk-ai/rtk, src/parser/mod.rs (extract_json_object).
15+
// Ported to native Go.
16+
package extract
17+
18+
// Result is the outcome of an extraction attempt. The zero value
19+
// (Found false, empty JSON) means nothing was extracted.
20+
//
21+
// When the opening delimiter is never balanced, the extractors in this
// package still set Found and let JSON run to the end of the input, so
// End is then the input length rather than one past a closing delimiter.
22+
type Result struct {
23+
JSON string // extracted substring, without the surrounding prose
24+
Start int // byte offset of the opening brace or bracket in the input
25+
End int // byte offset one past the last extracted byte
26+
Found bool // true when an opener was found and JSON is populated
27+
}
28+
29+
// ExtractJSONObject returns the first JSON object found in s, or the
30+
// zero Result (Found false) if s has no '{' outside a string literal.
31+
//
32+
// The function:
33+
//
34+
// 1. Scans for the first '{' that is not inside a string literal.
35+
// 2. Tracks brace depth, supporting nested objects and arrays.
36+
// 3. Skips over the contents of double-quoted strings, honoring
37+
// backslash escapes. Single quotes are NOT string delimiters.
38+
// 4. Stops when depth returns to zero.
39+
//
// If the opening brace is never balanced, the result still has Found
// set and JSON holds everything from the brace to the end of s.
//
40+
// The output preserves the exact bytes from the input (no
41+
// normalization). If the input is a valid bare JSON object, the output
42+
// equals the input.
43+
func ExtractJSONObject(s string) Result {
44+
return extractFirst(s, '{', '}')
45+
}
46+
47+
// ExtractJSONArray is the array counterpart of ExtractJSONObject.
48+
// It looks for a balanced '[...]' instead of '{...}', using the same
// extractFirst scanner with '[' and ']' as the delimiters.
49+
func ExtractJSONArray(s string) Result {
50+
return extractFirst(s, '[', ']')
51+
}
52+
53+
// extractFirst is the common implementation for both object and array
54+
// extraction. The opener and closer parameters are byte literals.
55+
//
56+
// Only double-quote strings are recognized (standard JSON). Single
57+
// quotes / apostrophes in English prose are ignored, which is what
58+
// callers want for LLM output where "Here's the JSON:" is common
59+
// prose around the actual JSON.
60+
func extractFirst(s string, opener, closer byte) Result {
61+
// Phase 1: find the first opener that isn't inside a string.
62+
start := findUnquotedOpener(s, opener)
63+
if start < 0 {
64+
return Result{Found: false}
65+
}
66+
67+
// Phase 2: brace-balance, skipping string contents.
68+
depth := 0
69+
i := start
70+
for i < len(s) {
71+
c := s[i]
72+
switch c {
73+
case '\\':
74+
// Skip the escaped character. This handles \" inside a
75+
// string, \\ for a literal backslash, and \n, \t, etc.
76+
i += 2
77+
continue
78+
case '"':
79+
// Skip the entire double-quoted string literal.
80+
i = skipDoubleQuotedString(s, i)
81+
continue
82+
case opener:
83+
depth++
84+
i++
85+
case closer:
86+
depth--
87+
i++
88+
if depth == 0 {
89+
return Result{
90+
JSON: s[start:i],
91+
Start: start,
92+
End: i,
93+
Found: true,
94+
}
95+
}
96+
default:
97+
i++
98+
}
99+
}
100+
// Unterminated; return what we have so far if depth > 0.
101+
if depth > 0 {
102+
return Result{
103+
JSON: s[start:],
104+
Start: start,
105+
End: len(s),
106+
Found: true,
107+
}
108+
}
109+
return Result{Found: false}
110+
}
111+
112+
// findUnquotedOpener returns the index of the first byte equal to c
113+
// that is not inside a double-quoted string literal, or -1 if none
114+
// exists. Single quotes / apostrophes in English prose are NOT treated
115+
// as string delimiters.
116+
func findUnquotedOpener(s string, c byte) int {
117+
i := 0
118+
for i < len(s) {
119+
switch s[i] {
120+
case '\\':
121+
i += 2
122+
continue
123+
case '"':
124+
i = skipDoubleQuotedString(s, i)
125+
continue
126+
}
127+
if s[i] == c {
128+
return i
129+
}
130+
i++
131+
}
132+
return -1
133+
}
134+
135+
// skipDoubleQuotedString advances past a double-quoted string literal
// that starts at s[i]; the caller must ensure s[i] is the opening `"`.
//
// A backslash escapes the byte that follows it, so `\"` does not close
// the literal and `\\` does not escape the quote after it.
//
// It returns the index one past the closing `"`. If the literal is
// unterminated the result is len(s); it is never greater than len(s),
// even when the input ends with a dangling backslash.
func skipDoubleQuotedString(s string, i int) int {
	for i++; i < len(s); i++ {
		switch s[i] {
		case '\\':
			i++ // also consume the escaped byte
		case '"':
			return i + 1
		}
	}
	// A trailing backslash pushes i one past len(s); clamp so the result
	// always honors the documented bound.
	if i > len(s) {
		return len(s)
	}
	return i
}
152+
153+
// ExtractAllJSONObjects returns every top-level balanced JSON object
154+
// in s, in source order. Useful when parsing LLM output that may
155+
// contain several JSON snippets (e.g. a list of objects in prose).
156+
func ExtractAllJSONObjects(s string) []Result {
157+
var out []Result
158+
i := 0
159+
for i < len(s) {
160+
// Find next opener from position i
161+
start := findUnquotedOpener(s[i:], '{')
162+
if start < 0 {
163+
break
164+
}
165+
start += i
166+
// Try to extract starting from `start`
167+
r := extractFrom(s, start, '{', '}')
168+
if !r.Found {
169+
break
170+
}
171+
out = append(out, r)
172+
i = r.End
173+
}
174+
return out
175+
}
176+
177+
// extractFrom is a helper for ExtractAllJSONObjects that starts scanning
178+
// from a known opener position (no opener-search needed).
179+
func extractFrom(s string, start int, opener, closer byte) Result {
180+
depth := 0
181+
i := start
182+
for i < len(s) {
183+
c := s[i]
184+
switch c {
185+
case '\\':
186+
i += 2
187+
continue
188+
case '"':
189+
i = skipDoubleQuotedString(s, i)
190+
continue
191+
case opener:
192+
depth++
193+
i++
194+
case closer:
195+
depth--
196+
i++
197+
if depth == 0 {
198+
return Result{
199+
JSON: s[start:i],
200+
Start: start,
201+
End: i,
202+
Found: true,
203+
}
204+
}
205+
default:
206+
i++
207+
}
208+
}
209+
if depth > 0 {
210+
return Result{
211+
JSON: s[start:],
212+
Start: start,
213+
End: len(s),
214+
Found: true,
215+
}
216+
}
217+
return Result{Found: false}
218+
}

0 commit comments

Comments
 (0)