Skip to content

Commit 89a4f25

Browse files
authored
perf(ingest): sub-slice indexing in splitOnDelimiter (closes #354) (#467)
The Line Protocol parser's tokenizer appended bytes one-at-a-time into a growing []byte per part, allocating a fresh backing array for every space and every comma split in every line-protocol line. Replaced with sub-slice indexing: track a `start` index, emit `data[start:i]` sub-slices on delimiter hits, return slices that alias the input buffer. Behavior unchanged at all edges (consecutive/leading/trailing delimiters, escape pairs, quote tracking, empty input). Aliasing safe because every downstream caller copies via string(...) or unescape() before storage — no []byte references escape the parse call. Bench (Apple M3 Max, telegraf line + 10-line batch): - ParseLine: 1547 ns → 915 ns (-41% ns, -52% allocs) - ParseBatch: 9215 ns → 6326 ns (-31% ns, -50% allocs) Additional optimizations landed in the same PR: - Pre-sized parts slice (cap=4) — first append allocation-free - Empty-input fast path (Gemini R1) — defensive guard - Lazy parts allocation (Gemini R2) — no-delimiter inputs return 1-element literal instead of cap=4 (-720 B / -6.6% on ParseBatch) Declined with bench evidence: dynamic cap=8 on comma path (Gemini R1) regressed the full path +9% / +1.9 KB because most production comma-splits have ≤4 parts. BenchmarkSplitOnDelimiter added covering both delimiter paths so future regressions don't slip past CI. 3 Gemini rounds (R3 clean pass). All 33 line-protocol tests pass under -race × 2.
1 parent c1ef424 commit 89a4f25

2 files changed

Lines changed: 79 additions & 13 deletions

File tree

internal/ingest/lineprotocol.go

Lines changed: 56 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -146,31 +146,74 @@ func (p *LineProtocolParser) parseLineWithPrecision(line []byte, precision strin
146146

147147
// splitOnDelimiter splits data on an unescaped delimiter, respecting escaped chars and quoted strings.
148148
// Used by splitLine (space) and splitOnComma (comma).
149+
//
150+
// The returned slices alias `data` — callers MUST NOT mutate either
151+
// the returned parts or the input buffer. (Current callers only read,
152+
// passing parts into parseMeasurementTags / parseFields / strconv.)
153+
// Sub-slice indexing avoids the per-byte append seen in the previous
154+
// implementation, which on a typical telegraf line allocates a fresh
155+
// growing slice for every part — measurable cost on the ingest hot
156+
// path (#354).
149157
func splitOnDelimiter(data []byte, delim byte) [][]byte {
158+
// Empty-input fast path: avoid the make([]) allocation when there's
159+
// nothing to split. Not reachable on the production hot path today
160+
// (callers pre-check len(line) > 0) but defensive for hypothetical
161+
// future callers.
162+
if len(data) == 0 {
163+
return nil
164+
}
165+
// Lazy parts allocation: many real inputs have zero delimiters
166+
// (measurement with no tags `cpu`, single-field writes `v=1`).
167+
// Deferring the make() until we actually need it lets the
168+
// no-delimiter case end with a 1-element literal slice instead
169+
// of cap=4 — saves the cap-4 allocation on every such call. The
170+
// with-delimiter case still allocates cap=4 once on first split
171+
// (same as before, just lazily). Verified: this is a net win on
172+
// full-path ParseBatch.
173+
//
174+
// Note: tried dynamic capacity (cap=8 on comma) per Gemini R1 but
175+
// the full-path ParseBatch bench regressed by +9% / +1.9 KB
176+
// because most production comma-splits have ≤4 parts. Kept at 4
177+
// with bench evidence.
150178
var parts [][]byte
151-
var current []byte
179+
start := 0
152180
inQuotes := false
153181

182+
// Note on `continue` below: bytes consumed by the escape and quote
183+
// branches are NOT dropped — they remain inside the current sub-slice
184+
// because sub-slicing captures the whole [start:next-delim] range.
185+
// `continue` here only skips the delimiter check; the byte itself is
186+
// implicitly captured when the next delimiter or end-of-data fires.
154187
for i := 0; i < len(data); i++ {
155188
if data[i] == '\\' && i+1 < len(data) {
156-
// Escaped character - include both backslash and next char
157-
current = append(current, data[i], data[i+1])
189+
// Escape consumes the next byte verbatim. Advance past it
190+
// so the delimiter check below doesn't fire on, e.g., an
191+
// escaped space ('\ ') or escaped comma ('\,').
158192
i++
159-
} else if data[i] == '"' {
193+
continue
194+
}
195+
if data[i] == '"' {
160196
inQuotes = !inQuotes
161-
current = append(current, data[i])
162-
} else if data[i] == delim && !inQuotes {
163-
if len(current) > 0 {
164-
parts = append(parts, current)
165-
current = nil
197+
continue
198+
}
199+
if data[i] == delim && !inQuotes {
200+
if i > start {
201+
if parts == nil {
202+
parts = make([][]byte, 0, 4)
203+
}
204+
parts = append(parts, data[start:i])
166205
}
167-
} else {
168-
current = append(current, data[i])
206+
start = i + 1
169207
}
170208
}
171209

172-
if len(current) > 0 {
173-
parts = append(parts, current)
210+
if len(data) > start {
211+
if parts == nil {
212+
// No delimiters seen — single-element literal beats
213+
// cap=4 make+append for this common case.
214+
return [][]byte{data[start:]}
215+
}
216+
parts = append(parts, data[start:])
174217
}
175218

176219
return parts

internal/ingest/lineprotocol_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,3 +560,26 @@ disk,host=server03 used=30.0 1609459200000000002`)
560560
parser.ParseBatch(batch)
561561
}
562562
}
563+
564+
// BenchmarkSplitOnDelimiter exercises the underlying splitter directly
565+
// (closes #354 — verifies the sub-slice indexing fix vs the previous
566+
// per-byte append loop). The two inputs cover the two real call sites:
567+
// `splitLine` (spaces, ~4 parts per telegraf line) and `splitOnComma`
568+
// (tags/fields, often 5-8 parts).
569+
func BenchmarkSplitOnDelimiter(b *testing.B) {
570+
line := []byte("cpu,host=server01,region=us-west,env=prod usage_idle=90.5,usage_system=2.1,usage_user=7.4 1609459200000000000")
571+
tags := []byte("cpu,host=server01,region=us-west,env=prod,role=primary,zone=a")
572+
573+
b.Run("space_delimiter", func(b *testing.B) {
574+
b.ReportAllocs()
575+
for i := 0; i < b.N; i++ {
576+
_ = splitOnDelimiter(line, ' ')
577+
}
578+
})
579+
b.Run("comma_delimiter", func(b *testing.B) {
580+
b.ReportAllocs()
581+
for i := 0; i < b.N; i++ {
582+
_ = splitOnDelimiter(tags, ',')
583+
}
584+
})
585+
}

0 commit comments

Comments
 (0)