perf(ingest): sub-slice indexing in splitOnDelimiter (closes #354) (#467)

xe-nvdk · web-flow · commit 89a4f250791e · 2026-05-27T13:44:03.000-06:00
The Line Protocol parser's tokenizer appended bytes one-at-a-time into a growing []byte per part, allocating a fresh backing array for every space and every comma split in every line-protocol line. Replaced with sub-slice indexing: track a `start` index, emit `data[start:i]` sub-slices on delimiter hits, return slices that alias the input buffer.

Behavior unchanged at all edges (consecutive/leading/trailing delimiters, escape pairs, quote tracking, empty input). Aliasing safe because every downstream caller copies via string(...) or unescape() before storage — no []byte references escape the parse call.

Bench (Apple M3 Max, telegraf line + 10-line batch):
- ParseLine:  1547 ns → 915 ns  (-41% ns, -52% allocs)
- ParseBatch: 9215 ns → 6326 ns (-31% ns, -50% allocs)

Additional optimizations landed in the same PR:
- Pre-sized parts slice (cap=4) — the first four appends need no regrowth
- Empty-input fast path (Gemini R1) — defensive guard
- Lazy parts allocation (Gemini R2) — no-delimiter inputs return 1-element literal instead of cap=4 (-720 B / -6.6% on ParseBatch)

Declined with bench evidence: dynamic cap=8 on comma path (Gemini R1) regressed the full path +9% / +1.9 KB because most production comma-splits have ≤4 parts.

BenchmarkSplitOnDelimiter added covering both delimiter paths so future regressions don't slip past CI.

3 Gemini rounds (R3 clean pass). All 33 line-protocol tests pass under -race × 2.
diff --git a/internal/ingest/lineprotocol.go b/internal/ingest/lineprotocol.go
@@ -146,31 +146,74 @@ func (p *LineProtocolParser) parseLineWithPrecision(line []byte, precision strin
 
 // splitOnDelimiter splits data on an unescaped delimiter, respecting escaped chars and quoted strings.
 // Used by splitLine (space) and splitOnComma (comma).
+//
+// The returned slices alias `data` — callers MUST NOT mutate either
+// the returned parts or the input buffer. (Current callers only read,
+// passing parts into parseMeasurementTags / parseFields / strconv.)
+// Sub-slice indexing avoids the per-byte append seen in the previous
+// implementation, which on a typical telegraf line allocates a fresh
+// growing slice for every part — measurable cost on the ingest hot
+// path (#354).
 func splitOnDelimiter(data []byte, delim byte) [][]byte {
+	// Empty-input fast path: return nil immediately when there's nothing
+	// to split. The code below would also return nil for empty input
+	// (parts is allocated lazily), so this is purely a defensive early
+	// exit; callers on the production hot path pre-check len(line) > 0.
+	if len(data) == 0 {
+		return nil
+	}
+	// Lazy parts allocation: many real inputs have zero delimiters
+	// (measurement with no tags `cpu`, single-field writes `v=1`).
+	// Deferring the make() until we actually need it lets the
+	// no-delimiter case end with a 1-element literal slice instead
+	// of cap=4 — saves the cap-4 allocation on every such call. The
+	// with-delimiter case still allocates cap=4 once on first split
+	// (same as before, just lazily). Verified: this is a net win on
+	// full-path ParseBatch.
+	//
+	// Note: tried dynamic capacity (cap=8 on comma) per Gemini R1 but
+	// the full-path ParseBatch bench regressed by +9% / +1.9 KB
+	// because most production comma-splits have ≤4 parts. Kept at 4
+	// with bench evidence.
 	var parts [][]byte
-	var current []byte
+	start := 0
 	inQuotes := false
 
+	// Note on `continue` below: bytes consumed by the escape and quote
+	// branches are NOT dropped — they remain inside the current sub-slice
+	// because sub-slicing captures the whole [start:next-delim] range.
+	// `continue` here only skips the delimiter check; the byte itself is
+	// implicitly captured when the next delimiter or end-of-data fires.
 	for i := 0; i < len(data); i++ {
 		if data[i] == '\\' && i+1 < len(data) {
-			// Escaped character - include both backslash and next char
-			current = append(current, data[i], data[i+1])
+			// Escape consumes the next byte verbatim. Advance past it
+			// so the delimiter check below doesn't fire on, e.g., an
+			// escaped space ('\ ') or escaped comma ('\,').
 			i++
-		} else if data[i] == '"' {
+			continue
+		}
+		if data[i] == '"' {
 			inQuotes = !inQuotes
-			current = append(current, data[i])
-		} else if data[i] == delim && !inQuotes {
-			if len(current) > 0 {
-				parts = append(parts, current)
-				current = nil
+			continue
+		}
+		if data[i] == delim && !inQuotes {
+			if i > start {
+				if parts == nil {
+					parts = make([][]byte, 0, 4)
+				}
+				parts = append(parts, data[start:i])
 			}
-		} else {
-			current = append(current, data[i])
+			start = i + 1
 		}
 	}
 
-	if len(current) > 0 {
-		parts = append(parts, current)
+	if len(data) > start {
+		if parts == nil {
+			// No delimiters seen — single-element literal beats
+			// cap=4 make+append for this common case.
+			return [][]byte{data[start:]}
+		}
+		parts = append(parts, data[start:])
 	}
 
 	return parts
diff --git a/internal/ingest/lineprotocol_test.go b/internal/ingest/lineprotocol_test.go
@@ -560,3 +560,26 @@ disk,host=server03 used=30.0 1609459200000000002`)
 		parser.ParseBatch(batch)
 	}
 }
+
+// BenchmarkSplitOnDelimiter exercises the underlying splitter directly
+// (closes #354 — measures the sub-slice indexing fix vs the previous
+// per-byte append loop). The two inputs mirror the two call sites:
+// `splitLine` (spaces; 3 parts here) and `splitOnComma` (commas; 6 parts
+// here, which exceeds the initial cap=4 and so includes one regrowth).
+func BenchmarkSplitOnDelimiter(b *testing.B) {
+	line := []byte("cpu,host=server01,region=us-west,env=prod usage_idle=90.5,usage_system=2.1,usage_user=7.4 1609459200000000000")
+	tags := []byte("cpu,host=server01,region=us-west,env=prod,role=primary,zone=a")
+
+	b.Run("space_delimiter", func(b *testing.B) {
+		b.ReportAllocs()
+		for i := 0; i < b.N; i++ {
+			_ = splitOnDelimiter(line, ' ')
+		}
+	})
+	b.Run("comma_delimiter", func(b *testing.B) {
+		b.ReportAllocs()
+		for i := 0; i < b.N; i++ {
+			_ = splitOnDelimiter(tags, ',')
+		}
+	})
+}