Skip to content

Commit b8300ab

Browse files
feat(LOG-9387): Introduce Ingester metrics to better understand LOG-9387
1 parent 3e5aaf9 commit b8300ab

2 files changed

Lines changed: 39 additions & 0 deletions

File tree

pkg/ingester/instance.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,18 @@ func (i *instance) consumeChunk(ctx context.Context, ls labels.Labels, chunk *lo
229229
// happened to *the last stream in the request*. Ex: if three streams are part of the PushRequest
230230
// and all three failed, the returned error only describes what happened to the last processed stream.
231231
func (i *instance) Push(ctx context.Context, req *logproto.PushRequest) error {
232+
i.metrics.pushInflight.Inc()
233+
defer i.metrics.pushInflight.Dec()
234+
232235
record := recordPool.GetRecord()
233236
record.UserID = i.instanceID
234237
defer recordPool.PutRecord(record)
235238
rateLimitWholeStream := i.limiter.limits.ShardStreams(i.instanceID).Enabled
236239

237240
var appendErr error
241+
var chunkLockTotal time.Duration
238242
for _, reqStream := range req.Streams {
243+
lockStart := time.Now()
239244

240245
s, _, err := i.streams.LoadOrStoreNew(reqStream.Labels,
241246
func() (*stream, error) {
@@ -259,9 +264,12 @@ func (i *instance) Push(ctx context.Context, req *logproto.PushRequest) error {
259264

260265
_, appendErr = s.Push(ctx, reqStream.Entries, record, 0, false, rateLimitWholeStream, i.customStreamsTracker, req.Format)
261266
s.chunkMtx.Unlock()
267+
chunkLockTotal += time.Since(lockStart)
262268
}
269+
i.metrics.pushChunkMemoryDuration.Observe(chunkLockTotal.Seconds())
263270

264271
if !record.IsEmpty() {
272+
walStart := time.Now()
265273
if err := i.wal.Log(record); err != nil {
266274
if e, ok := err.(*os.PathError); ok && e.Err == syscall.ENOSPC {
267275
i.metrics.walDiskFullFailures.Inc()
@@ -272,9 +280,15 @@ func (i *instance) Push(ctx context.Context, req *logproto.PushRequest) error {
272280
)
273281
})
274282
} else {
283+
i.metrics.pushWALDuration.Observe(time.Since(walStart).Seconds())
275284
return err
276285
}
277286
}
287+
i.metrics.pushWALDuration.Observe(time.Since(walStart).Seconds())
288+
}
289+
290+
if ctx.Err() != nil {
291+
i.metrics.pushContextAlreadyDone.Inc()
278292
}
279293

280294
return appendErr

pkg/ingester/metrics.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,12 @@ type ingesterMetrics struct {
7171
flushQueueLength prometheus.Gauge
7272
duplicateLogBytesTotal *prometheus.CounterVec
7373
streamsOwnershipCheck prometheus.Histogram
74+
75+
// Push path instrumentation (LOG-9387 investigation)
76+
pushChunkMemoryDuration prometheus.Histogram
77+
pushWALDuration prometheus.Histogram
78+
pushInflight prometheus.Gauge
79+
pushContextAlreadyDone prometheus.Counter
7480
}
7581

7682
// setRecoveryBytesInUse bounds the bytes reports to >= 0.
@@ -334,5 +340,24 @@ func newIngesterMetrics(r prometheus.Registerer, metricsNamespace string) *inges
334340
Name: "duplicate_log_bytes_total",
335341
Help: "The total number of bytes that were discarded for duplicate log lines.",
336342
}, []string{"tenant"}),
343+
344+
pushChunkMemoryDuration: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
345+
Name: "loki_ingester_push_chunk_memory_duration_seconds",
346+
Help: "Time spent reading entries into chunk memory per push request.",
347+
Buckets: prometheus.ExponentialBuckets(0.001, 4, 9),
348+
}),
349+
pushWALDuration: promauto.With(r).NewHistogram(prometheus.HistogramOpts{
350+
Name: "loki_ingester_push_wal_duration_seconds",
351+
Help: "Time spent writing to the WAL per push request.",
352+
Buckets: prometheus.ExponentialBuckets(0.001, 4, 9),
353+
}),
354+
pushInflight: promauto.With(r).NewGauge(prometheus.GaugeOpts{
355+
Name: "loki_ingester_push_inflight",
356+
Help: "Number of push requests currently being processed.",
357+
}),
358+
pushContextAlreadyDone: promauto.With(r).NewCounter(prometheus.CounterOpts{
359+
Name: "loki_ingester_push_context_already_done_total",
360+
Help: "Total push requests that completed after their context was already canceled.",
361+
}),
337362
}
338363
}

0 commit comments

Comments
 (0)