goodhistogram/quantile.go at main · cockroachdb/goodhistogram · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
// Copyright 2026 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0

package goodhistogram

import (
	"math"
	"sort"
)

// ValueAtQuantile returns the estimated value at the given quantile q ∈ [0, 1]
// using trapezoidal interpolation.
//
// Unlike the standard linear (uniform-density) interpolation used by Prometheus,
// trapezoidal interpolation estimates the probability density at each bucket
// boundary by averaging the densities of adjacent buckets. Within each bucket
// the density is modeled as varying linearly from the left boundary density to
// the right boundary density — a trapezoid. The CDF within a bucket is then the
// integral of this linear density, which is a quadratic. The quantile value is
// found by solving this quadratic.
//
// This produces more accurate estimates when the true distribution is not
// uniform within buckets (i.e., nearly always), especially near bucket
// boundaries where the density changes.
func (s *Snapshot) ValueAtQuantile(q float64) float64 {
	if s.TotalCount == 0 {
		return 0
	}
	// Target rank (fractional, 0-based) across all observations,
	// including underflow and overflow.
	rank := q * float64(s.TotalCount)
	if rank <= 0 {
		// Return the lower bound of the first non-empty region.
		if s.ZeroCount+s.Underflow > 0 {
			return s.cfg.lo
		}
		for i, c := range s.Counts {
			if c > 0 {
				return s.cfg.boundaries[i]
			}
		}
		return s.cfg.hi
	}
	if rank >= float64(s.TotalCount) {
		// Return the upper bound of the last non-empty region.
		if s.Overflow > 0 {
			return s.cfg.hi
		}
		for i := len(s.Counts) - 1; i >= 0; i-- {
			if s.Counts[i] > 0 {
				return s.cfg.boundaries[i+1]
			}
		}
		return s.cfg.lo
	}

	// Underflow and zero observations form an implicit region below lo.
	// If the quantile falls within this region, clamp to lo.
	belowLo := float64(s.ZeroCount + s.Underflow)
	if rank <= belowLo {
		return s.cfg.lo
	}
	// Adjust rank to be relative to the in-range buckets.
	rank -= belowLo

	// Count of in-range observations (buckets only, excluding
	// underflow, overflow, and zeros).
	var inRangeCount uint64
	for _, c := range s.Counts {
		inRangeCount += c
	}

	// If the quantile falls beyond all in-range observations, it's
	// in the overflow region. Clamp to hi, matching Prometheus's
	// behavior of clamping to the last explicit bucket boundary.
	if rank > float64(inRangeCount) {
		return s.cfg.hi
	}

	n := len(s.Counts)

	// Step 1: Compute average density in each bucket.
	// density[i] = count[i] / width[i]
	avgDensity := make([]float64, n)
	for i := range n {
		w := s.cfg.boundaries[i+1] - s.cfg.boundaries[i]
		if w > 0 && s.Counts[i] > 0 {
			avgDensity[i] = float64(s.Counts[i]) / w
		}
	}

	// Step 2: Estimate density at each boundary by averaging neighbors.
	// boundaryDensity has length n+1 (one per boundary). At the outer edges
	// there is no neighbor on one side, so we use the adjacent bucket's
	// density directly rather than averaging with zero — otherwise the
	// rightmost-bucket interpolation gets biased low (which matters a lot
	// for p99 in long-tailed distributions).
	boundaryDensity := make([]float64, n+1)
	boundaryDensity[0] = avgDensity[0]
	boundaryDensity[n] = avgDensity[n-1]
	for i := 1; i < n; i++ {
		boundaryDensity[i] = (avgDensity[i-1] + avgDensity[i]) / 2.0
	}

	// Step 3: Walk buckets to find which one contains the target rank,
	// then solve the trapezoidal CDF within that bucket.
	var cumCount float64
	for i := range n {
		fc := float64(s.Counts[i])
		if cumCount+fc >= rank {
			localRank := rank - cumCount
			lo := s.cfg.boundaries[i]
			hi := s.cfg.boundaries[i+1]
			w := hi - lo
			if w <= 0 || fc == 0 {
				return lo
			}

			dL := boundaryDensity[i]
			dR := boundaryDensity[i+1]

			return trapezoidalSolve(lo, w, fc, dL, dR, localRank)
		}
		cumCount += fc
	}
	// Should not reach here, but clamp to upper bound.
	return s.cfg.boundaries[n]
}

// ValuesAtQuantiles returns the estimated values at the given quantiles
// in a single pass through the bucket array. The input quantiles should be
// in [0, 1]. Results are returned in the same order as the input quantiles.
//
// This is more efficient than calling ValueAtQuantile in a loop because it
// computes bucket densities once and walks the bucket array once regardless
// of how many quantiles are requested.
func (s *Snapshot) ValuesAtQuantiles(qs []float64) []float64 {
	results := make([]float64, len(qs))
	if len(qs) == 0 || s.TotalCount == 0 {
		return results
	}

	belowLo := float64(s.ZeroCount + s.Underflow)
	var inRangeCount uint64
	for _, c := range s.Counts {
		inRangeCount += c
	}

	// Resolve edge cases per-quantile and collect those needing a bucket walk.
	type walkEntry struct {
		idx  int     // index in results
		rank float64 // adjusted rank within in-range buckets
	}
	var walk []walkEntry
	for i, q := range qs {
		rank := q * float64(s.TotalCount)
		if rank <= 0 {
			if s.ZeroCount+s.Underflow > 0 {
				results[i] = s.cfg.lo
			} else {
				results[i] = s.cfg.hi
				for j, c := range s.Counts {
					if c > 0 {
						results[i] = s.cfg.boundaries[j]
						break
					}
				}
			}
			continue
		}
		if rank >= float64(s.TotalCount) {
			if s.Overflow > 0 {
				results[i] = s.cfg.hi
			} else {
				results[i] = s.cfg.lo
				for j := len(s.Counts) - 1; j >= 0; j-- {
					if s.Counts[j] > 0 {
						results[i] = s.cfg.boundaries[j+1]
						break
					}
				}
			}
			continue
		}
		if rank <= belowLo {
			results[i] = s.cfg.lo
			continue
		}
		adjusted := rank - belowLo
		if adjusted > float64(inRangeCount) {
			results[i] = s.cfg.hi
			continue
		}
		walk = append(walk, walkEntry{idx: i, rank: adjusted})
	}

	if len(walk) == 0 {
		return results
	}

	// Sort by rank for a single ascending pass through the buckets.
	sort.Slice(walk, func(i, j int) bool {
		return walk[i].rank < walk[j].rank
	})

	n := len(s.Counts)

	// Compute densities once (same as ValueAtQuantile).
	avgDensity := make([]float64, n)
	for i := range n {
		w := s.cfg.boundaries[i+1] - s.cfg.boundaries[i]
		if w > 0 && s.Counts[i] > 0 {
			avgDensity[i] = float64(s.Counts[i]) / w
		}
	}
	boundaryDensity := make([]float64, n+1)
	boundaryDensity[0] = avgDensity[0]
	boundaryDensity[n] = avgDensity[n-1]
	for i := 1; i < n; i++ {
		boundaryDensity[i] = (avgDensity[i-1] + avgDensity[i]) / 2.0
	}

	// Single-pass bucket walk: process all quantiles whose rank falls
	// within each bucket before advancing to the next.
	var cumCount float64
	wi := 0
	for i := range n {
		fc := float64(s.Counts[i])
		nextCum := cumCount + fc
		for wi < len(walk) && nextCum >= walk[wi].rank {
			localRank := walk[wi].rank - cumCount
			lo := s.cfg.boundaries[i]
			hi := s.cfg.boundaries[i+1]
			w := hi - lo
			if w <= 0 || fc == 0 {
				results[walk[wi].idx] = lo
			} else {
				dL := boundaryDensity[i]
				dR := boundaryDensity[i+1]
				results[walk[wi].idx] = trapezoidalSolve(lo, w, fc, dL, dR, localRank)
			}
			wi++
		}
		cumCount = nextCum
		if wi >= len(walk) {
			break
		}
	}

	// Any remaining entries (shouldn't happen, but safety).
	for ; wi < len(walk); wi++ {
		results[walk[wi].idx] = s.cfg.boundaries[n]
	}

	return results
}

// Mean returns the mean of all recorded values. Returns NaN if no
// observations have been recorded, matching Prometheus histogram behavior.
func (s *Snapshot) Mean() float64 {
	if s.TotalCount == 0 {
		return math.NaN()
	}
	return float64(s.TotalSum) / float64(s.TotalCount)
}

// Total returns the total count and sum.
func (s *Snapshot) Total() (int64, float64) {
	return int64(s.TotalCount), float64(s.TotalSum)
}

// trapezoidalSolve finds the value x ∈ [lo, lo+w] such that the area under a
// linear density function from lo to x equals localRank.
//
// The density is modeled as f(t) = dL + (dR - dL) * (t - lo) / w, where dL and
// dR are the densities at the left and right bucket boundaries. However, the
// raw boundary densities may not integrate to exactly the bucket count over
// [lo, lo+w], so we scale them: the area of the trapezoid w*(dL+dR)/2 should
// equal the bucket count fc.
//
// The CDF within the bucket is:
//
//	F(x) = a*x + b*x^2  (with x measured from lo)
//
// where a = scaled dL, b = (scaled dR - scaled dL) / (2*w).
// We solve a*x + b*x^2 = localRank via the quadratic formula.
func trapezoidalSolve(lo, w, fc, dL, dR, localRank float64) float64 {
	// Scale boundary densities so the trapezoid area matches the count.
	rawArea := w * (dL + dR) / 2.0
	if rawArea <= 0 {
		// Both boundary densities are zero — fall back to linear interpolation.
		return lo + w*(localRank/fc)
	}
	scale := fc / rawArea
	sL := dL * scale // scaled left density
	sR := dR * scale // scaled right density

	// Degenerate case: uniform density (sL ≈ sR). Linear interpolation.
	if math.Abs(sR-sL) < 1e-12*(sL+sR) {
		return lo + w*(localRank/fc)
	}

	// Quadratic: sL*x + ((sR-sL)/(2*w))*x^2 = localRank
	// Rewrite as: ((sR-sL)/(2*w))*x^2 + sL*x - localRank = 0
	a := (sR - sL) / (2.0 * w)
	b := sL
	c := -localRank
	disc := b*b - 4*a*c
	if disc < 0 {
		// Numerical issue; fall back to linear.
		return lo + w*(localRank/fc)
	}
	// Use the numerically stable quadratic formula.
	var x float64
	if b >= 0 {
		x = (2 * c) / (-b - math.Sqrt(disc))
	} else {
		x = (-b + math.Sqrt(disc)) / (2 * a)
	}

	// Clamp to bucket range.
	if x < 0 {
		x = 0
	} else if x > w {
		x = w
	}
	return lo + x
}