Skip to content

Commit 182fdad

Browse files
aepfli and claude authored
test(go): add p99 latency stability tests for GC pressure detection (#103)
Verify that p99 evaluation latency remains stable over 1M sustained evaluations, catching the "spike and stay" pattern seen in other WASM-based SDKs where per-evaluation allocations cause growing GC pauses. Sequential test covers 4 scenarios: small context, large context (55 keys), deeply nested targeting (9 variables, 3-level nesting), and big store (500 flags). Concurrent test runs 4 goroutines against the big store with big targeting. Both skip in -short mode. ## Description <!-- Provide a brief description of your changes --> ## Related Issue <!-- Link to the related issue(s) --> Closes # ## Type of Change <!-- Mark the relevant option with an "x" --> - [ ] `feat`: New feature (minor version bump) - [ ] `fix`: Bug fix (patch version bump) - [ ] `docs`: Documentation only changes - [ ] `chore`: Maintenance tasks, dependency updates - [ ] `refactor`: Code refactoring without functional changes - [ ] `test`: Adding or updating tests - [ ] `ci`: CI/CD changes - [ ] `perf`: Performance improvements - [ ] `build`: Build system changes - [ ] `style`: Code style/formatting changes ## PR Title Format **IMPORTANT**: Since we use squash and merge, your PR title will become the commit message. 
Please ensure your PR title follows the [Conventional Commits](https://www.conventionalcommits.org/) format: ``` <type>(<optional-scope>): <description> ``` ### Examples: - `feat(operators): add new string comparison operator` - `fix(wasm): correct memory allocation bug` - `docs: update API examples in README` - `chore(deps): update rust dependencies` For breaking changes, use `!` after the type/scope or include `BREAKING CHANGE:` in the PR description: - `feat(api)!: redesign evaluation API` ## Testing <!-- Describe the testing you've performed --> - [ ] Unit tests added/updated - [ ] Integration tests added/updated - [ ] Manual testing performed - [ ] All tests pass (`cargo test`) - [ ] Code is formatted (`cargo fmt`) - [ ] Clippy checks pass (`cargo clippy -- -D warnings`) - [ ] WASM builds successfully (if applicable) ## Breaking Changes <!-- If this introduces breaking changes, describe them here --> - [ ] This PR includes breaking changes - [ ] Documentation has been updated to reflect breaking changes - [ ] Migration guide included (if needed) ## Additional Notes <!-- Any additional information, context, or screenshots --> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent cb7d304 commit 182fdad

1 file changed

Lines changed: 365 additions & 0 deletions

File tree

go/latency_test.go

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,365 @@
1+
package evaluator
2+
3+
import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"testing"
	"time"
)
10+
11+
// bigTargetingConfig is a single-flag store whose "big-flag" carries a deeply
// nested targeting rule: 9 distinct context variables checked across three
// levels of if/and/or. Each evaluation therefore triggers more WASM memory
// allocations (variable lookups and intermediate results) than
// complexTargetingConfig, making it a better probe for GC pressure.
const bigTargetingConfig = `{
  "flags": {
    "big-flag": {
      "state": "ENABLED",
      "defaultVariant": "none",
      "variants": {
        "premium": "premium-tier",
        "standard": "standard-tier",
        "basic": "basic-tier",
        "none": "no-tier"
      },
      "targeting": {
        "if": [
          { "and": [
            { "==": [{ "var": "tier" }, "premium"] },
            { ">": [{ "var": "score" }, 90] },
            { "==": [{ "var": "region" }, "us-east"] },
            { "in": [{ "var": "role" }, ["admin", "superadmin"]] }
          ]},
          "premium",
          { "if": [
            { "or": [
              { "and": [
                { "==": [{ "var": "tier" }, "standard"] },
                { ">": [{ "var": "score" }, 50] }
              ]},
              { "and": [
                { "==": [{ "var": "department" }, "engineering"] },
                { ">=": [{ "var": "experience" }, 5] }
              ]},
              { "and": [
                { "==": [{ "var": "country" }, "US"] },
                { ">": [{ "var": "level" }, 3] }
              ]}
            ]},
            "standard",
            { "if": [
              { "or": [
                { ">": [{ "var": "score" }, 20] },
                { "!=": [{ "var": "plan" }, "free"] }
              ]},
              "basic",
              null
            ]}
          ]}
        ]
      }
    }
  }
}`
64+
65+
// generateBigStoreConfig creates a flag store with n padding flags (mix of static,
66+
// targeting, disabled) plus the big targeting flag. This bloats WASM linear memory
67+
// and increases per-evaluation overhead from the larger flag index.
68+
func generateBigStoreConfig(n int) string {
69+
var buf []byte
70+
buf = append(buf, `{"flags":{`...)
71+
72+
// The big targeting flag we'll actually evaluate
73+
buf = append(buf, `"big-flag":{
74+
"state":"ENABLED","defaultVariant":"none",
75+
"variants":{"premium":"premium-tier","standard":"standard-tier","basic":"basic-tier","none":"no-tier"},
76+
"targeting":{"if":[
77+
{"and":[{"==":[{"var":"tier"},"premium"]},{">":[{"var":"score"},90]},{"==":[{"var":"region"},"us-east"]},{"in":[{"var":"role"},["admin","superadmin"]]}]},
78+
"premium",
79+
{"if":[
80+
{"or":[{"and":[{"==":[{"var":"tier"},"standard"]},{">":[{"var":"score"},50]}]},{"and":[{"==":[{"var":"department"},"engineering"]},{">=":[{"var":"experience"},5]}]},{"and":[{"==":[{"var":"country"},"US"]},{">":[{"var":"level"},3]}]}]},
81+
"standard",
82+
{"if":[{"or":[{">":[{"var":"score"},20]},{"!=":[{"var":"plan"},"free"]}]},"basic",null]}
83+
]}
84+
]}}`...)
85+
86+
// Padding flags: mix of static, targeting, and disabled
87+
for i := 0; i < n; i++ {
88+
buf = append(buf, ',')
89+
switch i % 3 {
90+
case 0: // static
91+
buf = append(buf, fmt.Sprintf(`"pad-flag-%d":{"state":"ENABLED","defaultVariant":"on","variants":{"on":true,"off":false}}`, i)...)
92+
case 1: // targeting
93+
buf = append(buf, fmt.Sprintf(`"pad-flag-%d":{"state":"ENABLED","defaultVariant":"off","variants":{"on":true,"off":false},"targeting":{"if":[{"==":[{"var":"tier"},"premium"]},"on","off"]}}`, i)...)
94+
case 2: // disabled
95+
buf = append(buf, fmt.Sprintf(`"pad-flag-%d":{"state":"DISABLED","defaultVariant":"off","variants":{"on":true,"off":false}}`, i)...)
96+
}
97+
}
98+
buf = append(buf, `}}`...)
99+
return string(buf)
100+
}
101+
102+
// TestP99LatencyStability runs sustained sequential evaluations across 10-second
103+
// time windows for 2 minutes per scenario, asserting that p99 latency does not
104+
// degrade over time. This catches GC pressure issues where per-evaluation
105+
// allocations accumulate and cause GC pauses to grow.
106+
func TestP99LatencyStability(t *testing.T) {
107+
if testing.Short() {
108+
t.Skip("skipping latency stability test in short mode")
109+
}
110+
111+
const (
112+
windowDuration = 10 * time.Second
113+
numWindows = 12 // 12 × 10s = 2 minutes per sub-test
114+
)
115+
116+
type subTest struct {
117+
name string
118+
config string
119+
flagKey string
120+
makeCtx func(i int) map[string]interface{}
121+
}
122+
123+
tests := []subTest{
124+
{
125+
name: "SmallContext",
126+
config: complexTargetingConfig,
127+
flagKey: "complex-flag",
128+
makeCtx: func(i int) map[string]interface{} {
129+
return map[string]interface{}{
130+
"targetingKey": fmt.Sprintf("user-%d", i),
131+
"tier": "premium",
132+
"role": "admin",
133+
"region": "us-east",
134+
"score": i % 100,
135+
}
136+
},
137+
},
138+
{
139+
name: "LargeContext",
140+
config: complexTargetingConfig,
141+
flagKey: "complex-flag",
142+
makeCtx: func(i int) map[string]interface{} {
143+
ctx := map[string]interface{}{
144+
"targetingKey": fmt.Sprintf("user-%d", i),
145+
"tier": "premium",
146+
"role": "admin",
147+
"region": "us-east",
148+
"score": i % 100,
149+
}
150+
for j := 0; j < 50; j++ {
151+
ctx[fmt.Sprintf("attr_%d", j)] = fmt.Sprintf("value-%d-%d", i, j)
152+
}
153+
return ctx
154+
},
155+
},
156+
{
157+
name: "BigTargeting",
158+
config: bigTargetingConfig,
159+
flagKey: "big-flag",
160+
makeCtx: func(i int) map[string]interface{} {
161+
ctx := map[string]interface{}{
162+
"targetingKey": fmt.Sprintf("user-%d", i),
163+
"tier": "premium",
164+
"role": "admin",
165+
"region": "us-east",
166+
"score": i % 100,
167+
"department": "engineering",
168+
"experience": i % 15,
169+
"country": "US",
170+
"level": i % 10,
171+
"plan": "pro",
172+
}
173+
for j := 0; j < 50; j++ {
174+
ctx[fmt.Sprintf("attr_%d", j)] = fmt.Sprintf("value-%d-%d", i, j)
175+
}
176+
return ctx
177+
},
178+
},
179+
{
180+
name: "BigStore",
181+
config: generateBigStoreConfig(500),
182+
flagKey: "big-flag",
183+
makeCtx: func(i int) map[string]interface{} {
184+
ctx := map[string]interface{}{
185+
"targetingKey": fmt.Sprintf("user-%d", i),
186+
"tier": "premium",
187+
"role": "admin",
188+
"region": "us-east",
189+
"score": i % 100,
190+
"department": "engineering",
191+
"experience": i % 15,
192+
"country": "US",
193+
"level": i % 10,
194+
"plan": "pro",
195+
}
196+
for j := 0; j < 50; j++ {
197+
ctx[fmt.Sprintf("attr_%d", j)] = fmt.Sprintf("value-%d-%d", i, j)
198+
}
199+
return ctx
200+
},
201+
},
202+
}
203+
204+
for _, tc := range tests {
205+
t.Run(tc.name, func(t *testing.T) {
206+
e := newTestEvaluator(t)
207+
_, err := e.UpdateState(tc.config)
208+
if err != nil {
209+
t.Fatalf("UpdateState failed: %v", err)
210+
}
211+
212+
p99s := make([]time.Duration, numWindows)
213+
evalCounts := make([]int, numWindows)
214+
215+
for w := 0; w < numWindows; w++ {
216+
var latencies []time.Duration
217+
deadline := time.Now().Add(windowDuration)
218+
i := 0
219+
for time.Now().Before(deadline) {
220+
ctx := tc.makeCtx(w*1_000_000 + i)
221+
start := time.Now()
222+
_, err := e.EvaluateFlag(tc.flagKey, ctx)
223+
latencies = append(latencies, time.Since(start))
224+
if err != nil {
225+
t.Fatalf("EvaluateFlag failed: %v", err)
226+
}
227+
i++
228+
}
229+
p99s[w] = percentile(latencies, 0.99)
230+
evalCounts[w] = len(latencies)
231+
}
232+
233+
for w := range p99s {
234+
t.Logf("window %2d: p99 = %-12v evals = %d", w, p99s[w], evalCounts[w])
235+
}
236+
237+
// Window 0 is warmup; window 1 is baseline
238+
baseline := p99s[1]
239+
t.Logf("baseline (window 1): p99 = %v", baseline)
240+
241+
// Check 1: No window exceeds 3x baseline
242+
for w := 2; w < numWindows; w++ {
243+
if p99s[w] > 3*baseline {
244+
t.Errorf("window %d p99 (%v) exceeds 3x baseline (%v)", w, p99s[w], 3*baseline)
245+
}
246+
}
247+
248+
// Check 2: Last 5+ consecutive windows all above 1.5x = "spike and stay" pattern
249+
consecutiveAbove := 0
250+
for w := numWindows - 1; w >= 2; w-- {
251+
if p99s[w] > baseline+(baseline/2) {
252+
consecutiveAbove++
253+
} else {
254+
break
255+
}
256+
}
257+
if consecutiveAbove >= 5 {
258+
t.Errorf("last %d consecutive windows all above 1.5x baseline — p99 is not recovering", consecutiveAbove)
259+
}
260+
})
261+
}
262+
}
263+
264+
// TestP99LatencyStabilityConcurrent runs sustained parallel evaluations across
265+
// 10-second time windows for 2 minutes with big targeting + big store.
266+
// This catches issues that only manifest under pool contention + GC pressure.
267+
func TestP99LatencyStabilityConcurrent(t *testing.T) {
268+
if testing.Short() {
269+
t.Skip("skipping concurrent latency stability test in short mode")
270+
}
271+
272+
const (
273+
numGoroutines = 4
274+
windowDuration = 10 * time.Second
275+
numWindows = 12
276+
)
277+
278+
e, err := NewFlagEvaluator(WithPermissiveValidation(), WithPoolSize(numGoroutines))
279+
if err != nil {
280+
t.Fatalf("failed to create evaluator: %v", err)
281+
}
282+
t.Cleanup(func() { e.Close() })
283+
284+
_, err = e.UpdateState(generateBigStoreConfig(500))
285+
if err != nil {
286+
t.Fatalf("UpdateState failed: %v", err)
287+
}
288+
289+
p99s := make([]time.Duration, numWindows)
290+
evalCounts := make([]int, numWindows)
291+
292+
for w := 0; w < numWindows; w++ {
293+
var mu sync.Mutex
294+
var allLatencies []time.Duration
295+
296+
var wg sync.WaitGroup
297+
wg.Add(numGoroutines)
298+
for g := 0; g < numGoroutines; g++ {
299+
go func(gID int) {
300+
defer wg.Done()
301+
var local []time.Duration
302+
deadline := time.Now().Add(windowDuration)
303+
i := 0
304+
for time.Now().Before(deadline) {
305+
ctx := map[string]interface{}{
306+
"targetingKey": fmt.Sprintf("user-%d-%d-%d", w, gID, i),
307+
"tier": "premium",
308+
"role": "admin",
309+
"region": "us-east",
310+
"score": (gID*100_000 + i) % 100,
311+
"department": "engineering",
312+
"experience": i % 15,
313+
"country": "US",
314+
"level": i % 10,
315+
"plan": "pro",
316+
}
317+
start := time.Now()
318+
e.EvaluateFlag("big-flag", ctx)
319+
local = append(local, time.Since(start))
320+
i++
321+
}
322+
mu.Lock()
323+
allLatencies = append(allLatencies, local...)
324+
mu.Unlock()
325+
}(g)
326+
}
327+
wg.Wait()
328+
p99s[w] = percentile(allLatencies, 0.99)
329+
evalCounts[w] = len(allLatencies)
330+
}
331+
332+
for w := range p99s {
333+
t.Logf("window %2d: p99 = %-12v evals = %d", w, p99s[w], evalCounts[w])
334+
}
335+
336+
baseline := p99s[1]
337+
t.Logf("baseline (window 1): p99 = %v", baseline)
338+
339+
// Relaxed to 4x for concurrent (contention adds variance)
340+
for w := 2; w < numWindows; w++ {
341+
if p99s[w] > 4*baseline {
342+
t.Errorf("window %d p99 (%v) exceeds 4x baseline (%v)", w, p99s[w], 4*baseline)
343+
}
344+
}
345+
346+
consecutiveAbove := 0
347+
for w := numWindows - 1; w >= 2; w-- {
348+
if p99s[w] > baseline+(baseline/2) {
349+
consecutiveAbove++
350+
} else {
351+
break
352+
}
353+
}
354+
if consecutiveAbove >= 5 {
355+
t.Errorf("last %d consecutive windows all above 1.5x baseline — p99 is not recovering", consecutiveAbove)
356+
}
357+
}
358+
359+
// percentile returns the p-th percentile from a slice of durations.
360+
// p should be between 0 and 1 (e.g., 0.99 for p99).
361+
func percentile(latencies []time.Duration, p float64) time.Duration {
362+
sort.Slice(latencies, func(i, j int) bool { return latencies[i] < latencies[j] })
363+
idx := int(float64(len(latencies)-1) * p)
364+
return latencies[idx]
365+
}

0 commit comments

Comments
 (0)