Skip to content

Commit abd6a50

Browse files
authored
obs(redis): expose Lua VM pool counters to Prometheus + Grafana panels (#787)
## Summary Wire the bounded Lua VM pool (PR #785) counters into Prometheus and add a Grafana row. **New metrics** (Counter / Gauge funcs — read at scrape time, zero per-EVAL cost): - `elastickv_lua_pool_hits_total` — counter - `elastickv_lua_pool_misses_total` — counter - `elastickv_lua_pool_drops_total` — counter ← the **"raise --redisLuaMaxIdleStates"** alarm - `elastickv_lua_pool_idle` — gauge - `elastickv_lua_pool_max_idle` — gauge **New Grafana row** "Lua VM Pool" in `elastickv-redis-summary.json`: - Idle vs MaxIdle (timeseries, max_idle dashed in red) - Hits / Misses / Drops rate (timeseries, drops in red) - Drops over Range (stat per node, green/orange/red) - Pool Saturation idle/max_idle (bargauge, 0–100%) ## Background PR #785 (merged) added the bounded pool with internal `atomic.Uint64` counters, but they were not exposed. With this PR operators can watch `elastickv_lua_pool_drops_total` and `elastickv_lua_pool_idle / elastickv_lua_pool_max_idle` to decide whether `--redisLuaMaxIdleStates` is right-sized. ## Caller audit (LuaPoolSource interface) - New interface in monitoring/; only implemented by `adapter.luaStatePool`. Verified at the call site by the type system; monitoring tests use a fake stub. - All accessors are atomic loads — pure, can't panic / block / mutate. Safe from the scrape goroutine. ## Self-review (5 lenses) 1. **Data loss** — N/A (read-only observability). 2. **Concurrency** — atomic loads; -race tests pass. 3. **Performance** — zero per-EVAL cost. Scrape (every ~15s) does 5x atomic.Load. 4. **Data consistency** — N/A. 5. **Test coverage** — 3 new tests: value exposure, live-update contract, nil-safety. 
## Test plan - [x] `go test -race -count=1 ./adapter ./monitoring` — all green - [x] `golangci-lint run` — 0 issues - [x] JSON valid for dashboard - [ ] Deploy to production cluster (192.168.0.210-214), verify `/metrics` shows the new counters - [ ] Grafana dashboard re-import, verify the new "Lua VM Pool" row renders <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Prometheus metrics added for Lua VM pool: idle capacity, max idle, hits, misses, drops. * New Grafana dashboard row with four panels visualizing Lua VM pool health, saturation, rates, and drops per node. * Metrics are registered during service startup and will log a warning if unavailable. * **Tests** * Added tests validating metric exposure, live updates, nil-safety, and idempotent registration. <!-- review_stack_entry_start --> [![Review Change Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/bootjp/elastickv/pull/787?utm_source=github_walkthrough&utm_medium=github&utm_campaign=change_stack) <!-- review_stack_entry_end --> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
2 parents 7a05466 + 2f6012d commit abd6a50

6 files changed

Lines changed: 622 additions & 0 deletions

File tree

adapter/redis.go

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
pb "github.com/bootjp/elastickv/proto"
2525
"github.com/bootjp/elastickv/store"
2626
"github.com/cockroachdb/errors"
27+
"github.com/prometheus/client_golang/prometheus"
2728
"github.com/redis/go-redis/v9"
2829
"github.com/tidwall/redcon"
2930
)
@@ -556,6 +557,31 @@ func (r *RedisServer) Close() error {
556557
return nil
557558
}
558559

560+
// RegisterLuaPoolMetrics wires this server's bounded Lua VM pool
561+
// into the supplied Prometheus registerer, exposing five metrics
562+
// (hits / misses / drops / idle / max_idle). See
563+
// monitoring.RegisterLuaPool for the per-metric definitions.
564+
//
565+
// Returns nil if r, the pool, or registerer is nil — callers can
566+
// invoke this unconditionally from main.go without guarding for
567+
// test fixtures. The registration uses prometheus.NewCounterFunc /
568+
// NewGaugeFunc, so the values are read from the pool's atomic
569+
// counters at scrape time; no observability load is added to the
570+
// EVAL hot path.
571+
func (r *RedisServer) RegisterLuaPoolMetrics(registerer prometheus.Registerer) error {
572+
if r == nil || registerer == nil {
573+
return nil
574+
}
575+
pool := r.getLuaPool()
576+
if pool == nil {
577+
return nil
578+
}
579+
if err := monitoring.RegisterLuaPool(registerer, pool); err != nil {
580+
return errors.Wrap(err, "register lua pool metrics")
581+
}
582+
return nil
583+
}
584+
559585
func (r *RedisServer) Run() error {
560586
err := redcon.Serve(r.listen,
561587
func(conn redcon.Conn, cmd redcon.Command) {

main.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1533,6 +1533,16 @@ func startRedisServer(ctx context.Context, lc *net.ListenConfig, eg *errgroup.Gr
15331533
adapter.WithRedisCompactor(deltaCompactor),
15341534
adapter.WithLuaPoolMaxIdle(*redisLuaMaxIdleStates),
15351535
)
1536+
// Wire the bounded Lua VM pool into Prometheus. The metrics
1537+
// (hits/misses/drops/idle/max_idle) are read at scrape time via
1538+
// CounterFunc / GaugeFunc, so the EVAL hot path stays
1539+
// observability-free. A registration error degrades observability
1540+
// only — keep running and surface via slog so the operator can
1541+
// notice on the next dashboard load rather than seeing a crash
1542+
// loop here.
1543+
if err := redisServer.RegisterLuaPoolMetrics(metricsRegistry.Registerer()); err != nil {
1544+
slog.Warn("failed to register lua pool metrics; pool counters will be invisible in Prometheus", "err", err)
1545+
}
15361546
eg.Go(func() error {
15371547
defer redisServer.Stop()
15381548
stop := make(chan struct{})

monitoring/grafana/dashboards/elastickv-redis-summary.json

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1737,6 +1737,314 @@
17371737
],
17381738
"title": "Hot Path (legacy PR #560)",
17391739
"type": "row"
1740+
},
1741+
{
1742+
"collapsed": true,
1743+
"gridPos": {
1744+
"h": 1,
1745+
"w": 24,
1746+
"x": 0,
1747+
"y": 71
1748+
},
1749+
"id": 33,
1750+
"panels": [
1751+
{
1752+
"datasource": "$datasource",
1753+
"description": "Idle *lua.LState count retained by each node's bounded Lua VM pool (elastickv_lua_pool_idle) plotted against the configured cap (elastickv_lua_pool_max_idle). Idle staying pinned at MaxIdle for sustained periods together with non-zero Drops (see the Hits / Misses / Drops Rate and Drops over Range panels) = pool is undersized; raise --redisLuaMaxIdleStates. Idle hovering near zero with no Drops = pool is right-sized or workload is below capacity.",
1754+
"fieldConfig": {
1755+
"defaults": {
1756+
"color": {
1757+
"mode": "palette-classic"
1758+
},
1759+
"custom": {
1760+
"axisPlacement": "auto",
1761+
"lineInterpolation": "stepAfter",
1762+
"lineWidth": 1,
1763+
"showPoints": "auto"
1764+
},
1765+
"unit": "short"
1766+
},
1767+
"overrides": [
1768+
{
1769+
"matcher": {
1770+
"id": "byRegexp",
1771+
"options": "max_idle.*"
1772+
},
1773+
"properties": [
1774+
{
1775+
"id": "custom.lineStyle",
1776+
"value": {
1777+
"dash": [
1778+
10,
1779+
10
1780+
],
1781+
"fill": "dash"
1782+
}
1783+
},
1784+
{
1785+
"id": "color",
1786+
"value": {
1787+
"fixedColor": "red",
1788+
"mode": "fixed"
1789+
}
1790+
}
1791+
]
1792+
}
1793+
]
1794+
},
1795+
"gridPos": {
1796+
"h": 8,
1797+
"w": 12,
1798+
"x": 0,
1799+
"y": 0
1800+
},
1801+
"id": 240,
1802+
"options": {
1803+
"legend": {
1804+
"displayMode": "list",
1805+
"placement": "bottom",
1806+
"showLegend": true
1807+
},
1808+
"tooltip": {
1809+
"mode": "multi",
1810+
"sort": "desc"
1811+
}
1812+
},
1813+
"targets": [
1814+
{
1815+
"datasource": "$datasource",
1816+
"editorMode": "code",
1817+
"expr": "elastickv_lua_pool_idle{job=\"elastickv\",node_id=~\"$node_id\"}",
1818+
"legendFormat": "idle {{node_id}}",
1819+
"range": true,
1820+
"refId": "A"
1821+
},
1822+
{
1823+
"datasource": "$datasource",
1824+
"editorMode": "code",
1825+
"expr": "elastickv_lua_pool_max_idle{job=\"elastickv\",node_id=~\"$node_id\"}",
1826+
"legendFormat": "max_idle {{node_id}}",
1827+
"range": true,
1828+
"refId": "B"
1829+
}
1830+
],
1831+
"title": "Lua VM Pool — Idle vs MaxIdle",
1832+
"type": "timeseries"
1833+
},
1834+
{
1835+
"datasource": "$datasource",
1836+
"description": "Rate of Lua VM pool outcomes: hits (get served from pool), misses (get had to allocate), drops (put rejected because pool full). Steady-state hits >> misses means the pool is doing its job; a sustained drops rate means --redisLuaMaxIdleStates is too low.",
1837+
"fieldConfig": {
1838+
"defaults": {
1839+
"color": {
1840+
"mode": "palette-classic"
1841+
},
1842+
"custom": {
1843+
"axisPlacement": "auto",
1844+
"lineInterpolation": "linear",
1845+
"lineWidth": 1,
1846+
"showPoints": "auto"
1847+
},
1848+
"unit": "ops"
1849+
},
1850+
"overrides": [
1851+
{
1852+
"matcher": {
1853+
"id": "byRegexp",
1854+
"options": "drops.*"
1855+
},
1856+
"properties": [
1857+
{
1858+
"id": "color",
1859+
"value": {
1860+
"fixedColor": "red",
1861+
"mode": "fixed"
1862+
}
1863+
}
1864+
]
1865+
}
1866+
]
1867+
},
1868+
"gridPos": {
1869+
"h": 8,
1870+
"w": 12,
1871+
"x": 12,
1872+
"y": 0
1873+
},
1874+
"id": 241,
1875+
"options": {
1876+
"legend": {
1877+
"displayMode": "list",
1878+
"placement": "bottom",
1879+
"showLegend": true
1880+
},
1881+
"tooltip": {
1882+
"mode": "multi",
1883+
"sort": "desc"
1884+
}
1885+
},
1886+
"targets": [
1887+
{
1888+
"datasource": "$datasource",
1889+
"editorMode": "code",
1890+
"expr": "sum by (node_id) (rate(elastickv_lua_pool_hits_total{job=\"elastickv\",node_id=~\"$node_id\"}[5m]))",
1891+
"legendFormat": "hits {{node_id}}",
1892+
"range": true,
1893+
"refId": "A"
1894+
},
1895+
{
1896+
"datasource": "$datasource",
1897+
"editorMode": "code",
1898+
"expr": "sum by (node_id) (rate(elastickv_lua_pool_misses_total{job=\"elastickv\",node_id=~\"$node_id\"}[5m]))",
1899+
"legendFormat": "misses {{node_id}}",
1900+
"range": true,
1901+
"refId": "B"
1902+
},
1903+
{
1904+
"datasource": "$datasource",
1905+
"editorMode": "code",
1906+
"expr": "sum by (node_id) (rate(elastickv_lua_pool_drops_total{job=\"elastickv\",node_id=~\"$node_id\"}[5m]))",
1907+
"legendFormat": "drops {{node_id}}",
1908+
"range": true,
1909+
"refId": "C"
1910+
}
1911+
],
1912+
"title": "Lua VM Pool — Hits / Misses / Drops Rate",
1913+
"type": "timeseries"
1914+
},
1915+
{
1916+
"datasource": "$datasource",
1917+
"description": "Per-node Lua VM pool drops over the dashboard time range. A non-zero value means that node's pool overflowed — raise --redisLuaMaxIdleStates on the affected nodes.",
1918+
"fieldConfig": {
1919+
"defaults": {
1920+
"color": {
1921+
"mode": "thresholds"
1922+
},
1923+
"thresholds": {
1924+
"mode": "absolute",
1925+
"steps": [
1926+
{
1927+
"color": "green",
1928+
"value": 0
1929+
},
1930+
{
1931+
"color": "orange",
1932+
"value": 1
1933+
},
1934+
{
1935+
"color": "red",
1936+
"value": 100
1937+
}
1938+
]
1939+
},
1940+
"unit": "short"
1941+
},
1942+
"overrides": []
1943+
},
1944+
"gridPos": {
1945+
"h": 6,
1946+
"w": 8,
1947+
"x": 0,
1948+
"y": 8
1949+
},
1950+
"id": 242,
1951+
"options": {
1952+
"colorMode": "value",
1953+
"graphMode": "area",
1954+
"justifyMode": "auto",
1955+
"orientation": "auto",
1956+
"reduceOptions": {
1957+
"calcs": [
1958+
"lastNotNull"
1959+
],
1960+
"fields": "",
1961+
"values": false
1962+
},
1963+
"textMode": "auto"
1964+
},
1965+
"pluginVersion": "11.0.0",
1966+
"targets": [
1967+
{
1968+
"datasource": "$datasource",
1969+
"editorMode": "code",
1970+
"expr": "sum by (node_id) (increase(elastickv_lua_pool_drops_total{job=\"elastickv\",node_id=~\"$node_id\"}[$__range]))",
1971+
"legendFormat": "{{node_id}}",
1972+
"range": true,
1973+
"refId": "A"
1974+
}
1975+
],
1976+
"title": "Drops over Range (per Node)",
1977+
"type": "stat"
1978+
},
1979+
{
1980+
"datasource": "$datasource",
1981+
"description": "Lua VM pool saturation = idle / max_idle, per node. 100% means the pool is fully populated (every put() is at risk of dropping). Sustained 100% + non-zero Drops rate = raise --redisLuaMaxIdleStates.",
1982+
"fieldConfig": {
1983+
"defaults": {
1984+
"color": {
1985+
"mode": "thresholds"
1986+
},
1987+
"max": 1,
1988+
"min": 0,
1989+
"thresholds": {
1990+
"mode": "absolute",
1991+
"steps": [
1992+
{
1993+
"color": "green",
1994+
"value": 0
1995+
},
1996+
{
1997+
"color": "orange",
1998+
"value": 0.8
1999+
},
2000+
{
2001+
"color": "red",
2002+
"value": 1
2003+
}
2004+
]
2005+
},
2006+
"unit": "percentunit"
2007+
},
2008+
"overrides": []
2009+
},
2010+
"gridPos": {
2011+
"h": 6,
2012+
"w": 16,
2013+
"x": 8,
2014+
"y": 8
2015+
},
2016+
"id": 243,
2017+
"options": {
2018+
"displayMode": "gradient",
2019+
"minVizHeight": 10,
2020+
"minVizWidth": 0,
2021+
"orientation": "horizontal",
2022+
"reduceOptions": {
2023+
"calcs": [
2024+
"lastNotNull"
2025+
],
2026+
"fields": "",
2027+
"values": false
2028+
},
2029+
"showUnfilled": true
2030+
},
2031+
"pluginVersion": "11.0.0",
2032+
"targets": [
2033+
{
2034+
"datasource": "$datasource",
2035+
"editorMode": "code",
2036+
"expr": "elastickv_lua_pool_idle{job=\"elastickv\",node_id=~\"$node_id\"} / elastickv_lua_pool_max_idle{job=\"elastickv\",node_id=~\"$node_id\"}",
2037+
"legendFormat": "{{node_id}}",
2038+
"range": true,
2039+
"refId": "A"
2040+
}
2041+
],
2042+
"title": "Pool Saturation (idle / max_idle)",
2043+
"type": "bargauge"
2044+
}
2045+
],
2046+
"title": "Lua VM Pool",
2047+
"type": "row"
17402048
}
17412049
],
17422050
"refresh": "10s",

0 commit comments

Comments
 (0)