Skip to content

Commit 9270dad

Browse files
Mathos1432 and Copilot authored
Add INFO COMMANDSTATS tracking with per-command success rates (#1702)
* Add INFO COMMANDSTATS tracking with per-command success rates

  Implements Redis-compatible INFO COMMANDSTATS support, gated behind --commandstats-monitor config flag. Tracks per-command calls, failed_calls, and rejected_calls using lightweight counter increments (no Stopwatch).

  Key design decisions:
  - Array-indexed by RespCommand enum for O(1) access
  - Per-session counters (single-writer, no locking on hot path)
  - On-demand aggregation from active sessions + disposed session history
  - No latency tracking (usec=0) to avoid Stopwatch overhead (~4.4x perf hit)
  - Monitor dispose handles case where Start() was never called (EmbeddedServer)

  Benchmark results (100 PINGs, .NET 10):
  Disabled: 1.687 us | Enabled: 1.820 us | Overhead: ~7.8%
  SET/GET overhead: 0-4% (within noise for real workloads)

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Fix double-counting bug in PopulateCommandStatsInfo and remove stale latency references

  When MetricsSamplingFrequency > 0, globalCommandStats already includes history from disposed sessions. PopulateCommandStatsInfo was adding historyCommandStats on top, double-counting disposed session stats.

  Also removed stale 'latency' references from doc comments and help text since per-command latency tracking was removed to avoid Stopwatch overhead.

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Address PR review: simplify error detection, use canonical command names, fix tests

  - Remove bufferFlushedDuringCommand field and pointer-based error detection; rely solely on commandErrorWritten flag set by WriteError/AbortWithErrorMessage
  - Use RespCommandsInfo.GetRespCommandName() for canonical Redis command names instead of enum ToString with string replacement
  - Switch most tests to metricsSamplingFreq: 0 (on-demand aggregation) to remove Thread.Sleep delays; keep one test with periodic sampling for coverage
  - Fix CommandStatsFailedCallsTest to use SETRANGE with invalid offset (goes through AbortWithErrorMessage) instead of WRONGTYPE which bypasses the flag

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Include LatencyMonitor in monitor creation condition

  GarnetLatencyMetricsSession dereferences monitor.monitor_iterations, but StoreWrapper only created the monitor when MetricsSamplingFrequency > 0 or CommandStatsMonitor was enabled. With --latency-monitor alone and no sampling frequency, this would cause a NullReferenceException at session creation. Add LatencyMonitor to the condition so the monitor is always created when latency tracking is enabled.

  Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

* Update RespInfo test

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 063e8a1 commit 9270dad

17 files changed

Lines changed: 602 additions & 10 deletions

libs/common/Metrics/InfoMetricsType.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,10 @@ public enum InfoMetricsType : byte
8686
/// Scan and return distribution of in-memory portion of hybrid logs for main store and object store
8787
/// </summary>
8888
HLOGSCAN,
89+
/// <summary>
90+
/// Per-command usage statistics (calls, failures, rejections)
91+
/// </summary>
92+
COMMANDSTATS,
8993
}
9094

9195
/// <summary>

libs/host/Configuration/Options.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ internal sealed class Options : ICloneable
351351
[Option("latency-monitor", Required = false, HelpText = "Track latency of various events.")]
352352
public bool? LatencyMonitor { get; set; }
353353

354+
[OptionValidation]
355+
[Option("commandstats-monitor", Required = false, HelpText = "Track per-command usage statistics (calls, failures, rejections). Exposed via INFO COMMANDSTATS.")]
356+
public bool? CommandStatsMonitor { get; set; }
357+
354358
[IntRangeValidation(0, int.MaxValue)]
355359
[Option("slowlog-log-slower-than", Required = false, HelpText = "Threshold (microseconds) for logging command in the slow log. 0 to disable.")]
356360
public int SlowLogThreshold { get; set; }
@@ -916,6 +920,7 @@ endpoint is IPEndPoint listenEp && clusterAnnounceEndpoint[0] is IPEndPoint anno
916920
ServerCertificateRequired.GetValueOrDefault(),
917921
logger: logger) : null,
918922
LatencyMonitor = LatencyMonitor.GetValueOrDefault(),
923+
CommandStatsMonitor = CommandStatsMonitor.GetValueOrDefault(),
919924
SlowLogThreshold = SlowLogThreshold,
920925
SlowLogMaxEntries = SlowLogMaxEntries,
921926
MetricsSamplingFrequency = MetricsSamplingFrequency,

libs/host/Configuration/Redis/RedisOptions.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ internal class RedisOptions
8383
[RedisOption("latency-tracking", nameof(Options.LatencyMonitor))]
8484
public Option<RedisBoolean> LatencyTracking { get; set; }
8585

86+
[RedisOption("commandstats-tracking", nameof(Options.CommandStatsMonitor))]
87+
public Option<RedisBoolean> CommandStatsTracking { get; set; }
88+
8689
[RedisOption("loglevel", nameof(Options.LogLevel))]
8790
public Option<RedisLogLevel> LogLevel { get; set; }
8891

libs/host/defaults.conf

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,9 @@
254254
/* Track latency of various events. */
255255
"LatencyMonitor" : false,
256256

257+
/* Track per-command usage statistics (calls, failures, rejections). Exposed via INFO COMMANDSTATS. */
258+
"CommandStatsMonitor" : false,
259+
257260
/* Threshold (microseconds) for logging command in the slow log. 0 to disable. */
258261
"SlowLogThreshold": 0,
259262

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
using System;
5+
using System.Runtime.CompilerServices;
6+
7+
namespace Garnet.server
8+
{
9+
/// <summary>
/// Counters kept for a single command: calls, failed calls, and rejected calls.
/// Mirrors the fields of the Redis COMMANDSTATS convention.
/// </summary>
public struct CommandStatsEntry
{
    /// <summary>
    /// Number of times the command was called.
    /// </summary>
    public ulong Calls;

    /// <summary>
    /// Number of times the command failed, i.e. returned an error response.
    /// </summary>
    public ulong FailedCalls;

    /// <summary>
    /// Number of times the command was rejected before it executed (e.g., ACL denied, OOM).
    /// </summary>
    public ulong RejectedCalls;
}
30+
31+
/// <summary>
/// Per-command usage counters for built-in commands.
/// Counters live in a flat array addressed by the numeric RespCommand value, giving O(1) access.
/// An instance is meant to be owned by a single session (single writer); no locking is done here.
/// </summary>
public class CommandStats
{
    /// <summary>
    /// Length of the counter array: one slot for every value up to and including
    /// RespCommandExtensions.LastValidCommand.
    /// </summary>
    internal static readonly int EntryCount = (int)RespCommandExtensions.LastValidCommand + 1;

    /// <summary>
    /// Counter slots, addressed by (int)RespCommand.
    /// </summary>
    internal CommandStatsEntry[] entries;

    /// <summary>
    /// Allocates a fresh set of counters, all starting at zero.
    /// </summary>
    public CommandStats()
    {
        entries = new CommandStatsEntry[EntryCount];
    }

    /// <summary>
    /// Bump the calls counter of <paramref name="cmd"/>.
    /// Commands whose numeric value falls outside the array are ignored.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void IncrementCalls(RespCommand cmd)
    {
        var slot = (ushort)cmd;
        if (slot >= entries.Length)
        {
            return;
        }

        entries[slot].Calls++;
    }

    /// <summary>
    /// Bump the failed-calls counter of <paramref name="cmd"/>.
    /// Commands whose numeric value falls outside the array are ignored.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void IncrementFailed(RespCommand cmd)
    {
        var slot = (ushort)cmd;
        if (slot >= entries.Length)
        {
            return;
        }

        entries[slot].FailedCalls++;
    }

    /// <summary>
    /// Bump the rejected-calls counter of <paramref name="cmd"/>.
    /// Commands whose numeric value falls outside the array are ignored.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void IncrementRejected(RespCommand cmd)
    {
        var slot = (ushort)cmd;
        if (slot >= entries.Length)
        {
            return;
        }

        entries[slot].RejectedCalls++;
    }

    /// <summary>
    /// Return a copy of the counters recorded for <paramref name="cmd"/>,
    /// or a zeroed entry when the command's numeric value falls outside the array.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public CommandStatsEntry GetEntry(RespCommand cmd)
    {
        var slot = (ushort)cmd;
        return slot < entries.Length ? entries[slot] : default;
    }

    /// <summary>
    /// Fold the counters of <paramref name="other"/> into this instance (used for aggregation).
    /// A null instance, or one without an entries array, contributes nothing.
    /// </summary>
    internal void Add(CommandStats other)
    {
        var source = other?.entries;
        if (source is null)
        {
            return;
        }

        // Only the slots present in both arrays are summed.
        var shared = Math.Min(entries.Length, source.Length);
        for (var i = 0; i < shared; i++)
        {
            ref var target = ref entries[i];
            target.Calls += source[i].Calls;
            target.FailedCalls += source[i].FailedCalls;
            target.RejectedCalls += source[i].RejectedCalls;
        }
    }

    /// <summary>
    /// Zero every counter in the array.
    /// </summary>
    internal void Reset()
    {
        Array.Clear(entries);
    }
}
127+
}

libs/server/Metrics/GarnetServerMetrics.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,17 @@ internal struct GarnetServerMetrics
3535
/// </summary>
3636
public readonly GarnetLatencyMetrics globalLatencyMetrics;
3737

38-
public GarnetServerMetrics(bool trackStats, bool trackLatency, GarnetServerMonitor monitor)
38+
/// <summary>
39+
/// Global per-command usage statistics (calls, failures, rejections).
40+
/// </summary>
41+
public CommandStats globalCommandStats;
42+
43+
/// <summary>
44+
/// History of per-command usage statistics from disposed sessions.
45+
/// </summary>
46+
public CommandStats historyCommandStats;
47+
48+
public GarnetServerMetrics(bool trackStats, bool trackLatency, bool trackCommandStats, GarnetServerMonitor monitor)
3949
{
4050
total_connections_received = 0;
4151
total_connections_disposed = 0;
@@ -49,6 +59,9 @@ public GarnetServerMetrics(bool trackStats, bool trackLatency, GarnetServerMonit
4959
historySessionMetrics = trackStats ? new GarnetSessionMetrics() : null;
5060

5161
globalLatencyMetrics = trackLatency ? new() : null;
62+
63+
globalCommandStats = trackCommandStats ? new CommandStats() : null;
64+
historyCommandStats = trackCommandStats ? new CommandStats() : null;
5265
}
5366

5467
public void Dispose()

libs/server/Metrics/GarnetServerMonitor.cs

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ internal enum EventType : byte
2020
internal sealed class GarnetServerMonitor
2121
{
2222
public readonly Dictionary<InfoMetricsType, bool>
23-
resetEventFlags = GarnetInfoMetrics.DefaultInfo.ToDictionary(x => x, y => false);
23+
resetEventFlags = Enum.GetValues<InfoMetricsType>().ToDictionary(x => x, y => false);
2424

2525
public readonly Dictionary<LatencyMetricsType, bool>
2626
resetLatencyMetrics = GarnetLatencyMetrics.defaultLatencyTypes.ToDictionary(x => x, y => false);
@@ -33,17 +33,20 @@ public readonly Dictionary<LatencyMetricsType, bool>
3333

3434
GarnetServerMetrics globalMetrics;
3535
readonly GarnetSessionMetrics accSessionMetrics;
36+
readonly CommandStats accCommandStats;
3637
private ulong instant_input_net_bytes;
3738
private ulong instant_output_net_bytes;
3839
private ulong instant_commands_processed;
3940

4041
readonly CancellationTokenSource cts = new();
41-
readonly ManualResetEvent done = new(false);
42+
readonly ManualResetEvent done = new(true);
4243

4344
readonly ILogger logger;
4445

4546
public GarnetServerMetrics GlobalMetrics => globalMetrics;
4647

48+
internal IGarnetServer[] Servers => servers;
49+
4750
SingleWriterMultiReaderLock rwLock = new();
4851

4952
public GarnetServerMonitor(StoreWrapper storeWrapper, GarnetServerOptions opts, IGarnetServer[] servers, ILogger logger = null)
@@ -58,9 +61,10 @@ public GarnetServerMonitor(StoreWrapper storeWrapper, GarnetServerOptions opts,
5861
instant_input_net_bytes = 0;
5962
instant_output_net_bytes = 0;
6063
instant_commands_processed = 0;
61-
globalMetrics = new(true, opts.LatencyMonitor, this);
64+
globalMetrics = new(true, opts.LatencyMonitor, opts.CommandStatsMonitor, this);
6265

6366
accSessionMetrics = new GarnetSessionMetrics();
67+
accCommandStats = opts.CommandStatsMonitor ? new CommandStats() : null;
6468
}
6569

6670
public void Dispose()
@@ -74,16 +78,23 @@ public void Dispose()
7478

7579
public void Start()
7680
{
77-
Task.Run(() => MainMonitorTask(cts.Token));
81+
// Only run the periodic sampling task if a sampling frequency is configured.
82+
// The monitor may be created solely for command stats history (without periodic sampling).
83+
if (monitorSamplingFrequency > TimeSpan.Zero)
84+
{
85+
done.Reset();
86+
Task.Run(() => MainMonitorTask(cts.Token));
87+
}
7888
}
7989

80-
public void AddMetricsHistorySessionDispose(GarnetSessionMetrics currSessionMetrics, GarnetLatencyMetricsSession currLatencyMetrics)
90+
public void AddMetricsHistorySessionDispose(GarnetSessionMetrics currSessionMetrics, GarnetLatencyMetricsSession currLatencyMetrics, CommandStats currCommandStats = null)
8191
{
8292
rwLock.WriteLock();
8393
try
8494
{
8595
if (currSessionMetrics != null) globalMetrics.historySessionMetrics.Add(currSessionMetrics);
8696
if (currLatencyMetrics != null) globalMetrics.globalLatencyMetrics.Merge(currLatencyMetrics);
97+
if (currCommandStats != null) globalMetrics.historyCommandStats?.Add(currCommandStats);
8798
currLatencyMetrics?.Return();
8899
}
89100
finally { rwLock.WriteUnlock(); }
@@ -133,6 +144,12 @@ private void AddCurrentServerStats(IGarnetServer server)
133144
// Accumulate session metrics
134145
accSessionMetrics.Add(session.GetSessionMetrics);
135146

147+
// Accumulate command stats if enabled
148+
if (accCommandStats != null)
149+
{
150+
accCommandStats.Add(session.GetCommandStats);
151+
}
152+
136153
// Accumulate latency metrics if latency monitor is enabled
137154
if (opts.LatencyMonitor)
138155
{
@@ -154,6 +171,12 @@ private void AddCurrentServerStats(IGarnetServer server)
154171
// Add accumulated session metrics for this iteration
155172
globalMetrics.globalSessionMetrics.Add(accSessionMetrics);
156173

174+
// Reset global command stats and add accumulated for this iteration
175+
if (accCommandStats != null)
176+
{
177+
globalMetrics.globalCommandStats.Reset();
178+
globalMetrics.globalCommandStats.Add(accCommandStats);
179+
}
157180
}
158181

159182
private void CleanupGlobalStats()
@@ -189,6 +212,24 @@ private void CleanupGlobalStats()
189212

190213
resetEventFlags[InfoMetricsType.STATS] = false;
191214
}
215+
216+
if (resetEventFlags.TryGetValue(InfoMetricsType.COMMANDSTATS, out bool resetCommandStats) && resetCommandStats)
217+
{
218+
logger?.LogInformation("Resetting command stats");
219+
globalMetrics.globalCommandStats?.Reset();
220+
globalMetrics.historyCommandStats?.Reset();
221+
222+
foreach (var garnetServer in servers.Cast<GarnetServerBase>())
223+
{
224+
var sessions = garnetServer.ActiveConsumers();
225+
foreach (var s in sessions)
226+
{
227+
((RespServerSession)s).GetCommandStats?.Reset();
228+
}
229+
}
230+
231+
resetEventFlags[InfoMetricsType.COMMANDSTATS] = false;
232+
}
192233
}
193234

194235
private void CleanupGlobalLatencyMetrics()
@@ -283,6 +324,13 @@ void ResetAndAddGlobalHistory()
283324
accSessionMetrics.Reset();
284325
// Add session metrics history in accumulator
285326
accSessionMetrics.Add(globalMetrics.historySessionMetrics);
327+
328+
// Reset command stats accumulator and add history
329+
if (accCommandStats != null)
330+
{
331+
accCommandStats.Reset();
332+
accCommandStats.Add(globalMetrics.historyCommandStats);
333+
}
286334
}
287335

288336
void ResetLatencySessionMetrics()

0 commit comments

Comments
 (0)