Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
5901c4f
Implement graceful shutdown for Garnet server
yuseok-kim-edushare Nov 26, 2025
9ea9339
Update hosting/Windows/Garnet.worker/Program.cs
yuseok-kim-edushare Nov 26, 2025
7731df6
Update libs/host/GarnetServer.cs
yuseok-kim-edushare Nov 26, 2025
597ec57
Update main/GarnetServer/Program.cs
yuseok-kim-edushare Nov 26, 2025
9e168f4
Update main/GarnetServer/Program.cs
yuseok-kim-edushare Nov 26, 2025
67a7f2e
Update main/GarnetServer/Program.cs
yuseok-kim-edushare Nov 26, 2025
5b25407
Update libs/host/GarnetServer.cs
yuseok-kim-edushare Nov 26, 2025
476d629
πŸ› Resolve Race Condition risk in "StopListening" impl at GarnetServer…
yuseok-kim-edushare Nov 26, 2025
9bf52de
✅ add test for gracefulshutdown about main/garnetserver
yuseok-kim-edushare Nov 26, 2025
11d115b
πŸ› Fix risk of shutdown handler remaining
yuseok-kim-edushare Nov 26, 2025
3b3df07
✏️ fix by dotnet format
yuseok-kim-edushare Nov 26, 2025
2489995
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Dec 3, 2025
9d5017a
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Dec 6, 2025
5eb025c
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Dec 13, 2025
ff37e9b
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Dec 22, 2025
c542789
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 7, 2026
efd4c75
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 14, 2026
0b113a7
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 19, 2026
19b1a13
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 25, 2026
d77bd6b
✅🔀 Fix Test with Allure related Requirements (reflect #1457)
yuseok-kim-edushare Jan 25, 2026
f14fb2c
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 25, 2026
c8da690
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 27, 2026
2e9a53b
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Jan 29, 2026
30fc80d
Merge branch 'main' into feat/graceful_shutdown
yuseok-kim-edushare Feb 8, 2026
3177001
βͺ Revert : Windows Service shutdown timeout custom
yuseok-kim-edushare Feb 8, 2026
920c328
Use long for active connection count
yuseok-kim-edushare Feb 8, 2026
8ea0c24
minor perf fix in connection counting
yuseok-kim-edushare Feb 8, 2026
bf05d23
Use linked CancellationTokenSource for shutdown wait
yuseok-kim-edushare Feb 8, 2026
01a2547
update log levels infomation -> debug for perfomance inhancing
yuseok-kim-edushare Feb 8, 2026
65b1391
minor bug risk of cancelled canclationToken provide into finally
yuseok-kim-edushare Feb 8, 2026
5145094
🧪 temporal test code save (test for gracefull shutown in single insta…
yuseok-kim-edushare Feb 8, 2026
cf9c997
Fix Program.CS main method refine
yuseok-kim-edushare Feb 8, 2026
514456c
Update libs/host/GarnetServer.cs
yuseok-kim-edushare Feb 8, 2026
db633e3
Update test/Garnet.test/GarnetServerTcpTests.cs
yuseok-kim-edushare Feb 8, 2026
de4e462
Update main/GarnetServer/Program.cs
yuseok-kim-edushare Feb 8, 2026
74f1c0a
Apply suggestion from @Copilot
yuseok-kim-edushare Feb 8, 2026
7bb2ab2
Fix about copilot's concern about race conditions
yuseok-kim-edushare Feb 8, 2026
7cb1e9c
fix issue claimed by copilot
yuseok-kim-edushare Feb 8, 2026
2530260
🔥 remove duplicated test
yuseok-kim-edushare Feb 8, 2026
784f630
Fix Test code running flow
yuseok-kim-edushare Feb 8, 2026
bc6580d
Fix test codes to reflect copilot's suggestions
yuseok-kim-edushare Feb 8, 2026
e9b0a3e
✨Add noSave Arg to Server.ShutdownAsync()
yuseok-kim-edushare Feb 16, 2026
aaaf45b
Add shutdown data consistency tests
yuseok-kim-edushare Feb 16, 2026
714c2f6
Rearrange AOF commit and checkpoint during Data saving
yuseok-kim-edushare Feb 16, 2026
08e48a2
Save data only once during single shutdown process
yuseok-kim-edushare Feb 16, 2026
c6fcb19
Remove isListening Flag
yuseok-kim-edushare Feb 16, 2026
4ccad8e
✏️ Fix format
yuseok-kim-edushare Feb 16, 2026
d37ad7c
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Feb 16, 2026
60faf96
✏️ Fix comment about infomational Test
yuseok-kim-edushare Feb 16, 2026
e0616b7
Merge branch 'yuseok-kim/graceful_shutdown' of https://github.com/yus…
yuseok-kim-edushare Feb 16, 2026
36a85b9
Skip graceful save on forced shutdown
yuseok-kim-edushare Feb 16, 2026
d29c716
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Feb 18, 2026
b7d4d25
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Feb 20, 2026
42d7bd2
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Feb 25, 2026
e6be406
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Feb 25, 2026
1a2fff7
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 2, 2026
fccbbd0
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 3, 2026
6c883a2
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 4, 2026
6d97896
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 5, 2026
9d98495
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 5, 2026
b682fff
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 15, 2026
34ebabd
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Mar 21, 2026
606c542
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 2, 2026
dac8423
♻️ Minor fix to align recent PR's edit about socket managing
yuseok-kim-edushare Apr 2, 2026
422166f
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 5, 2026
2891611
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 8, 2026
27a87d5
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 10, 2026
7466bef
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 12, 2026
011a5c5
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 15, 2026
bf4c49a
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 21, 2026
5562f3c
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 24, 2026
a2a0e69
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 27, 2026
3a87f5b
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 28, 2026
d5e1dbf
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare Apr 30, 2026
48714c5
🚨 Fix main/GarnetServer/Program.cs
yuseok-kim-edushare May 2, 2026
658a594
Merge branch 'main' into yuseok-kim/graceful_shutdown
yuseok-kim-edushare May 2, 2026
07aecc8
🚨 Fix libs/host/GarnetServer.cs & libs/server/Servers/StoreApi.cs
yuseok-kim-edushare May 2, 2026
f2b89c3
Fix: shutdownDataConsistencyTests.cs
yuseok-kim-edushare May 2, 2026
c6fd43b
Update ShutdownDataConsistencyTests.cs
yuseok-kim-edushare May 2, 2026
5588486
Potential fix for pull request finding
yuseok-kim-edushare May 2, 2026
871f268
Add quiesce support for graceful shutdown
yuseok-kim-edushare May 3, 2026
446f1af
Add configurable shutdown-timeout and host buffer
yuseok-kim-edushare May 3, 2026
efe361f
Fix Options and defaults.conf to fix CI failure
yuseok-kim-edushare May 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion hosting/Windows/Garnet.worker/Program.cs
Original file line number Diff line number Diff line change
@@ -1,16 +1,35 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

using System;
using Garnet;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;

class Program
{
// Data finalization (AOF commit / checkpoint) uses up to 15 seconds internally (see GarnetServer.FinalizeDataAsync).
// Add this buffer on top of the connection-drain timeout so the host shutdown budget covers the full shutdown sequence.
private const int DataFinalizationBufferSeconds = 20;

static void Main(string[] args)
{
// Pre-parse only the shutdown-timeout argument so we can configure both
// the host shutdown budget and the Worker's connection-drain timeout from a single value.
var shutdownTimeoutSeconds = ParseShutdownTimeoutSeconds(args, defaultSeconds: 5);
var shutdownTimeout = TimeSpan.FromSeconds(shutdownTimeoutSeconds);

var builder = Host.CreateApplicationBuilder(args);
builder.Services.AddHostedService(_ => new Worker(args));

// Tell the .NET host (and the Windows SCM via WindowsServiceLifetime) how long to wait
// before forcibly killing the process. We add DataFinalizationBufferSeconds so that AOF
// commit / checkpoint can complete after connection draining finishes.
builder.Services.Configure<HostOptions>(opts =>
{
opts.ShutdownTimeout = shutdownTimeout + TimeSpan.FromSeconds(DataFinalizationBufferSeconds);
});

builder.Services.AddHostedService(_ => new Worker(args, shutdownTimeout));

builder.Services.AddWindowsService(options =>
{
Expand All @@ -20,4 +39,22 @@ static void Main(string[] args)
var host = builder.Build();
host.Run();
}

/// <summary>
/// Scans <paramref name="args"/> for the shutdown-timeout option and returns the first valid value found.
/// Both the two-token form (<c>--shutdown-timeout &lt;value&gt;</c>) and the single-token form
/// (<c>--shutdown-timeout=&lt;value&gt;</c>) are recognised, so the host shutdown budget is derived from
/// the same value regardless of which spelling the caller used.
/// This lightweight pre-parse avoids a full CommandLineParser pass before the host is built.
/// </summary>
/// <param name="args">Raw command line arguments; a null array is treated as empty.</param>
/// <param name="defaultSeconds">Value returned when the option is absent or no occurrence carries a positive integer.</param>
/// <returns>The first positive integer supplied for the option, otherwise <paramref name="defaultSeconds"/>.</returns>
private static int ParseShutdownTimeoutSeconds(string[] args, int defaultSeconds)
{
    const string LongName = "--shutdown-timeout";
    const string SingleDashName = "-shutdown-timeout";

    if (args is null)
    {
        return defaultSeconds;
    }

    for (var i = 0; i < args.Length; i++)
    {
        var arg = args[i];
        if (arg is null)
        {
            continue;
        }

        // An empty candidate never parses, so it doubles as "no value found for this argument".
        var candidate = string.Empty;
        if (arg is LongName or SingleDashName)
        {
            // Two-token form: the value is the next argument, if there is one.
            if (i + 1 < args.Length && args[i + 1] is not null)
            {
                candidate = args[i + 1];
            }
        }
        else if (arg.StartsWith(LongName + "=", StringComparison.Ordinal))
        {
            candidate = arg.Substring(LongName.Length + 1);
        }
        else if (arg.StartsWith(SingleDashName + "=", StringComparison.Ordinal))
        {
            candidate = arg.Substring(SingleDashName.Length + 1);
        }

        // Invariant culture: command line values are machine-readable, not user-locale text.
        if (int.TryParse(candidate, System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture, out var value) && value > 0)
        {
            return value;
        }
    }

    return defaultSeconds;
}
}
34 changes: 31 additions & 3 deletions hosting/Windows/Garnet.worker/Worker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@ public class Worker : BackgroundService
{
private bool _isDisposed = false;
private readonly string[] args;
private readonly TimeSpan _shutdownTimeout;

private GarnetServer server;

public Worker(string[] args)
/// <summary>
/// Stores the command line arguments and the connection-drain timeout for later use by the worker.
/// </summary>
/// <param name="args">Command line arguments forwarded to <see cref="GarnetServer"/>.</param>
/// <param name="shutdownTimeout">
/// How long to wait for active connections to drain during graceful shutdown.
/// Must be less than the host <see cref="Microsoft.Extensions.Hosting.HostOptions.ShutdownTimeout"/>
/// so that data finalization (AOF commit / checkpoint) can also complete within the host budget.
/// </param>
public Worker(string[] args, TimeSpan shutdownTimeout)
    => (this.args, _shutdownTimeout) = (args, shutdownTimeout);

protected override async Task ExecuteAsync(CancellationToken stoppingToken)
Expand All @@ -43,8 +51,26 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken)
/// <param name="cancellationToken">Indicates that the shutdown process should no longer be graceful.</param>
/// <remarks>
/// Order of operations: the server is shut down gracefully first (connection drain plus data
/// persistence unless the stop is already forced), then the base class is stopped, and only then is
/// the worker disposed. Disposing earlier would tear the server down before it could persist data.
/// </remarks>
public override async Task StopAsync(CancellationToken cancellationToken)
{
    try
    {
        if (server != null)
        {
            // A token that is already cancelled on entry means the host is no longer asking for a
            // graceful stop, so data persistence is skipped and we fall through to disposal.
            var isForceShutdown = cancellationToken.IsCancellationRequested;

            // Perform graceful shutdown with AOF commit and checkpoint when the stop is not forced.
            await server.ShutdownAsync(timeout: _shutdownTimeout, noSave: isForceShutdown, token: cancellationToken).ConfigureAwait(false);
        }
    }
    catch (OperationCanceledException)
    {
        // Force shutdown requested - proceed to dispose
    }
    finally
    {
        try
        {
            // Use a non-cancelled token so base class cleanup still runs when cancellationToken is cancelled.
            await base.StopAsync(CancellationToken.None).ConfigureAwait(false);
        }
        finally
        {
            // Release the server even if base class cleanup throws.
            Dispose();
        }
    }
}

public override void Dispose()
Expand All @@ -55,6 +81,8 @@ public override void Dispose()
}
server?.Dispose();
_isDisposed = true;
base.Dispose();
GC.SuppressFinalize(this);
}
}
}
6 changes: 6 additions & 0 deletions libs/host/Configuration/Options.cs
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,11 @@ internal sealed class Options : ICloneable
[Option("network-connection-limit", Required = false, HelpText = "Maximum number of simultaneously active network connections.")]
public int NetworkConnectionLimit { get; set; }

/// <summary>
/// Number of seconds to wait for active connections to drain during graceful shutdown.
/// Bound to the <c>--shutdown-timeout</c> command line option and range-validated to be at least 1.
/// </summary>
[IntRangeValidation(1, int.MaxValue)]
[Option("shutdown-timeout", Required = false, HelpText = "Timeout in seconds to wait for active connections to drain during graceful shutdown. " +
"The Windows SCM default pre-kill wait is 5 seconds, so values below 5 are not recommended when running as a Windows service.")]
public int ShutdownTimeoutSeconds { get; set; }

[OptionValidation]
[Option("use-azure-storage", Required = false, HelpText = "Use Azure Page Blobs for storage instead of local storage.")]
public bool? UseAzureStorage { get; set; }
Expand Down Expand Up @@ -938,6 +943,7 @@ endpoint is IPEndPoint listenEp && clusterAnnounceEndpoint[0] is IPEndPoint anno
ThreadPoolMinIOCompletionThreads = ThreadPoolMinIOCompletionThreads,
ThreadPoolMaxIOCompletionThreads = ThreadPoolMaxIOCompletionThreads,
NetworkConnectionLimit = NetworkConnectionLimit,
ShutdownTimeoutSeconds = ShutdownTimeoutSeconds,
DeviceFactoryCreator = deviceType == DeviceType.AzureStorage ? azureFactoryCreator()
: new LocalStorageNamedDeviceFactoryCreator(deviceType: deviceType, logger: logger),
CheckpointThrottleFlushDelayMs = CheckpointThrottleFlushDelayMs,
Expand Down
207 changes: 207 additions & 0 deletions libs/host/GarnetServer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Garnet.cluster;
using Garnet.common;
using Garnet.networking;
Expand Down Expand Up @@ -450,6 +451,212 @@ public void Start()
Console.WriteLine("* Ready to accept connections");
}

/// <summary>
/// Performs graceful shutdown of the server.
/// Quiesces existing sessions and pub/sub, stops accepting new connections, waits for active connections
/// to complete, and then persists data (AOF commit or checkpoint) unless <paramref name="noSave"/> is set.
/// </summary>
/// <param name="timeout">Timeout for waiting on active connections (default: 30 seconds)</param>
/// <param name="noSave">If true, skip data persistence (AOF commit and checkpoint) during shutdown</param>
/// <param name="token">Cancellation token; cancelling it ends connection draining early but does not skip data persistence</param>
/// <returns>Task representing the async shutdown operation</returns>
/// <remarks>
/// Every step is best-effort: failures are logged and do not prevent later steps from running.
/// </remarks>
public async Task ShutdownAsync(TimeSpan? timeout = null, bool noSave = false, CancellationToken token = default)
{
    var shutdownTimeout = timeout ?? TimeSpan.FromSeconds(30);

    try
    {
        // Quiesce existing sessions first: they will reject the next incoming message
        // and close themselves, so FinalizeDataAsync runs with no concurrent writers.
        if (servers != null)
        {
            foreach (var server in servers)
            {
                // Isolate each server so one failure cannot skip StopListening or the drain below.
                try
                {
                    server?.BeginQuiesce();
                }
                catch (Exception ex)
                {
                    logger?.LogWarning(ex, "Error quiescing server sessions");
                }
            }
        }

        // Quiesce pub/sub fan-out so no new messages are delivered after this point.
        try
        {
            subscribeBroker?.BeginQuiesce();
        }
        catch (Exception ex)
        {
            logger?.LogWarning(ex, "Error quiescing subscribe broker");
        }

        // Stop accepting new connections.
        StopListening();

        // Wait for existing connections to complete (cancellable)
        try
        {
            await WaitForActiveConnectionsAsync(shutdownTimeout, token).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            logger?.LogWarning("Connection draining was cancelled. Proceeding with data finalization...");
        }
    }
    catch (Exception ex)
    {
        logger?.LogError(ex, "Error during graceful shutdown");
    }
    finally
    {
        if (!noSave)
        {
            // Attempt AOF commit or checkpoint as best-effort,
            // even if connection draining was cancelled or failed.
            // Use a bounded timeout instead of the caller's token to ensure completion.
            using var finalizeCts = new CancellationTokenSource(TimeSpan.FromSeconds(15));
            try
            {
                await FinalizeDataAsync(finalizeCts.Token).ConfigureAwait(false);
            }
            catch (Exception ex)
            {
                logger?.LogError(ex, "Error during data finalization");
            }
        }
        else
        {
            logger?.LogInformation("Shutdown with noSave flag - skipping data persistence.");
        }
    }
}

/// <summary>
/// Asks every configured server to stop listening so that no new connections are accepted.
/// A failure on one server is logged as a warning and does not stop the remaining servers from being processed.
/// </summary>
private void StopListening()
{
    if (servers is null)
    {
        return;
    }

    logger?.LogDebug("Stopping listeners to prevent new connections...");

    foreach (var listener in servers)
    {
        try
        {
            listener?.StopListening();
        }
        catch (Exception stopError)
        {
            // Best effort: keep going so the other listeners are still stopped.
            logger?.LogWarning(stopError, "Error stopping listener");
        }
    }
}

/// <summary>
/// Waits for active connections to complete within the specified timeout.
/// Polls the active connection count with an increasing delay (50 ms, 300 ms, then 1 s) until the count
/// reaches zero, the timeout elapses, or <paramref name="token"/> is cancelled.
/// </summary>
/// <param name="timeout">Maximum time to wait for connections to drain.</param>
/// <param name="token">Caller's cancellation token, linked with the timeout.</param>
/// <returns>A task that completes when all connections are closed or the timeout is reached.</returns>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="token"/> is cancelled while waiting.</exception>
private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationToken token)
{
    if (servers == null) return;

    // Linked Token : between external token and timeout
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(token);
    cts.CancelAfter(timeout);

    var delays = new[] { 50, 300, 1000 };
    var delayIndex = 0;

    try
    {
        while (true)
        {
            // Route both timeout and caller cancellation through the handlers below so that
            // neither can leave the loop silently.
            cts.Token.ThrowIfCancellationRequested();

            int currentDelay;
            try
            {
                var activeConnections = GetActiveConnectionCount();
                if (activeConnections == 0)
                {
                    logger?.LogInformation("All connections have been closed gracefully.");
                    return;
                }

                logger?.LogInformation("Waiting for {ActiveConnections} active connections to complete...", activeConnections);

                currentDelay = delays[delayIndex];
                if (delayIndex < delays.Length - 1) delayIndex++;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // A failed check is not a reason to give up draining: back off and retry
                // until the timeout or the caller's token ends the wait.
                logger?.LogWarning(ex, "Error checking active connections");
                currentDelay = 500;
            }

            await Task.Delay(currentDelay, cts.Token).ConfigureAwait(false);
        }
    }
    catch (OperationCanceledException) when (token.IsCancellationRequested)
    {
        // Caller-requested cancellation is propagated so the caller can tell it apart from a timeout.
        throw;
    }
    catch (OperationCanceledException)
    {
        // timeout reached error logging
        logger?.LogWarning("Timeout reached after {TimeoutSeconds} seconds. Some connections may still be active.",
            timeout.TotalSeconds);
    }
}

/// <summary>
/// Sums the active connection counts reported by the server instances.
/// Only entries that are <see cref="GarnetServerBase"/> contribute; the result is 0 when no servers are configured.
/// </summary>
private long GetActiveConnectionCount()
{
    if (servers == null)
    {
        return 0;
    }

    long total = 0;
    foreach (var candidate in servers)
    {
        if (candidate is not GarnetServerBase countingServer)
        {
            continue;
        }

        total += countingServer.get_conn_active();
    }

    return total;
}

/// <summary>
/// Persists data during shutdown according to configuration: commits the AOF when AOF is enabled,
/// otherwise takes a foreground checkpoint when the storage tier is enabled, otherwise does nothing.
/// Failures are logged and never propagated to the caller.
/// </summary>
/// <param name="token">Cancellation token passed to the commit / checkpoint operation.</param>
private async Task FinalizeDataAsync(CancellationToken token)
{
    var aofEnabled = opts.EnableAOF;

    // Nothing to persist when neither AOF nor tiered storage is configured.
    if (!aofEnabled && !opts.EnableStorageTier)
    {
        return;
    }

    if (aofEnabled)
    {
        // With AOF enabled, the commit is the only persistence step taken here.
        logger?.LogDebug("Committing AOF before shutdown...");
        try
        {
            if (await Store.CommitAOFAsync(token).ConfigureAwait(false))
            {
                logger?.LogDebug("AOF committed successfully.");
            }
            else
            {
                logger?.LogInformation("AOF commit skipped (another commit in progress or replica mode).");
            }
        }
        catch (Exception commitError)
        {
            logger?.LogError(commitError, "Error committing AOF during shutdown");
        }
    }
    else
    {
        logger?.LogDebug("Taking checkpoint for tiered storage...");
        try
        {
            if (await Store.TakeCheckpointAsync(background: false, token: token).ConfigureAwait(false))
            {
                logger?.LogDebug("Checkpoint completed successfully.");
            }
            else
            {
                logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode).");
            }
        }
        catch (Exception checkpointError)
        {
            logger?.LogError(checkpointError, "Error taking checkpoint during shutdown");
        }
    }
}

/// <summary>
/// Dispose store (including log and checkpoint directory)
/// </summary>
Expand Down
3 changes: 3 additions & 0 deletions libs/host/defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,9 @@
/* Maximum number of simultaneously active network connections. */
"NetworkConnectionLimit" : -1,

/* Timeout in seconds to wait for active connections to drain during graceful shutdown. */
"ShutdownTimeoutSeconds" : 5,

/* Use Azure Page Blobs for storage instead of local storage. */
"UseAzureStorage" : false,

Expand Down
Loading
Loading