Skip to content

Commit 71d48e3

Browse files
authored
Merge pull request #115 from hyp3rd/feat/dist-mem-cache
fix(dist-cache): enable gossip by default to fix silent node-rejoin failure
2 parents 28567c9 + 854c30d commit 71d48e3

4 files changed

Lines changed: 69 additions & 10 deletions

File tree

cmd/hypercache-server/main.go

Lines changed: 22 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -48,16 +48,25 @@ import (
4848
// here so operators see one canonical reference and so the magic-number
4949
// linter doesn't flag repeated literals at the env-parse sites.
5050
const (
51-
defaultReplication = 3
52-
defaultCapacity = 100_000
53-
defaultVirtualNodes = 64
54-
defaultIndirectK = 2
55-
suspectMultiplier = 3 // suspect after = N × heartbeat interval
56-
deadMultiplier = 6 // dead after = N × heartbeat interval
57-
defaultHintTTL = 30 * time.Second
58-
defaultHintReplay = 200 * time.Millisecond
59-
defaultHeartbeat = 1 * time.Second
60-
defaultRebalance = 250 * time.Millisecond
51+
defaultReplication = 3
52+
defaultCapacity = 100_000
53+
defaultVirtualNodes = 64
54+
defaultIndirectK = 2
55+
suspectMultiplier = 3 // suspect after = N × heartbeat interval
56+
deadMultiplier = 6 // dead after = N × heartbeat interval
57+
defaultHintTTL = 30 * time.Second
58+
defaultHintReplay = 200 * time.Millisecond
59+
defaultHeartbeat = 1 * time.Second
60+
defaultRebalance = 250 * time.Millisecond
61+
// Membership gossip cadence. Without an enabled gossip loop the
62+
// cluster has no path to re-introduce a previously-removed node:
63+
// peers' heartbeats only probe nodes already in their membership
64+
// list, and the Health endpoint is one-way. A graceful drain →
65+
// restart (the canonical operator workflow) leaves the restarted
66+
// node invisible to the rest of the cluster forever. Default 1s
67+
// matches the heartbeat cadence — gossip+heartbeat together
68+
// disseminate membership changes within a couple of ticks.
69+
defaultGossip = 1 * time.Second
6170
clientAPIReadTimeout = 5 * time.Second
6271
clientAPIWriteTimeout = 5 * time.Second
6372
clientAPIIdleTimeout = 60 * time.Second
@@ -85,6 +94,7 @@ type envConfig struct {
8594
Heartbeat time.Duration
8695
IndirectK int
8796
RebalanceInt time.Duration
97+
GossipInt time.Duration
8898
}
8999

90100
// loadConfig pulls every knob from the environment and applies sane
@@ -122,6 +132,7 @@ func loadConfig() (envConfig, error) {
122132
Heartbeat: envDuration("HYPERCACHE_HEARTBEAT", defaultHeartbeat),
123133
IndirectK: envInt("HYPERCACHE_INDIRECT_PROBE_K", defaultIndirectK),
124134
RebalanceInt: envDuration("HYPERCACHE_REBALANCE_INTERVAL", defaultRebalance),
135+
GossipInt: envDuration("HYPERCACHE_GOSSIP_INTERVAL", defaultGossip),
125136
}
126137

127138
return cfg, nil
@@ -235,6 +246,7 @@ func buildHyperCache(ctx context.Context, cfg envConfig, logger *slog.Logger) (*
235246
backend.WithDistWriteConsistency(backend.ConsistencyQuorum),
236247
backend.WithDistHeartbeat(cfg.Heartbeat, suspectMultiplier*cfg.Heartbeat, deadMultiplier*cfg.Heartbeat),
237248
backend.WithDistIndirectProbes(cfg.IndirectK, cfg.Heartbeat/2),
249+
backend.WithDistGossipInterval(cfg.GossipInt),
238250
backend.WithDistHintTTL(cfg.HintTTL),
239251
backend.WithDistHintReplayInterval(cfg.HintReplay),
240252
backend.WithDistRebalanceInterval(cfg.RebalanceInt),

cmd/hypercache-server/main_test.go

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -174,3 +174,49 @@ func TestDecodeBase64Bytes_NotPadded(t *testing.T) {
174174
t.Errorf("expected 5-char input to be rejected (len%%4 != 0)")
175175
}
176176
}
177+
178+
// TestLoadConfigGossipInterval pins the gossip-interval wiring that
179+
// fixes the "previously-removed node never rejoins the cluster" bug.
180+
//
181+
// Without gossip enabled, dist_memory.startGossipIfEnabled bails on
182+
// gossipInterval <= 0 and no membership state ever propagates beyond
183+
// the initial seed list. After a graceful drain, the peers' heartbeat
184+
// loop removes the drained node; on restart, the rejoining node
185+
// populates ITS own membership from seeds but has no path to
186+
// re-introduce itself to those peers — only gossip carries that
187+
// information. The Health endpoint is one-way ("ok" / "draining"),
188+
// the heartbeat loop only probes peers already in membership, and
189+
// no other propagation mechanism exists.
190+
//
191+
// The regression pins both the unset-env default (must be > 0 so
192+
// gossip starts by default) and the override path. An integration
193+
// test that exercises actual rejoin propagation across an in-process
194+
// cluster is a follow-up; this pins the load-bearing wiring without
195+
// the harness complexity.
196+
func TestLoadConfigGossipInterval(t *testing.T) {
197+
t.Run("default is non-zero so gossip starts", func(t *testing.T) {
198+
t.Setenv("HYPERCACHE_GOSSIP_INTERVAL", "")
199+
200+
cfg, err := loadConfig()
201+
if err != nil {
202+
t.Fatalf("loadConfig: %v", err)
203+
}
204+
205+
if cfg.GossipInt <= 0 {
206+
t.Fatalf("GossipInt = %v; default must be > 0 (gossip disabled = silent rejoin breakage)", cfg.GossipInt)
207+
}
208+
})
209+
210+
t.Run("env override is honored", func(t *testing.T) {
211+
t.Setenv("HYPERCACHE_GOSSIP_INTERVAL", "750ms")
212+
213+
cfg, err := loadConfig()
214+
if err != nil {
215+
t.Fatalf("loadConfig: %v", err)
216+
}
217+
218+
if cfg.GossipInt.String() != "750ms" {
219+
t.Errorf("GossipInt = %v, want 750ms", cfg.GossipInt)
220+
}
221+
})
222+
}

hypercache-server

176 Bytes
Binary file not shown.

pkg/backend/dist_http_server.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -384,6 +384,7 @@ func (s *distHTTPServer) handleSet(fctx fiber.Ctx, dm *DistMemory) error {
384384
Version: req.Version,
385385
Origin: req.Origin,
386386
LastUpdated: time.Now(),
387+
LastAccess: time.Now(),
387388
}
388389

389390
dm.applySet(s.ctx, it, req.Replicate)

0 commit comments

Comments
 (0)