Skip to content

Commit a50c8cc

Browse files
author
antengye
committed
fix(discov): prevent etcd key leak when Watch DELETE races with lease expiry
The publisher's Watch-based self-healing (added after v1.8) re-puts the key with p.lease whenever a DELETE event is observed. When the DELETE is caused by lease expiry and races with the KeepAlive channel close, the publisher may Put with an expired or zero LeaseID, which makes the key permanent in etcd (no TTL), so it leaks forever. This fix adds a TimeToLive check before re-putting: if the lease is already expired or gone (TTL <= 0), or the TimeToLive call itself fails or returns no response, the publisher skips the Put, calls p.revoke, and immediately restarts the keepalive flow via doKeepAlive instead, avoiding orphaned keys.
1 parent 22bdae0 commit a50c8cc

3 files changed

Lines changed: 39 additions & 0 deletions

File tree

core/discov/internal/etcdclient.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,6 @@ type EtcdClient interface {
1919
KeepAlive(ctx context.Context, id clientv3.LeaseID) (<-chan *clientv3.LeaseKeepAliveResponse, error)
2020
Put(ctx context.Context, key, val string, opts ...clientv3.OpOption) (*clientv3.PutResponse, error)
2121
Revoke(ctx context.Context, id clientv3.LeaseID) (*clientv3.LeaseRevokeResponse, error)
22+
TimeToLive(ctx context.Context, id clientv3.LeaseID, opts ...clientv3.LeaseOption) (*clientv3.LeaseTimeToLiveResponse, error)
2223
Watch(ctx context.Context, key string, opts ...clientv3.OpOption) clientv3.WatchChan
2324
}

core/discov/internal/etcdclient_mock.go

Lines changed: 20 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

core/discov/publisher.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,24 @@ func (p *Publisher) keepAliveAsync(cli internal.EtcdClient) error {
151151
if evt.Type == clientv3.EventTypeDelete {
152152
logc.Infof(cli.Ctx(), "etcd publisher watch: %s, event: %v",
153153
evt.Kv.Key, evt.Type)
154+
155+
// Make sure the lease is still valid before re-putting the key.
156+
// Otherwise the Put may happen with an already-expired or zero
157+
// LeaseID (e.g. when the DELETE event is caused by lease expiry
158+
// and races with KeepAlive channel close), which makes the key
159+
// permanent in etcd (no TTL) and leaks forever.
160+
ttlResp, ttlErr := cli.TimeToLive(cli.Ctx(), p.lease)
161+
if ttlErr != nil || ttlResp == nil || ttlResp.TTL <= 0 {
162+
logc.Errorf(cli.Ctx(),
163+
"etcd publisher lease expired, skip re-put and restart keepalive: leaseID=%d, err=%v",
164+
p.lease, ttlErr)
165+
p.revoke(cli)
166+
if err := p.doKeepAlive(); err != nil {
167+
logc.Errorf(cli.Ctx(), "etcd publisher KeepAlive: %v", err)
168+
}
169+
return
170+
}
171+
154172
_, err := cli.Put(cli.Ctx(), p.fullKey, p.value, clientv3.WithLease(p.lease))
155173
if err != nil {
156174
logc.Errorf(cli.Ctx(), "etcd publisher re-put key: %v", err)

0 commit comments

Comments
 (0)