fix(keychain): make Linux write path and dedup tests relock-aware

Benehiko · claude · Benehiko · commit e03dc7a6458d · 2026-06-24T14:23:30.000+02:00
The real-keyring tests added by this PR fail intermittently in CI with "Cannot create an item in a locked collection" and a duplicate backlog that never collapses. Both are the gnome-keyring relock race: the store dials a fresh D-Bus connection per operation, and a prior op's closing connection relocks the collection asynchronously — after the unlock but before the call against the collection runs. IsLocked cannot guard against it because the state changes between the check and the call.

React to the authoritative signal instead. Add withRelockRetry, which retries an operation with bounded exponential backoff when it fails with the org.freedesktop.Secret.Error.IsLocked D-Bus error, re-unlocking on each attempt and aborting immediately if Unlock itself fails (e.g. a dismissed prompt). Wrap the lock-gated mutations with it:

- Save: the create-when-absent CreateItem, the in-place SetItemSecret, and the best-effort duplicate-collapse DeleteItem loop. The collapse delete stays best-effort but is now relock-aware: a swallowed locked error would otherwise leave the duplicates this feature exists to drain (issue #446).
- Delete: DeleteItem.

Reads (Get/Filter GetSecret) are equally race-prone but are left to the dedicated relock-retry change (#560); this commit covers the write path the new tests exercise.

Also make the tests' own direct-daemon helpers relock-aware, since they talk to the daemon outside the store: seedRealDuplicates' CreateItem and purgeRealItems' DeleteItem now go through withRelockRetry.

Tests: fake-backed TestKeychainSaveRetriesWhenCreateRelocks, TestKeychainSaveRetriesWhenSetSecretRelocks, TestKeychainSaveCollapseRetriesWhenDeleteRelocks and TestKeychainSaveStopsRetryingAfterMaxRelocks.

Verified green against real gnome-keyring on the fedora-43 and ubuntu-24 CI harnesses; lint clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/store/keychain/keychain_linux.go b/store/keychain/keychain_linux.go
@@ -23,6 +23,7 @@ import (
 	"fmt"
 	"maps"
 	"slices"
+	"time"
 
 	dbus "github.com/godbus/dbus/v5"
 
@@ -148,6 +149,81 @@ func isCollectionUnlocked(collectionPath dbus.ObjectPath, service secretService)
 	return errCollectionLocked
 }
 
+// secretServiceIsLockedError is the D-Bus error name the secret service returns
+// when a call (e.g. CreateItem) targets a collection or item that is locked.
+//
+// https://specifications.freedesktop.org/secret-service-spec/latest/errors.html
+const secretServiceIsLockedError = "org.freedesktop.Secret.Error.IsLocked"
+
+// isLockedDBusError reports whether err, or an error it wraps, is a dbus.Error
+// whose Name is secretServiceIsLockedError. It matches the structured D-Bus
+// error name rather than the human-readable message so it is stable across
+// backends and locales. NOTE(review): a *dbus.Error would not match this target.
+func isLockedDBusError(err error) bool {
+	var dbusErr dbus.Error
+	return errors.As(err, &dbusErr) && dbusErr.Name == secretServiceIsLockedError
+}
+
+// Relock retry tuning. An operation that hits a relocked collection is retried
+// with exponential backoff: the relock is a brief race that settles on its own,
+// and spacing the attempts out avoids hammering the secret service (or, on a
+// password-protected keyring, re-issuing Unlock fast enough to spam the user
+// with authentication prompts).
+//
+// The delay starts at relockRetryBaseDelay and doubles after every sleep, capped
+// at relockRetryMaxDelay. With maxRelockRetries = 5 the slept delays are
+// 20,40,80,160,320ms (620ms in total for a call that never recovers); the cap
+// only bites from a sixth retry on, whose delay would otherwise be 640ms.
+const (
+	maxRelockRetries     = 5
+	relockRetryBaseDelay = 20 * time.Millisecond
+	relockRetryMaxDelay  = 500 * time.Millisecond
+)
+
+// sleepFn is the function withRelockRetry calls to wait between attempts. Tests
+// replace it with a no-op to run the retry loop without real delays. It is a
+// package-level var with no synchronisation, so such tests must not be parallel.
+var sleepFn = time.Sleep
+
+// withRelockRetry runs op and returns its error, except that an IsLocked D-Bus
+// error makes it back off, re-unlock collectionPath and rerun op (bounded).
+//
+// The store dials a fresh D-Bus connection for every operation and closes it on
+// return. gnome-keyring scopes an unlock to the session that performed it, so
+// when a previous operation's connection closes the daemon relocks the
+// collection — and that relock can land asynchronously in the middle of a later
+// operation, after we have already observed the collection as unlocked but
+// before the call against the collection runs. The result is an intermittent
+// "Cannot create an item in a locked collection" error even though we unlocked
+// moments earlier. IsLocked cannot guard against this because the state changes
+// between the check and the call, so we react to the authoritative signal — the
+// operation's own locked error — by unlocking again and retrying.
+//
+// In the common case this is the passwordless auto-unlock path (e.g. the
+// PAM-unlocked login keyring), where Unlock returns the null prompt and asks
+// the user for nothing. On a password-protected keyring a retry could surface
+// an authentication prompt; the bounded retries and backoff keep that to a
+// handful of spaced-out prompts per call, and an Unlock error (e.g. a dismissed
+// prompt) ends this call at once. That bound is per call: a caller looping over
+// items restarts it for each item. The sleep takes no context (uncancellable).
+func withRelockRetry(service secretService, collectionPath dbus.ObjectPath, op func() error) error {
+	err := op() // first attempt: no sleep and no extra Unlock
+	delay := relockRetryBaseDelay
+	for attempt := 0; attempt < maxRelockRetries && isLockedDBusError(err); attempt++ {
+		sleepFn(delay)
+		delay = min(delay*2, relockRetryMaxDelay)
+		if unlockErr := service.Unlock([]dbus.ObjectPath{collectionPath}); unlockErr != nil {
+			// Surface why the retry stopped while preserving errors.Is on the
+			// underlying Unlock error (e.g. a dismissed prompt). The original
+			// locked error is intentionally dropped: the failed unlock is the
+			// actionable cause once we have decided to stop retrying.
+			return fmt.Errorf("unlock after relock: %w", unlockErr)
+		}
+		err = op()
+	}
+	return err // nil, a non-locked error, or the locked error once retries run out
+}
+
 type keychainStore[T store.Secret] struct {
 	serviceGroup string
 	serviceName  string
@@ -198,7 +274,9 @@ func (k *keychainStore[T]) Delete(_ context.Context, id store.ID) error {
 		return nil
 	}
 
-	return service.DeleteItem(items[0])
+	return withRelockRetry(service, objectPath, func() error {
+		return service.DeleteItem(items[0])
+	})
 }
 
 func (k *keychainStore[T]) Get(ctx context.Context, id store.ID) (store.Secret, error) {
@@ -406,8 +484,10 @@ func (k *keychainStore[T]) Save(_ context.Context, id store.ID, secret store.Sec
 	// Nothing stored yet: create a fresh item.
 	if len(items) == 0 {
 		properties := kc.NewSecretProperties(label, attributes)
-		_, err = service.CreateItem(objectPath, properties, sessSecret, kc.ReplaceBehaviorReplace)
-		return err
+		return withRelockRetry(service, objectPath, func() error {
+			_, createErr := service.CreateItem(objectPath, properties, sessSecret, kc.ReplaceBehaviorReplace)
+			return createErr
+		})
 	}
 
 	// Update the first match in place. Its object path is preserved, so the
@@ -416,13 +496,20 @@ func (k *keychainStore[T]) Save(_ context.Context, id store.ID, secret store.Sec
 	// the attributes and label and collapsing any pre-existing duplicates are
 	// best-effort (the secret is already stored) and must not flip the result.
 	primary := items[0]
-	if err := service.SetItemSecret(primary, sessSecret); err != nil {
+	if err := withRelockRetry(service, objectPath, func() error {
+		return service.SetItemSecret(primary, sessSecret)
+	}); err != nil {
 		return err
 	}
 	_ = service.SetItemAttributes(primary, attributes)
 	_ = service.SetItemLabel(primary, label)
 	for _, dup := range items[1:] {
-		_ = service.DeleteItem(dup)
+		// Best-effort, but still relock-aware: a collection that relocks
+		// mid-collapse would otherwise leave the duplicates the whole feature
+		// exists to drain (see withRelockRetry and issue #446).
+		_ = withRelockRetry(service, objectPath, func() error {
+			return service.DeleteItem(dup)
+		})
 	}
 
 	return nil
diff --git a/store/keychain/keychain_linux_test.go b/store/keychain/keychain_linux_test.go
@@ -48,13 +48,34 @@ type fakeService struct {
 	// concurrency-safe: the tests that read them drive a single sequential
 	// operation through the fake.
 	createCalls    int
+	setSecretCalls int
+	deleteCalls    int
 	setSecretItems []dbus.ObjectPath
 	deletedItems   []dbus.ObjectPath
 
+	// {createItem,setSecret,deleteItem}LockedErrs is how many leading calls of
+	// each kind fail with the secret service "collection is locked" D-Bus error
+	// before one succeeds, simulating a collection that relocks underneath the
+	// store (see withRelockRetry). The error is wrapped exactly as the real
+	// service wraps it, so the tests exercise the errors.As-through-wrap path
+	// isLockedDBusError depends on. unlockCalls counts the re-unlocks the retry
+	// issues; unlockErr, when set, makes Unlock fail (e.g. a dismissed prompt).
+	createItemLockedErrs int
+	setSecretLockedErrs  int
+	deleteItemLockedErrs int
+	unlockCalls          int
+	unlockErr            error
+
 	opened atomic.Int64
 	closed atomic.Int64
 }
 
+// lockedErr wraps a locked-collection dbus.Error with %w, mirroring the real
+// SecretService (see secretservice.go); op only names the call in the message.
+func lockedErr(op string) error {
+	return fmt.Errorf("failed to %s: %w", op, dbus.Error{Name: secretServiceIsLockedError})
+}
+
 func (f *fakeService) Collections() ([]dbus.ObjectPath, error) {
 	return []dbus.ObjectPath{loginKeychainObjectPath}, nil
 }
@@ -65,24 +86,39 @@ func (f *fakeService) OpenSession(kc.AuthenticationMode) (*kc.Session, error) {
 	// lets the Save path run end-to-end against the fake.
 	return &kc.Session{Mode: kc.AuthenticationInsecurePlain}, nil
 }
-func (f *fakeService) CloseSession(*kc.Session)       {}
-func (f *fakeService) Unlock([]dbus.ObjectPath) error { return nil }
+func (f *fakeService) CloseSession(*kc.Session) {}
+func (f *fakeService) Unlock([]dbus.ObjectPath) error {
+	f.unlockCalls++ // counted even when unlockErr makes the call fail
+	return f.unlockErr
+}
+
 func (f *fakeService) SearchCollection(dbus.ObjectPath, kc.Attributes) ([]dbus.ObjectPath, error) {
 	return f.items, nil
 }
 
 func (f *fakeService) CreateItem(dbus.ObjectPath, map[string]dbus.Variant, kc.Secret, kc.ReplaceBehavior) (dbus.ObjectPath, error) {
 	f.createCalls++
+	if f.createCalls <= f.createItemLockedErrs {
+		return "", lockedErr("create item") // simulated relock: the first createItemLockedErrs calls fail
+	}
 	return "/created", nil
 }
 
 func (f *fakeService) DeleteItem(item dbus.ObjectPath) error {
+	f.deleteCalls++ // counts every attempt across all items, not per item
+	if f.deleteCalls <= f.deleteItemLockedErrs {
+		return lockedErr("delete item")
+	}
 	f.deletedItems = append(f.deletedItems, item)
 	return nil
 }
 func (f *fakeService) GetAttributes(dbus.ObjectPath) (kc.Attributes, error)  { return nil, nil }
 func (f *fakeService) GetSecret(dbus.ObjectPath, kc.Session) ([]byte, error) { return nil, nil }
 func (f *fakeService) SetItemSecret(item dbus.ObjectPath, _ kc.Secret) error {
+	f.setSecretCalls++ // counts every attempt, including the ones rejected as locked
+	if f.setSecretCalls <= f.setSecretLockedErrs {
+		return lockedErr("set item secret")
+	}
 	f.setSecretItems = append(f.setSecretItems, item)
 	return nil
 }
@@ -105,6 +141,16 @@ func withFakeService(t *testing.T, fake *fakeService) {
 	}
 }
 
+// stubRelockSleep swaps the package-level sleepFn for a no-op so retry tests run
+// without real delays, and restores the previous value via t.Cleanup. sleepFn is
+// unsynchronised, so tests using this helper must not run in parallel.
+func stubRelockSleep(t *testing.T) {
+	t.Helper()
+	orig := sleepFn
+	t.Cleanup(func() { sleepFn = orig })
+	sleepFn = func(time.Duration) {} // the requested delay is discarded, not recorded
+}
+
 // TestKeychainGetNotFound exercises the full Get path against the fake — open,
 // resolve collection, search — and asserts an empty search maps to
 // ErrCredentialNotFound, all without a live keyring.
@@ -180,6 +226,79 @@ func TestKeychainSaveCollapsesDuplicatesInPlace(t *testing.T) {
 		"the remaining duplicates must be collapsed, leaving only the first match")
 }
 
+// TestKeychainSaveRetriesWhenCreateRelocks covers the create path: gnome-keyring
+// can relock the collection between the unlock and the CreateItem, so a fresh
+// Save must react to the locked error by unlocking and retrying rather than
+// failing. The fake rejects the first two CreateItem calls as locked.
+func TestKeychainSaveRetriesWhenCreateRelocks(t *testing.T) {
+	stubRelockSleep(t)
+	fake := &fakeService{} // no items -> create path
+	fake.createItemLockedErrs = 2
+	withFakeService(t, fake)
+
+	ks := setupKeychain(t, nil)
+	require.NoError(t, ks.Save(t.Context(), store.MustParseID("com.test.test/test/bob"),
+		&mocks.MockCredential{Username: "bob", Password: "bob-password"}))
+
+	assert.Equal(t, 3, fake.createCalls, "two locked failures then one success")
+	assert.Equal(t, 2, fake.unlockCalls, "exactly one Unlock per relock retry")
+}
+
+// TestKeychainSaveRetriesWhenSetSecretRelocks covers the in-place update path:
+// the SetItemSecret that rewrites the surviving item must survive a relock.
+func TestKeychainSaveRetriesWhenSetSecretRelocks(t *testing.T) {
+	stubRelockSleep(t)
+	fake := &fakeService{items: []dbus.ObjectPath{"/item/a"}, setSecretLockedErrs: 2} // first two writes fail as locked
+	withFakeService(t, fake)
+
+	ks := setupKeychain(t, nil)
+	require.NoError(t, ks.Save(t.Context(), store.MustParseID("com.test.test/test/bob"),
+		&mocks.MockCredential{Username: "bob", Password: "bob-password"}))
+
+	assert.Equal(t, []dbus.ObjectPath{"/item/a"}, fake.setSecretItems,
+		"the secret must be written in place once the relock clears")
+	assert.Equal(t, 3, fake.setSecretCalls, "two locked failures then one success")
+	assert.Equal(t, 2, fake.unlockCalls, "exactly one Unlock per relock retry")
+}
+
+// TestKeychainSaveCollapseRetriesWhenDeleteRelocks is the unit-level counterpart
+// of the real-keyring backlog test: collapsing a duplicate must drain it even if
+// the collection relocks mid-delete. The collapse delete is best-effort, but a
+// silently swallowed locked error would leave the duplicate behind — the exact
+// #446 symptom — so it is still relock-aware.
+func TestKeychainSaveCollapseRetriesWhenDeleteRelocks(t *testing.T) {
+	stubRelockSleep(t)
+	fake := &fakeService{items: []dbus.ObjectPath{"/item/a", "/item/b"}}
+	fake.deleteItemLockedErrs = 2 // counted per fake, so both failures hit the single duplicate /item/b
+	withFakeService(t, fake)
+
+	ks := setupKeychain(t, nil)
+	require.NoError(t, ks.Save(t.Context(), store.MustParseID("com.test.test/test/bob"),
+		&mocks.MockCredential{Username: "bob", Password: "bob-password"}))
+
+	assert.Equal(t, []dbus.ObjectPath{"/item/b"}, fake.deletedItems,
+		"the duplicate must be collapsed once the relock clears")
+	assert.Equal(t, 3, fake.deleteCalls, "two locked failures then one success")
+	assert.Equal(t, 2, fake.unlockCalls, "exactly one Unlock per relock retry")
+}
+
+// TestKeychainSaveStopsRetryingAfterMaxRelocks asserts the retry is bounded: a
+// persistently locked collection surfaces the locked error to the caller after
+// maxRelockRetries retries (one Unlock each) instead of looping forever.
+func TestKeychainSaveStopsRetryingAfterMaxRelocks(t *testing.T) {
+	stubRelockSleep(t)
+	fake := &fakeService{createItemLockedErrs: 1 << 30} // no items -> create path; never recovers
+	withFakeService(t, fake)
+
+	ks := setupKeychain(t, nil)
+	err := ks.Save(t.Context(), store.MustParseID("com.test.test/test/bob"),
+		&mocks.MockCredential{Username: "bob", Password: "bob-password"})
+	require.Error(t, err)
+	assert.True(t, isLockedDBusError(err), "the persistent locked error must reach the caller")
+	assert.Equal(t, maxRelockRetries+1, fake.createCalls, "initial attempt plus the bounded retries")
+	assert.Equal(t, maxRelockRetries, fake.unlockCalls, "exactly one Unlock per bounded retry")
+}
+
 // The real-keychain dedup tests use their own service group/name so their items
 // are namespace-isolated from TestKeychain (which shares com.test.test/test).
 // GetAllMetadata/Filter search by {service:group, service:name}, so a leaked
@@ -293,7 +412,14 @@ func seedRealDuplicates(t *testing.T, serviceGroup, serviceName string, id store
 		safelySetMetadata(serviceGroup, serviceName, attrs)
 		safelySetID(id, attrs)
 
-		_, err = svc.CreateItem(collection, kc.NewSecretProperties(label, attrs), sessSecret, kc.ReplaceBehaviorDoNotReplace)
+		// Seed directly against the daemon, but stay relock-aware: a prior op's
+		// closing connection can relock the collection between the unlock above
+		// and this create (see withRelockRetry), which would otherwise fail the
+		// seed with "Cannot create an item in a locked collection".
+		err = withRelockRetry(svc, collection, func() error {
+			_, createErr := svc.CreateItem(collection, kc.NewSecretProperties(label, attrs), sessSecret, kc.ReplaceBehaviorDoNotReplace)
+			return createErr
+		})
 		require.NoError(t, err)
 	}
 }
@@ -318,7 +444,9 @@ func purgeRealItems(t *testing.T, serviceGroup, serviceName string, id store.ID)
 	items, err := svc.SearchCollection(collection, attrs)
 	require.NoError(t, err)
 	for _, item := range items {
-		require.NoError(t, svc.DeleteItem(item))
+		require.NoError(t, withRelockRetry(svc, collection, func() error {
+			return svc.DeleteItem(item)
+		}))
 	}
 }