Skip to content

Commit c12b479

Browse files
gurghetclaude
andcommitted
fix: prevent race condition in secret creation
- Add ownership check before updating secrets - Remove delete-before-create pattern that caused race condition - Return 'conflict' if secret exists but not owned by this CR - Move status update to after successful secret creation - Add ownership verification in reconcile timer The race condition occurred when: 1. delete_secret_if_exists removed the secret 2. Timer reconcile ran during the gap 3. Saw secret missing, triggered recreation 4. Multiple keys created in GitHub = spam loop Now the operator: - Only updates secrets it owns - Refuses to overwrite foreign secrets (logs warning) - Cleans up GitHub key if secret creation conflicts Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent fb7abda commit c12b479

1 file changed

Lines changed: 61 additions & 18 deletions

File tree

operator.py

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -180,33 +180,59 @@ class KubernetesSecretManager:
180180
def __init__(self, logger):
181181
self.logger = logger
182182

183+
def _is_owned_by(self, secret, owner_uid):
184+
"""Check if secret is owned by the given owner UID."""
185+
if not secret.metadata.owner_references:
186+
return False
187+
return any(ref.uid == owner_uid for ref in secret.metadata.owner_references)
188+
183189
def create_or_update_secret(self, name, namespace, private_key, public_key, owner_reference):
184-
"""Create or update Kubernetes secret with SSH keys."""
190+
"""Create or update Kubernetes secret with SSH keys.
191+
192+
Returns:
193+
- 'created': new secret was created
194+
- 'updated': existing secret was updated (owned by us)
195+
- 'conflict': secret exists but owned by someone else (no action taken)
196+
"""
185197
# Add github.com to known_hosts
186198
known_hosts = "github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg="
187-
199+
188200
secret_data = {
189201
'identity': private_key,
190202
'identity.pub': public_key,
191203
'known_hosts': known_hosts
192204
}
193-
205+
194206
encoded_data = {k: base64.b64encode(v.encode()).decode() for k, v in secret_data.items()}
195-
207+
196208
try:
197-
# Try to update existing secret
209+
# Check if secret exists
198210
secret = core_v1_api.read_namespaced_secret(name=name, namespace=namespace)
211+
212+
# Check ownership
213+
if not self._is_owned_by(secret, owner_reference.uid):
214+
self.logger.warning(
215+
f"Secret {name} exists but is not owned by this GitHubDeployKey. "
216+
f"Refusing to overwrite. Delete the existing secret manually if you want "
217+
f"the operator to manage it."
218+
)
219+
return 'conflict'
220+
221+
# Owned by us - safe to update
199222
secret.data = encoded_data
200223
core_v1_api.replace_namespaced_secret(
201224
name=name,
202225
namespace=namespace,
203226
body=secret
204227
)
205228
self.logger.info(f"Updated existing secret {name}")
229+
return 'updated'
230+
206231
except kubernetes.client.exceptions.ApiException as e:
207232
if e.status != 404:
208233
raise
209-
234+
235+
# Secret doesn't exist - create it
210236
core_v1_api.create_namespaced_secret(
211237
namespace=namespace,
212238
body=kubernetes.client.V1Secret(
@@ -219,6 +245,7 @@ def create_or_update_secret(self, name, namespace, private_key, public_key, owne
219245
)
220246
)
221247
self.logger.info(f"Created new secret {name}")
248+
return 'created'
222249

223250
def delete_secret_if_exists(self, name, namespace):
224251
"""Delete a Kubernetes secret if it exists."""
@@ -251,28 +278,36 @@ def create_deploy_key(spec, logger, patch, **kwargs):
251278

252279
if not github_manager.verify_key_exists(repo, key.id):
253280
raise kopf.PermanentError("Failed to verify deploy key")
254-
255-
# Update status
256-
patch['status'] = {'keyId': key.id}
257-
258-
# Create secret
281+
282+
# Create secret BEFORE updating status (so status only reflects successful state)
259283
secret_name = f"{kwargs['meta']['name']}-private-key"
260284
owner_reference = kubernetes.client.V1OwnerReference(
261285
api_version=kwargs['body']['apiVersion'],
262286
kind=kwargs['body']['kind'],
263287
name=kwargs['body']['metadata']['name'],
264288
uid=kwargs['body']['metadata']['uid']
265289
)
266-
267-
secret_manager.delete_secret_if_exists(secret_name, kwargs['meta']['namespace'])
268-
secret_manager.create_or_update_secret(
290+
291+
result = secret_manager.create_or_update_secret(
269292
secret_name,
270293
kwargs['meta']['namespace'],
271294
private_key,
272295
public_key,
273296
owner_reference
274297
)
275-
298+
299+
if result == 'conflict':
300+
# Secret exists but not owned by us - clean up the GitHub key we just created
301+
logger.warning(f"Secret {secret_name} owned by another resource. Cleaning up GitHub key.")
302+
github_manager.delete_key_by_id(repo, key.id)
303+
raise kopf.PermanentError(
304+
f"Secret {secret_name} already exists and is not owned by this GitHubDeployKey. "
305+
f"Delete the existing secret manually to allow the operator to manage it."
306+
)
307+
308+
# Only update status after everything succeeded
309+
patch['status'] = {'keyId': key.id}
310+
276311
logger.info(f"Successfully created deploy key {key.id} and secret {secret_name}")
277312

278313
except Exception as e:
@@ -361,14 +396,22 @@ def reconcile_deploy_key(spec, status, logger, patch, **kwargs):
361396
else:
362397
logger.error(f"Error checking deploy key {key_id}: {e}")
363398

364-
# Verify secret exists
399+
# Verify secret exists and is owned by us
365400
secret_name = f"{kwargs['meta']['name']}-private-key"
401+
secret_manager = KubernetesSecretManager(logger)
366402
try:
367-
core_v1_api.read_namespaced_secret(
403+
secret = core_v1_api.read_namespaced_secret(
368404
name=secret_name,
369405
namespace=kwargs['meta']['namespace']
370406
)
371-
logger.info(f"Secret {secret_name} exists")
407+
# Check ownership
408+
if not secret_manager._is_owned_by(secret, kwargs['body']['metadata']['uid']):
409+
logger.warning(
410+
f"Secret {secret_name} exists but is not owned by this GitHubDeployKey. "
411+
f"Delete the existing secret manually to allow the operator to manage it."
412+
)
413+
else:
414+
logger.info(f"Secret {secret_name} exists and is correctly owned")
372415
except kubernetes.client.exceptions.ApiException as e:
373416
if e.status == 404:
374417
logger.info(f"Secret {secret_name} is missing, recreating deploy key")

0 commit comments

Comments
 (0)