@@ -251,17 +251,75 @@ def create_breeder(self, breeder_config, name):
251251 # Create breeder state table for shutdown signaling in the archive DB
252252 self .archive_repo .create_breeder_state_table (breeder_id )
253253
254+ # Wait for the breeder_state table to be fully committed and accessible
255+ # This prevents YugabyteDB serialization conflicts when Optuna starts its DDL operations
256+ from f .controller .database import get_db_connection
257+ max_wait = 10 # seconds
258+ check_interval = 0.5 # seconds
259+ import time
260+ table_ready = False
261+
262+ for attempt in range (int (max_wait / check_interval )):
263+ try :
264+ # Try to query the table - if it succeeds, the transaction is fully committed
265+ db_config = self .archive_repo .base_config .copy ()
266+ db_config ['database' ] = breeder_id
267+ with get_db_connection (db_config ) as conn :
268+ with conn .cursor () as cursor :
269+ cursor .execute ("SELECT COUNT(*) FROM breeder_state;" )
270+ count = cursor .fetchone ()[0 ]
271+ table_ready = True
272+ logger .info (f"Breeder state table is ready for { breeder_uuid } " )
273+ break
274+ except Exception as e :
275+ if attempt < (max_wait / check_interval ) - 1 :
276+ logger .debug (f"Waiting for breeder_state table to be ready... (attempt { attempt + 1 } )" )
277+ time .sleep (check_interval )
278+ else :
279+ logger .error (f"Breeder state table still not ready after { max_wait } s: { e } " )
280+ raise
281+
282+ if not table_ready :
283+ raise Exception (f"Breeder state table did not become ready within { max_wait } s" )
284+
254285 # Initialize Optuna schema to prevent race conditions during worker startup
255286 # Multiple workers starting simultaneously would otherwise conflict trying to create tables
256- try :
257- db_url = self .archive_repo .get_connection_url (breeder_id )
258- storage = optuna .storages .RDBStorage (url = db_url )
259- logger .info (f"Initialized Optuna schema for breeder { breeder_uuid } " )
260- except Exception as e :
261- logger .error (f"Failed to initialize Optuna schema for breeder { breeder_uuid } : { e } " )
262- # Clean up database if schema initialization fails
263- self .archive_repo .drop_database (breeder_id )
264- raise
287+ # Retry logic for YugabyteDB serialization failures and timeouts
288+ max_retries = 5
289+ storage = None
290+ last_error = None
291+
292+ for attempt in range (max_retries ):
293+ try :
294+ db_url = self .archive_repo .get_connection_url (breeder_id )
295+ storage = optuna .storages .RDBStorage (url = db_url )
296+ logger .info (f"Initialized Optuna schema for breeder { breeder_uuid } " )
297+ break
298+ except Exception as e :
299+ last_error = e
300+ error_str = str (e )
301+ # Check for YugabyteDB-specific errors that should be retried
302+ is_retryable = (
303+ 'SerializationFailure' in error_str or
304+ '40001' in error_str or
305+ 'Transaction aborted' in error_str or
306+ 'Timed out waiting' in error_str or
307+ 'InternalError_' in error_str
308+ )
309+
310+ if attempt < max_retries - 1 and is_retryable :
311+ wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s, 8s
312+ logger .warning (f"Optuna schema initialization attempt { attempt + 1 } /{ max_retries } failed for breeder { breeder_uuid } : { e } " )
313+ logger .info (f"Retrying in { wait_time } seconds..." )
314+ time .sleep (wait_time )
315+ else :
316+ logger .error (f"Failed to initialize Optuna schema for breeder { breeder_uuid } after { max_retries } attempts: { e } " )
317+ # Clean up database if schema initialization fails
318+ try :
319+ self .archive_repo .drop_database (breeder_id )
320+ except Exception as drop_error :
321+ logger .error (f"Failed to cleanup database after Optuna init failure: { drop_error } " )
322+ raise
265323
266324 self .metadata_repo .create_table ()
267325 self .metadata_repo .insert_breeder_meta (
0 commit comments