Skip to content

Commit 896e6dc

Browse files
authored
Merge pull request #38 from godon-dev/guard_archive_db_ddl
Fix: archiveDB transaction breeder creation
2 parents 9d1d048 + 738c6db commit 896e6dc

1 file changed

Lines changed: 67 additions & 9 deletions

File tree

controller/breeder_service.py

Lines changed: 67 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -251,17 +251,75 @@ def create_breeder(self, breeder_config, name):
251251
# Create breeder state table for shutdown signaling in the archive DB
252252
self.archive_repo.create_breeder_state_table(breeder_id)
253253

254+
# Wait for the breeder_state table to be fully committed and accessible
255+
# This prevents YugabyteDB serialization conflicts when Optuna starts its DDL operations
256+
from f.controller.database import get_db_connection
257+
max_wait = 10 # seconds
258+
check_interval = 0.5 # seconds
259+
import time
260+
table_ready = False
261+
262+
for attempt in range(int(max_wait / check_interval)):
263+
try:
264+
# Try to query the table - if it succeeds, the transaction is fully committed
265+
db_config = self.archive_repo.base_config.copy()
266+
db_config['database'] = breeder_id
267+
with get_db_connection(db_config) as conn:
268+
with conn.cursor() as cursor:
269+
cursor.execute("SELECT COUNT(*) FROM breeder_state;")
270+
count = cursor.fetchone()[0]
271+
table_ready = True
272+
logger.info(f"Breeder state table is ready for {breeder_uuid}")
273+
break
274+
except Exception as e:
275+
if attempt < (max_wait / check_interval) - 1:
276+
logger.debug(f"Waiting for breeder_state table to be ready... (attempt {attempt + 1})")
277+
time.sleep(check_interval)
278+
else:
279+
logger.error(f"Breeder state table still not ready after {max_wait}s: {e}")
280+
raise
281+
282+
if not table_ready:
283+
raise Exception(f"Breeder state table did not become ready within {max_wait}s")
284+
254285
# Initialize Optuna schema to prevent race conditions during worker startup
255286
# Multiple workers starting simultaneously would otherwise conflict trying to create tables
256-
try:
257-
db_url = self.archive_repo.get_connection_url(breeder_id)
258-
storage = optuna.storages.RDBStorage(url=db_url)
259-
logger.info(f"Initialized Optuna schema for breeder {breeder_uuid}")
260-
except Exception as e:
261-
logger.error(f"Failed to initialize Optuna schema for breeder {breeder_uuid}: {e}")
262-
# Clean up database if schema initialization fails
263-
self.archive_repo.drop_database(breeder_id)
264-
raise
287+
# Retry logic for YugabyteDB serialization failures and timeouts
288+
max_retries = 5
289+
storage = None
290+
last_error = None
291+
292+
for attempt in range(max_retries):
293+
try:
294+
db_url = self.archive_repo.get_connection_url(breeder_id)
295+
storage = optuna.storages.RDBStorage(url=db_url)
296+
logger.info(f"Initialized Optuna schema for breeder {breeder_uuid}")
297+
break
298+
except Exception as e:
299+
last_error = e
300+
error_str = str(e)
301+
# Check for YugabyteDB-specific errors that should be retried
302+
is_retryable = (
303+
'SerializationFailure' in error_str or
304+
'40001' in error_str or
305+
'Transaction aborted' in error_str or
306+
'Timed out waiting' in error_str or
307+
'InternalError_' in error_str
308+
)
309+
310+
if attempt < max_retries - 1 and is_retryable:
311+
wait_time = 2 ** attempt # Exponential backoff: 1s, 2s, 4s, 8s
312+
logger.warning(f"Optuna schema initialization attempt {attempt + 1}/{max_retries} failed for breeder {breeder_uuid}: {e}")
313+
logger.info(f"Retrying in {wait_time} seconds...")
314+
time.sleep(wait_time)
315+
else:
316+
logger.error(f"Failed to initialize Optuna schema for breeder {breeder_uuid} after {max_retries} attempts: {e}")
317+
# Clean up database if schema initialization fails
318+
try:
319+
self.archive_repo.drop_database(breeder_id)
320+
except Exception as drop_error:
321+
logger.error(f"Failed to cleanup database after Optuna init failure: {drop_error}")
322+
raise
265323

266324
self.metadata_repo.create_table()
267325
self.metadata_repo.insert_breeder_meta(

0 commit comments

Comments
 (0)