From 1c4bcb1108b053409691e11253f5baa423472a09 Mon Sep 17 00:00:00 2001 From: Shubham Jain Date: Mon, 15 Jun 2026 14:21:40 +0530 Subject: [PATCH] fix(go-services): mysql healthcheck false-healthy race against the init server MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mysql-users/products/orders healthchecks used `mysqladmin ping -h localhost`, which hits MySQL 8.0's socket-only TEMPORARY init server (run to apply the seed db.sql before the real server starts) and passes on exit-code-only — it returns 0 even on "Access denied". So docker marked the container healthy ~3s before the real :3306 TCP listener was up. A service depending on it via `condition: service_healthy` then connected over TCP too early and failed during the temp-server -> real-server restart gap; docker's subsequent probes against the now-stopped temp server drove the failing streak to the retry limit -> "container unhealthy" -> "dependency failed to start". This is the intermittent kafka-ecommerce CI flake (it passes whenever timing happens to favour it). Fix: probe the REAL TCP listener with root creds (`mysqladmin ping -h 127.0.0.1 -P 3306 -uroot -proot`) — only the fully-started real server answers there, never the temp server — and add `start_period: 60s` so slow cold init under CI contention doesn't burn the retry budget before :3306 is up. Applied to all three mysql services. Signed-off-by: Shubham Jain --- go-services/docker-compose.yml | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/go-services/docker-compose.yml b/go-services/docker-compose.yml index 69901be..00d1725 100644 --- a/go-services/docker-compose.yml +++ b/go-services/docker-compose.yml @@ -14,10 +14,19 @@ services: volumes: - ./user_service/db.sql:/docker-entrypoint-initdb.d/init.sql healthcheck: - test: [ "CMD", "mysqladmin", "ping", "-h", "localhost" ] + # TCP+creds probe: only the fully-started real server on :3306 answers. 
+ # The old `ping -h localhost` hit MySQL 8.0's socket-only TEMPORARY init + # server, and `mysqladmin ping` exits 0 for any reachable server (even on + # "Access denied"), so the container went healthy ~3s before the real :3306 + # listener was up; a dependent connecting over TCP then failed in the + # temp-server -> real-server restart gap (intermittent "container unhealthy" + # / "dependency failed to start" in CI). Exit status depends on TCP, not creds. + test: [ "CMD", "mysqladmin", "ping", "-h", "127.0.0.1", "-P", "3306", "-uroot", "-proot" ] interval: 5s timeout: 5s retries: 20 + # Cold init (slow under CI contention) shouldn't burn the retry budget. + start_period: 60s mysql-products: image: mysql:8.0 @@ -34,10 +43,19 @@ services: volumes: - ./product_service/db.sql:/docker-entrypoint-initdb.d/init.sql healthcheck: - test: [ "CMD", "mysqladmin", "ping", "-h", "localhost" ] + # TCP+creds probe: only the fully-started real server on :3306 answers. + # The old `ping -h localhost` hit MySQL 8.0's socket-only TEMPORARY init + # server, and `mysqladmin ping` exits 0 for any reachable server (even on + # "Access denied"), so the container went healthy ~3s before the real :3306 + # listener was up; a dependent connecting over TCP then failed in the + # temp-server -> real-server restart gap (intermittent "container unhealthy" + # / "dependency failed to start" in CI). Exit status depends on TCP, not creds. + test: [ "CMD", "mysqladmin", "ping", "-h", "127.0.0.1", "-P", "3306", "-uroot", "-proot" ] interval: 5s timeout: 5s retries: 20 + # Cold init (slow under CI contention) shouldn't burn the retry budget. + start_period: 60s mysql-orders: image: mysql:8.0 @@ -54,10 +72,19 @@ services: volumes: - ./order_service/db.sql:/docker-entrypoint-initdb.d/init.sql healthcheck: - test: [ "CMD", "mysqladmin", "ping", "-h", "localhost" ] + # TCP+creds probe: only the fully-started real server on :3306 answers. 
+ # The old `ping -h localhost` hit MySQL 8.0's socket-only TEMPORARY init + # server, and `mysqladmin ping` exits 0 for any reachable server (even on + # "Access denied"), so the container went healthy ~3s before the real :3306 + # listener was up; a dependent connecting over TCP then failed in the + # temp-server -> real-server restart gap (intermittent "container unhealthy" + # / "dependency failed to start" in CI). Exit status depends on TCP, not creds. + test: [ "CMD", "mysqladmin", "ping", "-h", "127.0.0.1", "-P", "3306", "-uroot", "-proot" ] interval: 5s timeout: 5s retries: 20 + # Cold init (slow under CI contention) shouldn't burn the retry budget. + start_period: 60s zookeeper: image: confluentinc/cp-zookeeper:7.5.0