fix: add watchdog thread to detect stdout backpressure deadlocks

devin-ai-integration[bot] · devin-ai-integration[bot] · commit 33ff81f3d5bc · 2026-03-10T22:01:12.000Z
When the platform stops reading from the source container's stdout/stderr
pipes (e.g. destination backpressure), all threads block on I/O and no
in-process timeout can fire. The watchdog monitors main-thread progress
and calls os._exit(1) after 10 minutes of no activity.

Also fixes exception handlers in PartitionReader and PartitionEnqueuer
that used blocking queue.put() without timeout, which would deadlock
when the queue is full.

Co-Authored-By: unknown <>
diff --git a/airbyte_cdk/sources/concurrent_source/concurrent_source.py b/airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -4,6 +4,8 @@
 
 import concurrent
 import logging
+import os
+import sys
 import threading
 import time
 from queue import Empty, Queue
@@ -38,6 +40,11 @@ class ConcurrentSource:
     """
 
     DEFAULT_TIMEOUT_SECONDS = 900
+    # If the main thread makes no progress for this long, the watchdog
+    # terminates the process.  This breaks deadlocks caused by stdout/stderr
+    # pipe blockage where no in-process timeout can fire because I/O itself
+    # is blocked at the OS level.
+    _WATCHDOG_TIMEOUT_SECONDS = 600.0  # 10 minutes
 
     @staticmethod
     def create(
@@ -108,29 +115,44 @@ def read(
         streams: List[AbstractStream],
     ) -> Iterator[AirbyteMessage]:
         self._logger.info("Starting syncing")
-        concurrent_stream_processor = ConcurrentReadProcessor(
-            streams,
-            PartitionEnqueuer(self._queue, self._threadpool),
-            self._threadpool,
-            self._logger,
-            self._slice_logger,
-            self._message_repository,
-            PartitionReader(
-                self._queue,
-                PartitionLogger(self._slice_logger, self._logger, self._message_repository),
-            ),
+        # Shared timestamp updated every time the main thread makes progress
+        # (consumes an item from the queue).  The watchdog reads this to
+        # detect when the main thread is stuck.
+        self._last_progress_time = time.monotonic()
+        self._watchdog_should_run = True
+        watchdog = threading.Thread(
+            target=self._watchdog_loop,
+            daemon=True,
+            name="progress-watchdog",
         )
+        watchdog.start()
+
+        try:
+            concurrent_stream_processor = ConcurrentReadProcessor(
+                streams,
+                PartitionEnqueuer(self._queue, self._threadpool),
+                self._threadpool,
+                self._logger,
+                self._slice_logger,
+                self._message_repository,
+                PartitionReader(
+                    self._queue,
+                    PartitionLogger(self._slice_logger, self._logger, self._message_repository),
+                ),
+            )
 
-        # Enqueue initial partition generation tasks
-        yield from self._submit_initial_partition_generators(concurrent_stream_processor)
+            # Enqueue initial partition generation tasks
+            yield from self._submit_initial_partition_generators(concurrent_stream_processor)
 
-        # Read from the queue until all partitions were generated and read
-        yield from self._consume_from_queue(
-            self._queue,
-            concurrent_stream_processor,
-        )
-        self._threadpool.check_for_errors_and_shutdown()
-        self._logger.info("Finished syncing")
+            # Read from the queue until all partitions were generated and read
+            yield from self._consume_from_queue(
+                self._queue,
+                concurrent_stream_processor,
+            )
+            self._threadpool.check_for_errors_and_shutdown()
+            self._logger.info("Finished syncing")
+        finally:
+            self._watchdog_should_run = False
 
     def _submit_initial_partition_generators(
         self, concurrent_stream_processor: ConcurrentReadProcessor
@@ -179,6 +201,7 @@ def _consume_from_queue(
                     type(airbyte_message_or_record_or_exception).__name__,
                 )
                 items_since_last_heartbeat = 0
+            self._last_progress_time = now
             last_item_time = now
 
             yield from self._handle_item(
@@ -192,6 +215,42 @@ def _consume_from_queue(
                 # all partitions were generated and processed. we're done here
                 break
 
+    def _watchdog_loop(self) -> None:
+        """Daemon thread that terminates the process when the main thread stalls.
+
+        In Airbyte Cloud the source container's stdout and stderr are read by
+        the platform (replication-orchestrator).  If the platform stops reading
+        (e.g. destination backpressure), both pipes fill up and *all* threads
+        block on I/O — including the main thread's ``yield`` and every worker
+        thread's ``logger.*()`` call.  No in-process timeout can fire because
+        the timeout's own log/write call also blocks.
+
+        This watchdog performs no *blocking* I/O.  It checks a shared monotonic
+        timestamp that the main thread updates whenever it consumes a queue
+        item.  If no progress is observed for ``_WATCHDOG_TIMEOUT_SECONDS``, it
+        attempts one non-blocking diagnostic write to stderr and then calls
+        ``os._exit(1)``, which terminates the process regardless of I/O state.
+        """
+        while self._watchdog_should_run:
+            time.sleep(30)  # check every 30 seconds
+            if not self._watchdog_should_run:
+                return
+            elapsed = time.monotonic() - self._last_progress_time
+            if elapsed >= self._WATCHDOG_TIMEOUT_SECONDS:
+                msg = (
+                    f"WATCHDOG: Main thread made no progress for "
+                    f"{elapsed:.0f}s (threshold={self._WATCHDOG_TIMEOUT_SECONDS:.0f}s). "
+                    f"Terminating process to prevent indefinite hang.\n"
+                )
+                try:
+                    # os.write() on a full pipe would block the watchdog too,
+                    # so switch the fd to non-blocking mode: a full pipe then
+                    # raises BlockingIOError instead of hanging.
+                    os.set_blocking(sys.stderr.fileno(), False)
+                    os.write(sys.stderr.fileno(), msg.encode())
+                finally:
+                    os._exit(1)
+
     def _handle_item(
         self,
         queue_item: QueueItem,
diff --git a/airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py b/airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py
@@ -96,8 +96,12 @@ def generate_partitions(self, stream: AbstractStream) -> None:
                 partition_count,
                 str(e)[:200],
             )
-            self._queue.put(StreamThreadException(e, stream.name))
-            self._queue.put(PartitionGenerationCompletedSentinel(stream))
+            self._put_with_timeout(StreamThreadException(e, stream.name), stream.name, logger)
+            self._put_with_timeout(
+                PartitionGenerationCompletedSentinel(stream),
+                stream.name,
+                logger,
+            )
 
     def _put_with_timeout(
         self,
diff --git a/airbyte_cdk/sources/streams/concurrent/partition_reader.py b/airbyte_cdk/sources/streams/concurrent/partition_reader.py
@@ -129,8 +129,12 @@ def process_partition(self, partition: Partition, cursor: Cursor) -> None:
                 str(e)[:200],
                 slice_info,
             )
-            self._queue.put(StreamThreadException(e, stream_name))
-            self._queue.put(PartitionCompleteSentinel(partition, not self._IS_SUCCESSFUL))
+            self._put_with_timeout(StreamThreadException(e, stream_name), stream_name, logger)
+            self._put_with_timeout(
+                PartitionCompleteSentinel(partition, not self._IS_SUCCESSFUL),
+                stream_name,
+                logger,
+            )
 
     def _put_with_timeout(
         self,