|
3 | 3 | # |
4 | 4 |
|
5 | 5 | import concurrent |
| 6 | +import fcntl |
6 | 7 | import logging |
7 | 8 | import os |
8 | 9 | import sys |
@@ -119,6 +120,7 @@ def read( |
119 | 120 | # (consumes an item from the queue). The watchdog reads this to |
120 | 121 | # detect when the main thread is stuck. |
121 | 122 | self._last_progress_time = time.monotonic() |
| 123 | + self._ensure_stderr_nonblock() |
122 | 124 | self._watchdog_should_run = True |
123 | 125 | watchdog = threading.Thread( |
124 | 126 | target=self._watchdog_loop, |
@@ -162,18 +164,49 @@ def _submit_initial_partition_generators( |
162 | 164 | if status_message: |
163 | 165 | yield status_message |
164 | 166 |
|
| 167 | + _stderr_nonblock_set = False |
| 168 | + |
| 169 | + @classmethod |
| 170 | + def _ensure_stderr_nonblock(cls) -> None: |
| 171 | + """Set stderr fd 2 to non-blocking mode (once). |
| 172 | +
|
| 173 | + In Airbyte Cloud the platform reads the source container's stdout and |
| 174 | + stderr pipes. If the platform pauses reading (e.g. destination |
| 175 | + backpressure), both pipe buffers fill up. A blocking ``os.write(2, |
| 176 | + ...)`` would then stall whichever thread called it — including the |
| 177 | + main thread, which causes the CDK queue to fill and deadlock all |
| 178 | + workers. |
| 179 | +
|
| 180 | + Setting ``O_NONBLOCK`` on fd 2 makes ``os.write(2, ...)`` return |
| 181 | + immediately with ``BlockingIOError`` (EAGAIN) instead of blocking. |
| 182 | + The ``_diag`` method already catches all exceptions, so the message |
| 183 | + is simply dropped when the pipe is full. |
| 184 | + """ |
| 185 | + if cls._stderr_nonblock_set: |
| 186 | + return |
| 187 | + try: |
| 188 | + flags = fcntl.fcntl(2, fcntl.F_GETFL) |
| 189 | + fcntl.fcntl(2, fcntl.F_SETFL, flags | os.O_NONBLOCK) |
| 190 | + cls._stderr_nonblock_set = True |
| 191 | + except Exception: |
| 192 | + # Best-effort; some environments may not support fcntl on fd 2. |
| 193 | + pass |
| 194 | + |
165 | 195 | @staticmethod |
166 | 196 | def _diag(msg: str) -> None: |
167 | 197 | """Write diagnostic message directly to stderr fd 2. |
168 | 198 |
|
169 | 199 | Bypasses all Python buffering (sys.stderr, logging, PrintBuffer) |
170 | 200 | so the message is visible even when stdout/stderr pipes are blocked. |
| 201 | + The fd is set to non-blocking mode by ``_ensure_stderr_nonblock`` |
| 202 | + so this call never stalls the calling thread. |
171 | 203 | """ |
172 | 204 | try: |
173 | 205 | os.write(2, f"DIAG: {msg}\n".encode()) |
174 | 206 | except Exception: |
175 | 207 | # Intentionally ignored: diagnostics are best-effort and must |
176 | | - # never interfere with program execution. |
| 208 | + # never interfere with program execution. In non-blocking mode |
| 209 | + # this catches BlockingIOError (EAGAIN) when the pipe is full. |
177 | 210 | pass |
178 | 211 |
|
179 | 212 | def _consume_from_queue( |
|
0 commit comments