|
8 | 8 | from datetime import timedelta |
9 | 9 | from typing import ( |
10 | 10 | Any, |
| 11 | + Dict, |
11 | 12 | Generator, |
12 | 13 | Generic, |
13 | 14 | Iterable, |
|
# `timedelta.max` is the largest duration `timedelta` can represent.
# NOTE(review): presumably used as a "never times out" sentinel — confirm at
# the use sites.
_NO_TIMEOUT = timedelta.max
# Job statuses grouped under the "API side running" label.
# NOTE(review): TIMED_OUT is included alongside RUNNING — confirm the intent
# against the code that consumes this set.
_API_SIDE_RUNNING_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT}
40 | 41 |
|
# Ranking used to collapse the `FailureType`s of several non-breaking
# exceptions into a single value; entries listed earlier win.
#   1. `config_error`    - the user must act before any retry can succeed.
#   2. `transient_error` - retryable.
#   3. `system_error`    - fallback for genuine internal failures.
_FAILURE_TYPE_PRECEDENCE: Tuple[FailureType, ...] = (
    FailureType.config_error,
    FailureType.transient_error,
    FailureType.system_error,
)
| 51 | + |
# User-facing message for each dominant `FailureType`. The text is fixed per
# type so that the `message` field remains a stable log aggregation key;
# per-run details (counts, raw exception reprs) go into `internal_message`.
_ASYNC_JOB_FAILURE_MESSAGE_BY_TYPE: Mapping[FailureType, str] = {
    FailureType.config_error: "Async jobs failed because the source API rejected the request as unauthorized or forbidden.",
    FailureType.transient_error: "Async jobs failed after exhausting retries for source API rate limit or transient errors.",
    FailureType.system_error: "Async jobs failed after exhausting retry attempts.",
}
| 64 | + |
41 | 65 |
|
42 | 66 | class AsyncPartition: |
43 | 67 | """ |
@@ -481,16 +505,56 @@ def create_and_get_completed_partitions(self) -> Iterable[AsyncPartition]: |
481 | 505 | if self._non_breaking_exceptions: |
482 | 506 | # We emitted traced message but we didn't break on non_breaking_exception. We still need to raise an exception so that the |
483 | 507 | # call of `create_and_get_completed_partitions` knows that there was an issue with some partitions and the sync is incomplete. |
| 508 | + failure_type = self._aggregate_failure_type(self._non_breaking_exceptions) |
| 509 | + failure_counts = self._count_failure_types(self._non_breaking_exceptions) |
| 510 | + summary = ", ".join( |
| 511 | + f"{ft.value}={failure_counts[ft]}" |
| 512 | + for ft in _FAILURE_TYPE_PRECEDENCE |
| 513 | + if ft in failure_counts |
| 514 | + ) |
484 | 515 | raise AirbyteTracedException( |
485 | | - message="One or more async jobs failed after exhausting all retry attempts.", |
| 516 | + message=_ASYNC_JOB_FAILURE_MESSAGE_BY_TYPE[failure_type], |
486 | 517 | internal_message="\n".join( |
487 | | - [ |
| 518 | + [f"Underlying failure breakdown: {summary}."] |
| 519 | + + [ |
488 | 520 | filter_secrets(exception.__repr__()) |
489 | 521 | for exception in self._non_breaking_exceptions |
490 | 522 | ] |
491 | 523 | ), |
492 | | - failure_type=FailureType.system_error, |
| 524 | + failure_type=failure_type, |
| 525 | + ) |
| 526 | + |
@staticmethod
def _aggregate_failure_type(exceptions: List[Exception]) -> FailureType:
    """Pick the dominant `FailureType` among `exceptions`.

    Each exception contributes its own `failure_type` when it is an
    `AirbyteTracedException` with a non-`None` `failure_type`; every other
    exception contributes `system_error`. The first entry of
    `_FAILURE_TYPE_PRECEDENCE` that was contributed is returned, and
    `system_error` is returned when none of them was (for example when
    `exceptions` is empty).
    """
    seen: Set[FailureType] = set()
    for error in exceptions:
        if isinstance(error, AirbyteTracedException) and error.failure_type is not None:
            seen.add(error.failure_type)
        else:
            # Untyped or non-traced errors are classified as internal failures.
            seen.add(FailureType.system_error)

    # Walk the precedence list and stop at the first type that occurred.
    return next(
        (candidate for candidate in _FAILURE_TYPE_PRECEDENCE if candidate in seen),
        FailureType.system_error,
    )
| 545 | + |
@staticmethod
def _count_failure_types(exceptions: List[Exception]) -> Dict[FailureType, int]:
    """Tally how many of `exceptions` fall under each `FailureType`.

    An `AirbyteTracedException` with a non-`None` `failure_type` is counted
    under that type; every other exception is counted as `system_error`.
    Only types that occurred at least once appear in the result, in order
    of first occurrence.
    """
    tally: Dict[FailureType, int] = {}
    for error in exceptions:
        is_typed = isinstance(error, AirbyteTracedException) and error.failure_type is not None
        key = error.failure_type if is_typed else FailureType.system_error
        # Create the bucket on first sight, then bump it.
        tally.setdefault(key, 0)
        tally[key] += 1
    return tally
494 | 558 |
|
495 | 559 | def _handle_non_breaking_error(self, exception: Exception) -> None: |
496 | 560 | LOGGER.error(f"Failed to start the Job: {exception}, traceback: {traceback.format_exc()}") |
|
0 commit comments