Skip to content

Commit f423697

Browse files
committed
feat(resilience): add tenacity retry decorators and utilities
Add retry utilities for handling transient failures:

- Add retry_database_operation decorator for database operations
- Add retry_model_inference decorator for model inference
- Add retry_query_execution decorator for query execution
- Add RetryContext class for manual retry control
- Add with_retry wrapper function for runtime configuration
- Add max_retries to ResilienceSettings config
- Add comprehensive test suite (21 tests)

Part of Issue #7: Error Handling & Resilience
1 parent 4940545 commit f423697

3 files changed

Lines changed: 706 additions & 0 deletions

File tree

app/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,13 @@ class ResilienceSettings(BaseSettings):
245245
default=True,
246246
description="Return fallback responses when dependencies fail repeatedly",
247247
)
248+
max_retries: int = Field(
249+
default=3,
250+
ge=1,
251+
le=10,
252+
alias="RESILIENCE_MAX_RETRIES",
253+
description="Maximum number of retry attempts for transient failures",
254+
)
248255

249256

250257
class MonitoringSettings(BaseSettings):

app/retry.py

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
"""
2+
Retry utilities using tenacity for resilient operations.
3+
4+
This module provides retry decorators and utilities for handling
5+
transient failures in database connections, model inference, and
6+
external service calls.
7+
"""
8+
9+
from collections.abc import Callable
10+
from functools import wraps
11+
from typing import Any, TypeVar
12+
13+
from tenacity import (
14+
retry,
15+
retry_if_exception_type,
16+
stop_after_attempt,
17+
wait_exponential,
18+
)
19+
20+
from app.config import get_settings
21+
from app.exceptions import (
22+
DatabaseConnectionException,
23+
ModelInferenceException,
24+
QueryExecutionException,
25+
QueryTimeoutException,
26+
)
27+
from app.logging_config import get_logger
28+
29+
logger = get_logger(__name__)
30+
31+
T = TypeVar("T")
32+
33+
34+
# =============================================================================
35+
# Retry Decorators
36+
# =============================================================================
37+
38+
39+
def retry_database_operation(
    max_attempts: int | None = None,
    min_wait: float = 1.0,
    max_wait: float = 10.0,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a tenacity retry decorator for database operations.

    The decorated callable is retried, with exponential backoff, when it
    raises DatabaseConnectionException or QueryTimeoutException. Once the
    attempt limit is reached the last exception is re-raised unchanged
    (``reraise=True``).

    Args:
        max_attempts: Attempt limit handed to ``stop_after_attempt``. A falsy
            value (None or 0) selects ``settings.resilience.max_retries``.
        min_wait: Lower bound of the backoff wait, in seconds
        max_wait: Upper bound of the backoff wait, in seconds

    Returns:
        A decorator that adds the retry behaviour to a callable
    """
    # Settings are read every time the factory is called, even when the
    # caller supplies an explicit limit.
    configured_limit = get_settings().resilience.max_retries
    attempt_limit = max_attempts if max_attempts else configured_limit

    retryable = retry_if_exception_type(
        (DatabaseConnectionException, QueryTimeoutException)
    )
    backoff = wait_exponential(multiplier=1, min=min_wait, max=max_wait)

    return retry(
        stop=stop_after_attempt(attempt_limit),
        wait=backoff,
        retry=retryable,
        before_sleep=_log_retry_attempt,
        reraise=True,
    )
69+
70+
71+
def retry_model_inference(
    max_attempts: int | None = None,
    min_wait: float = 2.0,
    max_wait: float = 30.0,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a tenacity retry decorator for model inference calls.

    The decorated callable is retried when it raises
    ModelInferenceException. The backoff uses a multiplier of 2 and, by
    default, a 2-30 second window, i.e. longer waits than the database
    decorator. Once the attempt limit is reached the last exception is
    re-raised unchanged (``reraise=True``).

    Args:
        max_attempts: Attempt limit handed to ``stop_after_attempt``. A falsy
            value (None or 0) selects ``settings.resilience.max_retries``.
        min_wait: Lower bound of the backoff wait, in seconds
        max_wait: Upper bound of the backoff wait, in seconds

    Returns:
        A decorator that adds the retry behaviour to a callable
    """
    # Settings are read every time the factory is called, even when the
    # caller supplies an explicit limit.
    configured_limit = get_settings().resilience.max_retries
    attempt_limit = max_attempts if max_attempts else configured_limit

    retryable = retry_if_exception_type(ModelInferenceException)
    backoff = wait_exponential(multiplier=2, min=min_wait, max=max_wait)

    return retry(
        stop=stop_after_attempt(attempt_limit),
        wait=backoff,
        retry=retryable,
        before_sleep=_log_retry_attempt,
        reraise=True,
    )
100+
101+
102+
def retry_query_execution(
    max_attempts: int | None = None,
    min_wait: float = 0.5,
    max_wait: float = 5.0,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a tenacity retry decorator for query execution.

    The decorated callable is retried, with exponential backoff, when it
    raises QueryExecutionException or QueryTimeoutException. Once the
    attempt limit is reached the last exception is re-raised unchanged
    (``reraise=True``).

    Args:
        max_attempts: Attempt limit handed to ``stop_after_attempt``. A falsy
            value (None or 0) selects ``settings.resilience.max_retries``.
        min_wait: Lower bound of the backoff wait, in seconds
        max_wait: Upper bound of the backoff wait, in seconds

    Returns:
        A decorator that adds the retry behaviour to a callable
    """
    # Settings are read every time the factory is called, even when the
    # caller supplies an explicit limit.
    configured_limit = get_settings().resilience.max_retries
    attempt_limit = max_attempts if max_attempts else configured_limit

    retryable = retry_if_exception_type(
        (QueryExecutionException, QueryTimeoutException)
    )
    backoff = wait_exponential(multiplier=1, min=min_wait, max=max_wait)

    return retry(
        stop=stop_after_attempt(attempt_limit),
        wait=backoff,
        retry=retryable,
        before_sleep=_log_retry_attempt,
        reraise=True,
    )
130+
131+
132+
def _log_retry_attempt(retry_state: Any) -> None:
    """
    Emit a "retry_attempt" warning describing the attempt that just failed.

    Used as the ``before_sleep`` callback of the retry decorators in this
    module.

    Args:
        retry_state: State object supplied by tenacity; this function reads
            its ``outcome``, ``attempt_number`` and ``next_action``.
    """
    failure = retry_state.outcome.exception()
    upcoming = retry_state.next_action

    # next_action may be unset, in which case no wait time is reported.
    pending_wait = upcoming.sleep if upcoming else None

    logger.warning(
        "retry_attempt",
        attempt=retry_state.attempt_number,
        exception_type=type(failure).__name__,
        exception_message=str(failure),
        next_wait=pending_wait,
    )
144+
145+
146+
# =============================================================================
147+
# Retry Context Managers
148+
# =============================================================================
149+
150+
151+
class RetryContext:
152+
"""
153+
Context manager for manual retry control.
154+
155+
Useful when you need more control over the retry logic than
156+
decorators provide.
157+
158+
Usage:
159+
async with RetryContext(max_attempts=3) as ctx:
160+
for attempt in ctx:
161+
try:
162+
result = await some_operation()
163+
break
164+
except TransientError as e:
165+
ctx.record_failure(e)
166+
"""
167+
168+
def __init__(
169+
self,
170+
max_attempts: int = 3,
171+
base_delay: float = 1.0,
172+
max_delay: float = 10.0,
173+
exceptions: tuple[type[Exception], ...] = (Exception,),
174+
) -> None:
175+
"""
176+
Initialize retry context.
177+
178+
Args:
179+
max_attempts: Maximum number of attempts
180+
base_delay: Base delay for exponential backoff
181+
max_delay: Maximum delay cap
182+
exceptions: Tuple of exception types to retry on
183+
"""
184+
self.max_attempts = max_attempts
185+
self.base_delay = base_delay
186+
self.max_delay = max_delay
187+
self.exceptions = exceptions
188+
self._attempt = 0
189+
self._last_exception: Exception | None = None
190+
191+
def __iter__(self) -> "RetryContext":
192+
"""Make context iterable for retry loop."""
193+
self._attempt = 0
194+
return self
195+
196+
def __next__(self) -> int:
197+
"""Get next attempt number."""
198+
# If we've exhausted attempts, stop iteration
199+
if self._attempt >= self.max_attempts:
200+
raise StopIteration
201+
202+
self._attempt += 1
203+
return self._attempt
204+
205+
def record_failure(self, exception: Exception) -> None:
206+
"""Record a failed attempt."""
207+
self._last_exception = exception
208+
209+
if not isinstance(exception, self.exceptions):
210+
raise exception
211+
212+
if self._attempt >= self.max_attempts:
213+
raise exception
214+
215+
def get_delay(self) -> float:
216+
"""Calculate delay for current attempt."""
217+
delay = self.base_delay * (2 ** (self._attempt - 1))
218+
return float(min(delay, self.max_delay))
219+
220+
@property
221+
def attempt(self) -> int:
222+
"""Current attempt number."""
223+
return self._attempt
224+
225+
@property
226+
def should_retry(self) -> bool:
227+
"""Whether more retries are available."""
228+
return self._attempt < self.max_attempts
229+
230+
231+
# =============================================================================
232+
# Utility Functions
233+
# =============================================================================
234+
235+
236+
def with_retry(
    func: Callable[..., T],
    max_attempts: int = 3,
    exceptions: tuple[type[Exception], ...] = (Exception,),
    base_delay: float = 1.0,
    max_delay: float = 10.0,
) -> Callable[..., T]:
    """
    Wrap a function with retry logic.

    Alternative to decorators for runtime configuration. Coroutine
    functions get an async wrapper that waits with ``asyncio.sleep``;
    everything else gets a sync wrapper that waits with ``time.sleep``.
    The wait before retry N+1 is ``base_delay * 2 ** (N - 1)`` capped at
    ``max_delay``.

    Args:
        func: Function to wrap
        max_attempts: Maximum number of calls to ``func`` per invocation
        exceptions: Exception types to retry on; any other exception
            propagates immediately
        base_delay: Base delay for backoff
        max_delay: Maximum delay cap

    Returns:
        Wrapped function with retry logic

    Raises:
        RuntimeError: From the wrapper, when ``max_attempts`` is below 1
            (``func`` is never called in that case).
    """
    import asyncio
    import inspect
    import time

    def _announce_retry(attempt: int, error: Exception) -> float:
        """Log the failed attempt and return the capped backoff delay."""
        delay = min(base_delay * (2 ** (attempt - 1)), max_delay)
        logger.warning(
            "retry_attempt",
            attempt=attempt,
            max_attempts=max_attempts,
            delay=delay,
            error=str(error),
        )
        return delay

    # inspect.iscoroutinefunction replaces asyncio.iscoroutinefunction, which
    # is deprecated as of Python 3.14.
    if inspect.iscoroutinefunction(func):

        @wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> T:
            for attempt in range(1, max_attempts + 1):
                try:
                    return await func(*args, **kwargs)  # type: ignore[misc,no-any-return]
                except exceptions as e:
                    if attempt >= max_attempts:
                        # Final attempt: propagate with the original traceback.
                        raise
                    await asyncio.sleep(_announce_retry(attempt, e))

            # Only reachable when max_attempts < 1 (the loop never ran).
            raise RuntimeError("Unexpected retry loop exit")

        return async_wrapper  # type: ignore[return-value]

    @wraps(func)
    def sync_wrapper(*args: Any, **kwargs: Any) -> T:
        for attempt in range(1, max_attempts + 1):
            try:
                return func(*args, **kwargs)
            except exceptions as e:
                if attempt >= max_attempts:
                    # Final attempt: propagate with the original traceback.
                    raise
                time.sleep(_announce_retry(attempt, e))

        # Only reachable when max_attempts < 1 (the loop never ran).
        raise RuntimeError("Unexpected retry loop exit")

    return sync_wrapper

0 commit comments

Comments
 (0)