-
Notifications
You must be signed in to change notification settings - Fork 44
Expand file tree
/
Copy pathmemory_monitor.py
More file actions
162 lines (127 loc) · 5.72 KB
/
memory_monitor.py
File metadata and controls
162 lines (127 loc) · 5.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#
# Copyright (c) 2026 Airbyte, Inc., all rights reserved.
#
"""Source-side memory introspection to emit controlled error messages before OOM kills."""
import logging
from pathlib import Path
from typing import Optional
from airbyte_cdk.models import FailureType
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
# All monitor output (debug notes and the memory warning) goes through the "airbyte" logger.
logger = logging.getLogger("airbyte")
# cgroup v2 paths: current usage and configured limit.
# The limit file may contain the literal string "max" instead of a number.
_CGROUP_V2_CURRENT = Path("/sys/fs/cgroup/memory.current")
_CGROUP_V2_MAX = Path("/sys/fs/cgroup/memory.max")
# cgroup v1 paths: current usage and configured limit, used only when the v2 files are absent.
_CGROUP_V1_USAGE = Path("/sys/fs/cgroup/memory/memory.usage_in_bytes")
_CGROUP_V1_LIMIT = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")
# Default thresholds, expressed as a fraction (usage / limit) of the container memory limit.
# Reaching the warning threshold logs a warning; reaching the critical threshold raises an error.
_DEFAULT_WARNING_THRESHOLD = 0.85
_DEFAULT_CRITICAL_THRESHOLD = 0.95
# Check interval (every N messages): cgroup files are only read on every N-th call
# to keep the per-message overhead down.
_DEFAULT_CHECK_INTERVAL = 1000
class MemoryLimitExceeded(AirbyteTracedException):
    """Traced exception signalling that connector memory usage hit the critical threshold.

    Adds no behaviour of its own; it exists so callers can tell this
    condition apart from other ``AirbyteTracedException`` failures.
    """
class MemoryMonitor:
    """Monitors container memory usage via cgroup files and emits warnings before OOM kills.

    Lazily probes cgroup v2 then v1 files on the first call to
    ``check_memory_usage()`` and caches which version exists.
    If neither is found (local dev / CI), all subsequent calls are instant no-ops.

    Args:
        warning_threshold: Usage/limit ratio at or above which a single
            warning is logged.
        critical_threshold: Usage/limit ratio at or above which
            ``MemoryLimitExceeded`` is raised (once).
        check_interval: Number of ``check_memory_usage()`` calls between
            actual reads of the cgroup files. Must be at least 1.

    Raises:
        ValueError: If ``check_interval`` is less than 1.
    """

    def __init__(
        self,
        warning_threshold: float = _DEFAULT_WARNING_THRESHOLD,
        critical_threshold: float = _DEFAULT_CRITICAL_THRESHOLD,
        check_interval: int = _DEFAULT_CHECK_INTERVAL,
    ) -> None:
        # Fail fast on a bad interval: a value of 0 would otherwise surface as a
        # ZeroDivisionError in the middle of a sync, and only where cgroup files exist.
        if check_interval < 1:
            raise ValueError(f"check_interval must be >= 1, got {check_interval}")

        self._warning_threshold = warning_threshold
        self._critical_threshold = critical_threshold
        self._check_interval = check_interval
        self._message_count = 0
        # One-shot latches so each threshold fires at most once per monitor instance.
        self._warning_emitted = False
        self._critical_raised = False
        # None means "no usable cgroup files"; otherwise 1 or 2.
        self._cgroup_version: Optional[int] = None
        self._probed = False

    def _probe_cgroup(self) -> None:
        """Detect which cgroup version (if any) is available.

        Called lazily on the first ``check_memory_usage()`` invocation so
        that ``spec`` and ``discover`` commands never incur filesystem I/O.
        The result is cached; later calls return immediately.
        """
        if self._probed:
            return
        self._probed = True

        try:
            if _CGROUP_V2_CURRENT.exists() and _CGROUP_V2_MAX.exists():
                self._cgroup_version = 2
            elif _CGROUP_V1_USAGE.exists() and _CGROUP_V1_LIMIT.exists():
                self._cgroup_version = 1
        except OSError:
            # Path.exists() can propagate OSError (e.g. PermissionError) on some
            # Python versions. Monitoring is best-effort, so disable it rather
            # than let the probe crash the sync.
            self._cgroup_version = None
            logger.debug("Failed to probe cgroup memory files; memory monitoring disabled.")
            return

        if self._cgroup_version is None:
            logger.debug(
                "No cgroup memory files found. Memory monitoring disabled (likely local dev / CI)."
            )

    def _read_memory(self) -> Optional[tuple[int, int]]:
        """Read current memory usage and limit from cgroup files.

        Best-effort: failures to read memory info never crash a sync.

        Returns:
            A tuple of ``(usage_bytes, limit_bytes)``, or ``None`` if no cgroup
            version was detected, the limit is unlimited (``"max"``) or
            non-positive, or the files could not be read or parsed.
        """
        if self._cgroup_version is None:
            return None

        try:
            if self._cgroup_version == 2:
                usage_path = _CGROUP_V2_CURRENT
                limit_path = _CGROUP_V2_MAX
            else:
                usage_path = _CGROUP_V1_USAGE
                limit_path = _CGROUP_V1_LIMIT

            limit_text = limit_path.read_text().strip()
            # cgroup v2 memory.max can be the literal string "max" (unlimited)
            if limit_text == "max":
                return None

            usage_bytes = int(usage_path.read_text().strip())
            limit_bytes = int(limit_text)
            # A non-positive limit cannot be used as a divisor for the usage ratio.
            if limit_bytes <= 0:
                return None
            return usage_bytes, limit_bytes
        except (OSError, ValueError):
            logger.debug("Failed to read cgroup memory files; skipping memory check.")
            return None

    def check_memory_usage(self) -> None:
        """Check memory usage against thresholds.

        Intended to be called on every message. The monitor internally tracks
        a message counter and only reads cgroup files every ``check_interval``
        messages (default 1000) to minimise I/O overhead.

        At the warning threshold (default 85%), logs a warning message.
        At the critical threshold (default 95%), raises MemoryLimitExceeded to
        trigger a graceful shutdown with an actionable error message.
        Each threshold triggers at most once per sync to avoid log spam.
        This method is a no-op if cgroup files are unavailable.

        Raises:
            MemoryLimitExceeded: The first time usage is observed at or above
                the critical threshold.
        """
        self._probe_cgroup()
        if self._cgroup_version is None:
            return

        self._message_count += 1
        if self._message_count % self._check_interval != 0:
            return

        memory_info = self._read_memory()
        if memory_info is None:
            return

        usage_bytes, limit_bytes = memory_info
        usage_ratio = usage_bytes / limit_bytes
        # Integer arithmetic gives an exact floored percentage; int(ratio * 100)
        # can under-report by one because of float rounding (e.g. 57/100 -> 56).
        usage_percent = usage_bytes * 100 // limit_bytes
        usage_gb = usage_bytes / (1024**3)
        limit_gb = limit_bytes / (1024**3)

        if usage_ratio >= self._critical_threshold and not self._critical_raised:
            self._critical_raised = True
            # round() rather than int(): thresholds such as 0.57 would otherwise
            # be reported as 56% because 0.57 * 100 is slightly below 57.
            critical_percent = round(self._critical_threshold * 100)
            raise MemoryLimitExceeded(
                internal_message=f"Memory usage is {usage_percent}% ({usage_gb:.2f} / {limit_gb:.2f} GB). "
                f"Critical threshold is {critical_percent}%.",
                message=f"Source exceeded memory limit ({usage_percent}% used) and must shut down to avoid an out-of-memory crash.",
                failure_type=FailureType.system_error,
            )

        if usage_ratio >= self._warning_threshold and not self._warning_emitted:
            self._warning_emitted = True
            logger.warning(
                "Source memory usage reached %d%% of container limit (%.2f / %.2f GB).",
                usage_percent,
                usage_gb,
                limit_gb,
            )