
Commit 35cc09e

Added missing aws.ec2.gpu.idle spec and minor fixes to gcp.vertex.workbench.idle rule (#172)
1 parent 72e5f4c commit 35cc09e

7 files changed: 1,294 additions and 360 deletions

File tree

cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py (94 additions, 59 deletions)
@@ -132,7 +132,7 @@
 }
 _DEFAULT_MONTHLY_COST = 600.0

-# GPU utilisation thresholds
+# GPU utilization thresholds
 _GPU_UTIL_THRESHOLD_PCT = 5.0  # below this = idle (when GPU metric available)
 _CPU_UTIL_THRESHOLD_PCT = 10.0  # below this = idle (CPU fallback)

@@ -151,7 +151,7 @@ def find_idle_gpu_instances(
     cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT,
 ) -> List[Finding]:
     """
-    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation.
+    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization.

     GPU instances (raw EC2, outside SageMaker) incur continuous charges while running
     regardless of whether GPUs are being utilised. A p4d.24xlarge costs ~$23K/month
@@ -162,24 +162,28 @@ def find_idle_gpu_instances(
     - Instance state is running
     - Instance type is a known GPU/accelerator family
     - Instance is older than idle_days (avoids flagging newly launched instances)
-    - GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA
-      CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace)
-    - OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback,
+    - GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the
+      nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch)
+    - OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback,
       used when GPU metrics are not available — CPU alone is a weaker signal)

     GPU metric detection:
-    The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent
-    namespace with an InstanceId dimension. Availability is probed via ListMetrics per
-    instance — not assumed. Instances without the agent fall back to CPU utilisation.
+    The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent
+    namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent
+    being installed and configured to append EC2 instance dimensions (e.g. via
+    append_dimensions = {"InstanceId": ...}). AWS does not guarantee the InstanceId
+    dimension is present by default; its presence is implementation-dependent. If the
+    metric is absent or the agent is misconfigured, the rule falls back to CPU
+    utilization. Absence of the metric is NOT proof the GPU is idle.

     Multi-GPU handling:
     For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is
     used across all GPU index dimensions. A single active GPU on an 8-GPU instance
     would be averaged away using AVG, producing a misleadingly low reading.

     Confidence:
-    - HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days
-    - MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days
+    - HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days
+    - MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days

     IAM permissions:
     - ec2:DescribeInstances
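
The docstring above describes probing for per-GPU metric entries and taking the MAX across GPU indices, but the body of _get_max_gpu_utilization is not shown in this commit. A minimal sketch of how such a query could look with boto3; the helper name max_gpu_util_sketch and the daily 86400-second period are illustrative assumptions, not taken from the diff:

from datetime import timedelta

def max_gpu_util_sketch(cloudwatch, gpu_metrics, days, now):
    """Illustrative only: MAX GPU utilization across all GPU-index metric entries."""
    start = now - timedelta(days=days)
    max_util = None
    for metric in gpu_metrics:  # one ListMetrics entry per GPU index
        resp = cloudwatch.get_metric_statistics(
            Namespace="CWAgent",
            MetricName="nvidia_smi_utilization_gpu",
            Dimensions=metric["Dimensions"],  # carries InstanceId plus any per-GPU index dimension
            StartTime=start,
            EndTime=now,
            Period=86400,  # assumed period; the commit does not show the real value
            Statistics=["Maximum"],  # MAX so one busy GPU on an 8-GPU box is not averaged away
        )
        for dp in resp.get("Datapoints", []):
            if max_util is None or dp["Maximum"] > max_util:
                max_util = dp["Maximum"]
    return max_util  # None (no data) is treated by the caller as "not idle"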
@@ -208,53 +212,62 @@ def find_idle_gpu_instances(
         if not _is_gpu_instance(instance_type):
             continue

-        instance_id = inst["InstanceId"]
+        # Normalize InstanceId — skip if missing or empty (spec section 5)
+        instance_id = (inst.get("InstanceId") or "").strip()
+        if not instance_id:
+            continue
+
         tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
         # "spot" | "scheduled" | None (on-demand)
         instance_lifecycle = inst.get("InstanceLifecycle")
         purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
-        launch_time = inst.get("LaunchTime")
-
-        age_days: Optional[int] = None
-        if launch_time:
-            if launch_time.tzinfo is None:
-                launch_time = launch_time.replace(tzinfo=timezone.utc)
-            age_days = (now - launch_time).days

-        # Skip instances younger than idle_days — too new to classify
-        if age_days is not None and age_days < idle_days:
+        # Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3)
+        launch_time = inst.get("LaunchTime")
+        if not launch_time:
+            continue  # missing LaunchTime → SKIP ITEM
+        if launch_time.tzinfo is None:
+            continue  # naive timestamp is not tz-aware UTC → SKIP ITEM
+        age_days = (now - launch_time).days
+        if age_days < 0:
+            continue  # future LaunchTime → SKIP ITEM
+
+        # Skip instances younger than effective_idle_days — too new to classify
+        if age_days < idle_days:
             continue

         # Probe for GPU metrics — single ListMetrics call reused for stats
         gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id)

         if gpu_metrics:
-            max_gpu_util = _get_max_gpu_utilisation(
+            max_gpu_util = _get_max_gpu_utilization(
                 cloudwatch, gpu_metrics, idle_days, now
             )
             if max_gpu_util is None or max_gpu_util >= gpu_threshold:
                 continue
             confidence = ConfidenceLevel.HIGH
-            idle_signal = "gpu_utilisation"
+            idle_signal = "gpu_utilization"
             util_value = max_gpu_util
-            util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
+            util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
         else:
-            avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
-            if avg_cpu is None or avg_cpu >= cpu_threshold:
+            max_cpu = _get_max_daily_cpu_utilization(
+                cloudwatch, instance_id, idle_days, now
+            )
+            if max_cpu is None or max_cpu >= cpu_threshold:
                 continue
             # CPU fallback is a weak heuristic for GPU workloads:
-            # accelerator utilisation is invisible to CPU metrics, so a GPU
+            # accelerator utilization is invisible to CPU metrics, so a GPU
             # instance running a compute-bound model can show near-zero CPU
             # while doing real work. Confidence is capped at MEDIUM to reflect
             # this limitation. Absence of the CWAgent GPU metric is NOT proof
-            # that the GPU is idle — the agent may simply not be installed.
+            # that the GPU is idle — the agent may be absent or misconfigured.
             confidence = ConfidenceLevel.MEDIUM
-            idle_signal = "cpu_utilisation_fallback"
-            util_value = avg_cpu
+            idle_signal = "cpu_utilization_fallback"
+            util_value = max_cpu
             util_label = (
-                f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
+                f"Peak daily CPU utilization: {max_cpu:.1f}% "
                 f"(threshold: {cpu_threshold}%) — "
-                f"heuristic only; GPU/accelerator utilisation not directly measured"
+                f"heuristic only; GPU/accelerator utilization not directly measured"
             )

         monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
@@ -272,15 +285,14 @@ def find_idle_gpu_instances(
             f"Instance type: {instance_type} (GPU/accelerator family)",
             f"Purchasing model: {purchasing_model}",
             util_label,
+            f"Instance age: {age_days} days",
         ]
-        if age_days is not None:
-            signals.append(f"Instance age: {age_days} days")
         if not gpu_metrics:
             if _is_neuron_instance(instance_type):
                 signals.append(
                     "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
                     "applicable; CPU used as heuristic fallback; confidence MEDIUM. "
-                    "Neuron utilisation requires AWS Neuron SDK metrics."
+                    "Neuron utilization requires AWS Neuron SDK metrics."
                 )
             else:
                 signals.append(
@@ -289,11 +301,18 @@ def find_idle_gpu_instances(
                     "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
                 )

+        # signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1)
         not_checked = [
-            "GPU/accelerator utilisation (not directly measurable without CWAgent)",
             "Scheduled batch jobs that run outside the observation window",
             "Planned future use",
         ]
+        if not gpu_metrics:
+            not_checked.insert(
+                0,
+                "Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not "
+                "discoverable in CloudWatch (CWAgent may be absent or not configured with "
+                "InstanceId dimension); absence of the metric does not confirm the GPU is idle",
+            )
         if purchasing_model == "spot":
             not_checked.append(
                 "Spot interruption history — Spot instances may appear idle "
@@ -307,6 +326,17 @@ def find_idle_gpu_instances(
             )

         metric_label = "GPU" if gpu_metrics else "CPU (fallback)"
+        if gpu_metrics:
+            reason = (
+                f"GPU EC2 instance has low GPU utilization "
+                f"({util_value:.1f}%) over {idle_days} days"
+            )
+        else:
+            reason = (
+                f"GPU EC2 instance shows low CPU proxy signal "
+                f"({util_value:.1f}%) over {idle_days} days — "
+                f"GPU activity not directly measured"
+            )
         findings.append(
             Finding(
                 provider="aws",
@@ -316,22 +346,18 @@ def find_idle_gpu_instances(
                 region=region,
                 estimated_monthly_cost_usd=monthly_cost,
                 title=(
-                    f"Idle GPU EC2 Instance ({metric_label} utilisation "
+                    f"Idle GPU EC2 Instance ({metric_label} utilization "
                     f"<{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"over {idle_days} days)"
                 ),
                 summary=(
                     f"EC2 instance '{name_tag}' ({instance_type}) has had "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
+                    f"{'GPU' if gpu_metrics else 'CPU'} utilization below "
                     f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"for {idle_days} days while running, incurring "
                     f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
                 ),
-                reason=(
-                    f"GPU EC2 instance has low "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation "
-                    f"({util_value:.1f}%) for {idle_days} days"
-                ),
+                reason=reason,
                 risk=risk,
                 confidence=confidence,
                 detected_at=now,
@@ -340,11 +366,11 @@ def find_idle_gpu_instances(
                     "instance_id": instance_id,
                     "instance_type": instance_type,
                     "name": name_tag,
-                    "age_days": (age_days if age_days is not None else "unknown"),
+                    "age_days": age_days,
                     "idle_days_threshold": idle_days,
                     "idle_ratio": idle_ratio,
                     "idle_signal": idle_signal,
-                    "utilisation_pct": round(util_value, 2),
+                    "utilization_pct": round(util_value, 2),
                     "purchasing_model": purchasing_model,
                     "gpu_metric_available": bool(gpu_metrics),
                     "gpu_metric_note": (
@@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list:
     """
     Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace.

-    Returns the Metrics list (one entry per GPU index) so the caller can reuse it
+    Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to
+    500 results per call). Returns all Metrics entries so the caller can reuse them
     for GetMetricStatistics without a second ListMetrics call. Returns [] on any error.
     """
+    metrics: list = []
+    kwargs: dict = {
+        "Namespace": "CWAgent",
+        "MetricName": "nvidia_smi_utilization_gpu",
+        "Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
+    }
     try:
-        resp = cloudwatch.list_metrics(
-            Namespace="CWAgent",
-            MetricName="nvidia_smi_utilization_gpu",
-            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
-        )
-        return resp.get("Metrics", [])
+        while True:
+            resp = cloudwatch.list_metrics(**kwargs)
+            metrics.extend(resp.get("Metrics", []))
+            next_token = resp.get("NextToken")
+            if not next_token:
+                break
+            kwargs["NextToken"] = next_token
     except Exception:
         return []
+    return metrics


-def _get_max_gpu_utilisation(
+def _get_max_gpu_utilization(
     cloudwatch, gpu_metrics: list, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the maximum GPU utilisation across all GPU indices over the window.
+    Return the maximum GPU utilization across all GPU indices over the window.

     Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second
     ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU
@@ -440,17 +475,17 @@ def _get_max_gpu_utilisation(
     return max_util


-def _get_avg_cpu_utilisation(
+def _get_max_daily_cpu_utilization(
     cloudwatch, instance_id: str, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
+    Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization.

-    Uses Maximum statistic per day and returns the highest daily peak. This avoids
-    flagging burst workloads where a short but significant CPU spike would be averaged
-    away — if the max CPU across any day is below threshold, the instance is truly idle.
+    Uses Maximum statistic at daily (86400s) period and returns the highest value
+    across all returned datapoints (spec section 6.2). This avoids flagging burst workloads
+    where a short but significant CPU spike would be averaged away.

-    Returns None on error — caller treats None as "not idle" (safe default).
+    Returns None on error or no datapoints — caller treats None as "not idle" (safe default).
     """
     start = now - timedelta(days=days)
     try:
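
The diff is truncated inside _get_max_daily_cpu_utilization, so the rest of its body is not shown. A minimal sketch of the AWS/EC2 CPUUtilization query the updated docstring describes; the helper name max_daily_cpu_sketch is illustrative and the body is an assumption, not taken from the commit:

from datetime import timedelta

def max_daily_cpu_sketch(cloudwatch, instance_id, days, now):
    """Illustrative only: highest daily CPU peak over the window, per the docstring."""
    start = now - timedelta(days=days)
    try:
        resp = cloudwatch.get_metric_statistics(
            Namespace="AWS/EC2",
            MetricName="CPUUtilization",
            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
            StartTime=start,
            EndTime=now,
            Period=86400,  # daily buckets (86400s), as described
            Statistics=["Maximum"],  # peak within each day
        )
    except Exception:
        return None  # caller treats None as "not idle"
    datapoints = resp.get("Datapoints", [])
    if not datapoints:
        return None  # no datapoints also maps to None
    return max(dp["Maximum"] for dp in datapoints)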
