Skip to content

Commit f933b14

Browse files
authored
AWS fixes around a few rules (#156)
1 parent 71ac012 commit f933b14

16 files changed

Lines changed: 1839 additions & 513 deletions

cleancloud/providers/aws/rules/cloudwatch_inactive.py

Lines changed: 0 additions & 70 deletions
This file was deleted.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from datetime import datetime, timezone
2+
from typing import List, Optional
3+
4+
import boto3
5+
6+
from cleancloud.core.confidence import ConfidenceLevel
7+
from cleancloud.core.evidence import Evidence
8+
from cleancloud.core.finding import Finding
9+
from cleancloud.core.risk import RiskLevel
10+
11+
# Log groups newer than this are skipped — noise-reduction heuristic, not an AWS rule.
12+
# New groups may not have had time for an operator to review and configure retention.
13+
_MIN_AGE_DAYS = 7
14+
15+
# Approximate CloudWatch Logs storage cost per GB-month.
16+
# This is the us-east-1 rate as of 2024; actual cost varies by region.
17+
# Use only as an order-of-magnitude estimate, not a billing figure.
18+
_STORAGE_COST_PER_GB_APPROX = 0.03
19+
20+
# Risk thresholds by stored size
21+
_HIGH_RISK_GB = 1.0 # ≥ 1 GB stored → HIGH (significant cost + compliance exposure)
22+
# < 1 GB stored → MEDIUM; 0 bytes stored → LOW (no current cost, but policy gap still flagged)
23+
24+
25+
def find_cloudwatch_logs_no_retention(
    session: boto3.Session,
    region: str,
) -> List[Finding]:
    """
    Find CloudWatch log groups with no retention policy (logs never expire).

    This is a hygiene rule, not an idle/activity rule. It flags log groups where
    retentionInDays is unset, meaning logs accumulate indefinitely and storage costs
    grow without bound.

    Notes on accuracy:
    - storedBytes is eventually consistent and may lag actual ingestion by hours.
      The cost estimate is therefore approximate and should not be used for billing.
    - Log groups newer than 7 days are skipped as a noise-reduction heuristic.
    - Infinite retention may be intentional for audit/security/compliance logs —
      always review before acting on findings from this rule.
    - Zero storedBytes does not mean no future cost risk; active log groups can
      grow rapidly once ingestion begins.

    Risk is dynamic based on stored data size:
    - HIGH: ≥ 1 GB stored (significant ongoing cost + likely compliance exposure)
    - MEDIUM: > 0 bytes but < 1 GB (growing cost, policy gap)
    - LOW: 0 bytes stored (no current cost, but hygiene issue)

    IAM permissions:
    - logs:DescribeLogGroups

    :param session: boto3 session used to create the CloudWatch Logs client.
    :param region: AWS region to scan.
    :return: One Finding per log group with no retention policy.
    """
    logs = session.client("logs", region_name=region)
    paginator = logs.get_paginator("describe_log_groups")

    findings: List[Finding] = []
    now = datetime.now(timezone.utc)

    for page in paginator.paginate():
        for lg in page.get("logGroups", []):
            retention_days = lg.get("retentionInDays")  # None = never expire

            # A set retention policy means the group is compliant — skip it.
            if retention_days is not None:
                continue

            # Noise-reduction heuristic: skip recently created log groups.
            # This is NOT an AWS-defined behavior — new groups may simply not have
            # been reviewed yet. Adjust _MIN_AGE_DAYS if this produces too much noise.
            creation_time_ms = lg.get("creationTime")  # epoch milliseconds
            if creation_time_ms:
                creation_time = datetime.fromtimestamp(creation_time_ms / 1000, tz=timezone.utc)
                age_days = (now - creation_time).days
                if age_days < _MIN_AGE_DAYS:
                    continue
            else:
                # creationTime absent — age unknown; do not apply the age gate.
                age_days = None

            stored_bytes = lg.get("storedBytes") or 0
            stored_gb = stored_bytes / (1024**3)

            # storedBytes is eventually consistent — cost estimate may lag reality.
            # None when nothing is stored; may round to 0.00 for tiny groups.
            monthly_storage_cost: Optional[float] = (
                round(stored_gb * _STORAGE_COST_PER_GB_APPROX, 2) if stored_bytes > 0 else None
            )

            # Risk is proportional to stored size
            if stored_gb >= _HIGH_RISK_GB:
                risk = RiskLevel.HIGH
            elif stored_bytes > 0:
                risk = RiskLevel.MEDIUM
            else:
                risk = RiskLevel.LOW

            signals_used = [
                "Log group has no retention policy configured (logs never expire)",
            ]
            if age_days is not None:
                signals_used.append(f"Log group is {age_days} days old")
            if stored_bytes > 0:
                signals_used.append(
                    f"Stored data: {stored_gb:.2f} GB "
                    f"(~${monthly_storage_cost:.2f}/month at ~${_STORAGE_COST_PER_GB_APPROX}/GB — "
                    f"region-dependent estimate; storedBytes may lag actual ingestion)"
                )
            else:
                signals_used.append(
                    "Stored data: 0 bytes (storedBytes may lag; active groups can grow rapidly)"
                )

            evidence = Evidence(
                signals_used=signals_used,
                signals_not_checked=[
                    "Recent ingestion activity (not checked — this is a hygiene rule)",
                    "Intentional retention for audit, security, or compliance logs",
                    "Application-level usage",
                    "Future ingestion volume",
                ],
                time_window=None,
            )

            findings.append(
                Finding(
                    provider="aws",
                    rule_id="aws.cloudwatch.logs.infinite_retention",
                    resource_type="aws.cloudwatch.log_group",
                    resource_id=lg["logGroupName"],
                    region=region,
                    estimated_monthly_cost_usd=monthly_storage_cost,
                    title="CloudWatch log group with infinite retention",
                    summary=(
                        "Log group has no retention policy — logs accumulate indefinitely"
                        + (f" ({stored_gb:.2f} GB stored)" if stored_bytes > 0 else "")
                    ),
                    reason="Retention is not set (logs never expire)",
                    risk=risk,
                    confidence=ConfidenceLevel.MEDIUM,  # conservative — no activity check
                    detected_at=now,
                    evidence=evidence,
                    details={
                        "stored_bytes": stored_bytes,
                        "stored_gb": round(stored_gb, 4),
                        "stored_bytes_note": "eventually consistent — may lag actual ingestion",
                        "retention_days": retention_days,
                        "age_days": age_days,
                        "age_gate_note": f"groups < {_MIN_AGE_DAYS} days old are skipped (noise-reduction heuristic)",
                        # BUG FIX: was `if monthly_storage_cost` (truthiness). A group
                        # whose estimate rounds to 0.00 is falsy but NOT "no data" —
                        # it was mislabeled "negligible" while signals_used reported
                        # "~$0.00/month". `is not None` matches the stored_bytes > 0
                        # invariant under which the estimate was computed.
                        "estimated_monthly_storage_cost": (
                            f"~${monthly_storage_cost:.2f}/month (approx, region-dependent)"
                            if monthly_storage_cost is not None
                            else "negligible now — active groups can grow rapidly"
                        ),
                    },
                )
            )

    return findings

cleancloud/providers/aws/rules/ec2_gpu_idle.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
"p4d.",
2525
"p4de.",
2626
"p5.",
27+
"p5en.", # H100 + higher network bandwidth than p5
28+
"p6.", # NVIDIA B200 (Blackwell)
2729
"g4dn.",
2830
"g4ad.",
2931
"g5.",
@@ -208,6 +210,9 @@ def find_idle_gpu_instances(
208210

209211
instance_id = inst["InstanceId"]
210212
tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
213+
# "spot" | "scheduled" | None (on-demand)
214+
instance_lifecycle = inst.get("InstanceLifecycle")
215+
purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
211216
launch_time = inst.get("LaunchTime")
212217

213218
age_days: Optional[int] = None
@@ -237,10 +242,20 @@ def find_idle_gpu_instances(
237242
avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
238243
if avg_cpu is None or avg_cpu >= cpu_threshold:
239244
continue
245+
# CPU fallback is a weak heuristic for GPU workloads:
246+
# accelerator utilisation is invisible to CPU metrics, so a GPU
247+
# instance running a compute-bound model can show near-zero CPU
248+
# while doing real work. Confidence is capped at MEDIUM to reflect
249+
# this limitation. Absence of the CWAgent GPU metric is NOT proof
250+
# that the GPU is idle — the agent may simply not be installed.
240251
confidence = ConfidenceLevel.MEDIUM
241252
idle_signal = "cpu_utilisation_fallback"
242253
util_value = avg_cpu
243-
util_label = f"Avg CPU utilisation: {avg_cpu:.1f}% (threshold: {cpu_threshold}%, GPU metric unavailable)"
254+
util_label = (
255+
f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
256+
f"(threshold: {cpu_threshold}%) — "
257+
f"heuristic only; GPU/accelerator utilisation not directly measured"
258+
)
244259

245260
monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
246261
idle_ratio = round(age_days / idle_days, 2) if (age_days and idle_days) else 0.0
@@ -255,29 +270,35 @@ def find_idle_gpu_instances(
255270
signals = [
256271
"Instance state: running",
257272
f"Instance type: {instance_type} (GPU/accelerator family)",
273+
f"Purchasing model: {purchasing_model}",
258274
util_label,
259275
]
260276
if age_days is not None:
261277
signals.append(f"Instance age: {age_days} days")
262278
if not gpu_metrics:
263279
if _is_neuron_instance(instance_type):
264280
signals.append(
265-
"Neuron instance (Trainium/Inferentia) — no NVIDIA GPU metric "
266-
"available by design; CPU utilisation used as fallback signal; "
267-
"confidence capped at MEDIUM"
281+
"Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
282+
"applicable; CPU used as heuristic fallback; confidence MEDIUM. "
283+
"Neuron utilisation requires AWS Neuron SDK metrics."
268284
)
269285
else:
270286
signals.append(
271-
"NVIDIA CloudWatch agent not detected — GPU metric unavailable; "
272-
"CPU utilisation used as fallback signal; confidence capped at MEDIUM"
287+
"CWAgent nvidia_smi_utilization_gpu metric not found — "
288+
"this may mean the CloudWatch agent is not installed, not that "
289+
"the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
273290
)
274291

275292
not_checked = [
276-
"GPU processes not visible without nvidia-smi or DCGM agent",
293+
"GPU/accelerator utilisation (not directly measurable without CWAgent)",
277294
"Scheduled batch jobs that run outside the observation window",
278295
"Planned future use",
279-
"Spot instance hibernation state",
280296
]
297+
if purchasing_model == "spot":
298+
not_checked.append(
299+
"Spot interruption history — Spot instances may appear idle "
300+
"between interruption and relaunch"
301+
)
281302

282303
evidence = Evidence(
283304
signals_used=signals,
@@ -304,7 +325,7 @@ def find_idle_gpu_instances(
304325
f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
305326
f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
306327
f"for {idle_days} days while running, incurring "
307-
f"continuous charges (~${monthly_cost:,.0f}/month)."
328+
f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
308329
),
309330
reason=(
310331
f"GPU EC2 instance has low "
@@ -324,11 +345,16 @@ def find_idle_gpu_instances(
324345
"idle_ratio": idle_ratio,
325346
"idle_signal": idle_signal,
326347
"utilisation_pct": round(util_value, 2),
348+
"purchasing_model": purchasing_model,
327349
"gpu_metric_available": bool(gpu_metrics),
350+
"gpu_metric_note": (
351+
"agent-dependent (CWAgent nvidia_smi_utilization_gpu); "
352+
"absence does not confirm GPU is idle"
353+
),
328354
"gpu_threshold_pct": gpu_threshold,
329355
"cpu_threshold_pct": cpu_threshold,
330356
"estimated_monthly_cost": f"~${monthly_cost:,.0f}/month",
331-
"cost_basis": "us-east-1 on-demand",
357+
"cost_basis": "us-east-1 on-demand (region-dependent estimate)",
332358
"tags": tags,
333359
},
334360
)
@@ -418,7 +444,12 @@ def _get_avg_cpu_utilisation(
418444
cloudwatch, instance_id: str, days: int, now: datetime
419445
) -> Optional[float]:
420446
"""
421-
Return the average CPU utilisation over the window using AWS/EC2 CPUUtilization.
447+
Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
448+
449+
Uses Maximum statistic per day and returns the highest daily peak. This avoids
450+
flagging burst workloads where a short but significant CPU spike would be averaged
451+
away — if the max CPU across any day is below threshold, the instance is truly idle.
452+
422453
Returns None on error — caller treats None as "not idle" (safe default).
423454
"""
424455
start = now - timedelta(days=days)
@@ -430,11 +461,11 @@ def _get_avg_cpu_utilisation(
430461
StartTime=start,
431462
EndTime=now,
432463
Period=86400, # 1-day granularity
433-
Statistics=["Average"],
464+
Statistics=["Maximum"],
434465
)
435466
datapoints = resp.get("Datapoints", [])
436467
if not datapoints:
437468
return None
438-
return sum(dp["Average"] for dp in datapoints) / len(datapoints)
469+
return max(dp["Maximum"] for dp in datapoints)
439470
except ClientError:
440471
return None

0 commit comments

Comments
 (0)