Skip to content

Commit f933b14

Browse files
authored
AWS fixes around a few rules (#156)
1 parent 71ac012 commit f933b14

16 files changed

Lines changed: 1839 additions & 513 deletions

cleancloud/providers/aws/rules/cloudwatch_inactive.py

Lines changed: 0 additions & 70 deletions
This file was deleted.
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
from datetime import datetime, timezone
2+
from typing import List, Optional
3+
4+
import boto3
5+
6+
from cleancloud.core.confidence import ConfidenceLevel
7+
from cleancloud.core.evidence import Evidence
8+
from cleancloud.core.finding import Finding
9+
from cleancloud.core.risk import RiskLevel
10+
11+
# Log groups newer than this are skipped — noise-reduction heuristic, not an AWS rule.
12+
# New groups may not have had time for an operator to review and configure retention.
13+
_MIN_AGE_DAYS = 7
14+
15+
# Approximate CloudWatch Logs storage cost per GB-month.
16+
# This is the us-east-1 rate as of 2024; actual cost varies by region.
17+
# Use only as an order-of-magnitude estimate, not a billing figure.
18+
_STORAGE_COST_PER_GB_APPROX = 0.03
19+
20+
# Risk thresholds by stored size
21+
_HIGH_RISK_GB = 1.0 # ≥ 1 GB stored → HIGH (significant cost + compliance exposure)
22+
# < 1 GB stored → MEDIUM; 0 bytes stored → LOW (no current cost, but policy gap still flagged)
23+
24+
25+
def find_cloudwatch_logs_no_retention(
    session: boto3.Session,
    region: str,
) -> List[Finding]:
    """
    Find CloudWatch log groups with no retention policy (logs never expire).

    This is a hygiene rule, not an idle/activity rule. It flags log groups where
    retentionInDays is unset, meaning logs accumulate indefinitely and storage costs
    grow without bound.

    Notes on accuracy:
    - storedBytes is eventually consistent and may lag actual ingestion by hours.
      The cost estimate is therefore approximate and should not be used for billing.
    - Log groups newer than 7 days are skipped as a noise-reduction heuristic.
    - Infinite retention may be intentional for audit/security/compliance logs —
      always review before acting on findings from this rule.
    - Zero storedBytes does not mean no future cost risk; active log groups can
      grow rapidly once ingestion begins.

    Risk is dynamic based on stored data size:
    - HIGH: ≥ 1 GB stored (significant ongoing cost + likely compliance exposure)
    - MEDIUM: > 0 bytes but < 1 GB (growing cost, policy gap)
    - LOW: 0 bytes stored (no current cost, but hygiene issue)

    IAM permissions:
    - logs:DescribeLogGroups

    :param session: boto3 session used to create the CloudWatch Logs client.
    :param region: AWS region to scan.
    :return: One Finding per log group with no retention policy.
    """
    logs = session.client("logs", region_name=region)
    paginator = logs.get_paginator("describe_log_groups")

    findings: List[Finding] = []
    now = datetime.now(timezone.utc)

    for page in paginator.paginate():
        for lg in page.get("logGroups", []):
            retention_days = lg.get("retentionInDays")  # None = never expire

            # A set retention policy means the group is compliant — skip it.
            if retention_days is not None:
                continue

            # Noise-reduction heuristic: skip recently created log groups.
            # This is NOT an AWS-defined behavior — new groups may simply not have
            # been reviewed yet. Adjust _MIN_AGE_DAYS if this produces too much noise.
            creation_time_ms = lg.get("creationTime")  # epoch milliseconds
            if creation_time_ms:
                creation_time = datetime.fromtimestamp(creation_time_ms / 1000, tz=timezone.utc)
                age_days = (now - creation_time).days
                if age_days < _MIN_AGE_DAYS:
                    continue
            else:
                # creationTime absent — age unknown; do not apply the age gate.
                age_days = None

            stored_bytes = lg.get("storedBytes") or 0
            stored_gb = stored_bytes / (1024**3)

            # storedBytes is eventually consistent — cost estimate may lag reality.
            # None when nothing is stored; may round to 0.00 for tiny groups.
            monthly_storage_cost: Optional[float] = (
                round(stored_gb * _STORAGE_COST_PER_GB_APPROX, 2) if stored_bytes > 0 else None
            )

            # Risk is proportional to stored size
            if stored_gb >= _HIGH_RISK_GB:
                risk = RiskLevel.HIGH
            elif stored_bytes > 0:
                risk = RiskLevel.MEDIUM
            else:
                risk = RiskLevel.LOW

            signals_used = [
                "Log group has no retention policy configured (logs never expire)",
            ]
            if age_days is not None:
                signals_used.append(f"Log group is {age_days} days old")
            if stored_bytes > 0:
                signals_used.append(
                    f"Stored data: {stored_gb:.2f} GB "
                    f"(~${monthly_storage_cost:.2f}/month at ~${_STORAGE_COST_PER_GB_APPROX}/GB — "
                    f"region-dependent estimate; storedBytes may lag actual ingestion)"
                )
            else:
                signals_used.append(
                    "Stored data: 0 bytes (storedBytes may lag; active groups can grow rapidly)"
                )

            evidence = Evidence(
                signals_used=signals_used,
                signals_not_checked=[
                    "Recent ingestion activity (not checked — this is a hygiene rule)",
                    "Intentional retention for audit, security, or compliance logs",
                    "Application-level usage",
                    "Future ingestion volume",
                ],
                time_window=None,
            )

            findings.append(
                Finding(
                    provider="aws",
                    rule_id="aws.cloudwatch.logs.infinite_retention",
                    resource_type="aws.cloudwatch.log_group",
                    resource_id=lg["logGroupName"],
                    region=region,
                    estimated_monthly_cost_usd=monthly_storage_cost,
                    title="CloudWatch log group with infinite retention",
                    summary=(
                        "Log group has no retention policy — logs accumulate indefinitely"
                        + (f" ({stored_gb:.2f} GB stored)" if stored_bytes > 0 else "")
                    ),
                    reason="Retention is not set (logs never expire)",
                    risk=risk,
                    confidence=ConfidenceLevel.MEDIUM,  # conservative — no activity check
                    detected_at=now,
                    evidence=evidence,
                    details={
                        "stored_bytes": stored_bytes,
                        "stored_gb": round(stored_gb, 4),
                        "stored_bytes_note": "eventually consistent — may lag actual ingestion",
                        "retention_days": retention_days,
                        "age_days": age_days,
                        "age_gate_note": f"groups < {_MIN_AGE_DAYS} days old are skipped (noise-reduction heuristic)",
                        # BUG FIX: was `if monthly_storage_cost` (truthiness). A group
                        # whose estimate rounds to 0.00 is falsy but NOT "no data" —
                        # it was mislabeled "negligible" while signals_used reported
                        # "~$0.00/month". `is not None` matches the stored_bytes > 0
                        # invariant under which the estimate was computed.
                        "estimated_monthly_storage_cost": (
                            f"~${monthly_storage_cost:.2f}/month (approx, region-dependent)"
                            if monthly_storage_cost is not None
                            else "negligible now — active groups can grow rapidly"
                        ),
                    },
                )
            )

    return findings

cleancloud/providers/aws/rules/ec2_gpu_idle.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
"p4d.",
2525
"p4de.",
2626
"p5.",
27+
"p5en.", # H100 + higher network bandwidth than p5
28+
"p6.", # NVIDIA B200 (Blackwell)
2729
"g4dn.",
2830
"g4ad.",
2931
"g5.",
@@ -208,6 +210,9 @@ def find_idle_gpu_instances(
208210

209211
instance_id = inst["InstanceId"]
210212
tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
213+
# "spot" | "scheduled" | None (on-demand)
214+
instance_lifecycle = inst.get("InstanceLifecycle")
215+
purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
211216
launch_time = inst.get("LaunchTime")
212217

213218
age_days: Optional[int] = None
@@ -237,10 +242,20 @@ def find_idle_gpu_instances(
237242
avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
238243
if avg_cpu is None or avg_cpu >= cpu_threshold:
239244
continue
245+
# CPU fallback is a weak heuristic for GPU workloads:
246+
# accelerator utilisation is invisible to CPU metrics, so a GPU
247+
# instance running a compute-bound model can show near-zero CPU
248+
# while doing real work. Confidence is capped at MEDIUM to reflect
249+
# this limitation. Absence of the CWAgent GPU metric is NOT proof
250+
# that the GPU is idle — the agent may simply not be installed.
240251
confidence = ConfidenceLevel.MEDIUM
241252
idle_signal = "cpu_utilisation_fallback"
242253
util_value = avg_cpu
243-
util_label = f"Avg CPU utilisation: {avg_cpu:.1f}% (threshold: {cpu_threshold}%, GPU metric unavailable)"
254+
util_label = (
255+
f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
256+
f"(threshold: {cpu_threshold}%) — "
257+
f"heuristic only; GPU/accelerator utilisation not directly measured"
258+
)
244259

245260
monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
246261
idle_ratio = round(age_days / idle_days, 2) if (age_days and idle_days) else 0.0
@@ -255,29 +270,35 @@ def find_idle_gpu_instances(
255270
signals = [
256271
"Instance state: running",
257272
f"Instance type: {instance_type} (GPU/accelerator family)",
273+
f"Purchasing model: {purchasing_model}",
258274
util_label,
259275
]
260276
if age_days is not None:
261277
signals.append(f"Instance age: {age_days} days")
262278
if not gpu_metrics:
263279
if _is_neuron_instance(instance_type):
264280
signals.append(
265-
"Neuron instance (Trainium/Inferentia) — no NVIDIA GPU metric "
266-
"available by design; CPU utilisation used as fallback signal; "
267-
"confidence capped at MEDIUM"
281+
"Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
282+
"applicable; CPU used as heuristic fallback; confidence MEDIUM. "
283+
"Neuron utilisation requires AWS Neuron SDK metrics."
268284
)
269285
else:
270286
signals.append(
271-
"NVIDIA CloudWatch agent not detected — GPU metric unavailable; "
272-
"CPU utilisation used as fallback signal; confidence capped at MEDIUM"
287+
"CWAgent nvidia_smi_utilization_gpu metric not found — "
288+
"this may mean the CloudWatch agent is not installed, not that "
289+
"the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
273290
)
274291

275292
not_checked = [
276-
"GPU processes not visible without nvidia-smi or DCGM agent",
293+
"GPU/accelerator utilisation (not directly measurable without CWAgent)",
277294
"Scheduled batch jobs that run outside the observation window",
278295
"Planned future use",
279-
"Spot instance hibernation state",
280296
]
297+
if purchasing_model == "spot":
298+
not_checked.append(
299+
"Spot interruption history — Spot instances may appear idle "
300+
"between interruption and relaunch"
301+
)
281302

282303
evidence = Evidence(
283304
signals_used=signals,
@@ -304,7 +325,7 @@ def find_idle_gpu_instances(
304325
f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
305326
f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
306327
f"for {idle_days} days while running, incurring "
307-
f"continuous charges (~${monthly_cost:,.0f}/month)."
328+
f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
308329
),
309330
reason=(
310331
f"GPU EC2 instance has low "
@@ -324,11 +345,16 @@ def find_idle_gpu_instances(
324345
"idle_ratio": idle_ratio,
325346
"idle_signal": idle_signal,
326347
"utilisation_pct": round(util_value, 2),
348+
"purchasing_model": purchasing_model,
327349
"gpu_metric_available": bool(gpu_metrics),
350+
"gpu_metric_note": (
351+
"agent-dependent (CWAgent nvidia_smi_utilization_gpu); "
352+
"absence does not confirm GPU is idle"
353+
),
328354
"gpu_threshold_pct": gpu_threshold,
329355
"cpu_threshold_pct": cpu_threshold,
330356
"estimated_monthly_cost": f"~${monthly_cost:,.0f}/month",
331-
"cost_basis": "us-east-1 on-demand",
357+
"cost_basis": "us-east-1 on-demand (region-dependent estimate)",
332358
"tags": tags,
333359
},
334360
)
@@ -418,7 +444,12 @@ def _get_avg_cpu_utilisation(
418444
cloudwatch, instance_id: str, days: int, now: datetime
419445
) -> Optional[float]:
420446
"""
421-
Return the average CPU utilisation over the window using AWS/EC2 CPUUtilization.
447+
Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
448+
449+
Uses Maximum statistic per day and returns the highest daily peak. This avoids
450+
flagging burst workloads where a short but significant CPU spike would be averaged
451+
away — if the max CPU across any day is below threshold, the instance is truly idle.
452+
422453
Returns None on error — caller treats None as "not idle" (safe default).
423454
"""
424455
start = now - timedelta(days=days)
@@ -430,11 +461,11 @@ def _get_avg_cpu_utilisation(
430461
StartTime=start,
431462
EndTime=now,
432463
Period=86400, # 1-day granularity
433-
Statistics=["Average"],
464+
Statistics=["Maximum"],
434465
)
435466
datapoints = resp.get("Datapoints", [])
436467
if not datapoints:
437468
return None
438-
return sum(dp["Average"] for dp in datapoints) / len(datapoints)
469+
return max(dp["Maximum"] for dp in datapoints)
439470
except ClientError:
440471
return None

0 commit comments

Comments
 (0)