
Commit 35cc09e

Added missing aws.ec2.gpu.idle spec and minor fixes to gcp.vertex.workbench.idle rule (#172)
1 parent 72e5f4c commit 35cc09e

7 files changed: 1,294 additions and 360 deletions

File tree

cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py (94 additions, 59 deletions)
@@ -132,7 +132,7 @@
 }
 _DEFAULT_MONTHLY_COST = 600.0

-# GPU utilisation thresholds
+# GPU utilization thresholds
 _GPU_UTIL_THRESHOLD_PCT = 5.0  # below this = idle (when GPU metric available)
 _CPU_UTIL_THRESHOLD_PCT = 10.0  # below this = idle (CPU fallback)

@@ -151,7 +151,7 @@ def find_idle_gpu_instances(
     cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT,
 ) -> List[Finding]:
     """
-    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation.
+    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization.

     GPU instances (raw EC2, outside SageMaker) incur continuous charges while running
     regardless of whether GPUs are being utilised. A p4d.24xlarge costs ~$23K/month
@@ -162,24 +162,28 @@ def find_idle_gpu_instances(
     - Instance state is running
     - Instance type is a known GPU/accelerator family
     - Instance is older than idle_days (avoids flagging newly launched instances)
-    - GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA
-      CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace)
-    - OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback,
+    - GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the
+      nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch)
+    - OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback,
       used when GPU metrics are not available — CPU alone is a weaker signal)

     GPU metric detection:
-    The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent
-    namespace with an InstanceId dimension. Availability is probed via ListMetrics per
-    instance — not assumed. Instances without the agent fall back to CPU utilisation.
+    The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent
+    namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent
+    being installed and configured to append EC2 instance dimensions (e.g. via
+    append_dimensions = {"InstanceId": ...}). AWS does not guarantee the InstanceId
+    dimension is present by default; its presence is implementation-dependent. If the
+    metric is absent or the agent is misconfigured, the rule falls back to CPU
+    utilization. Absence of the metric is NOT proof the GPU is idle.

     Multi-GPU handling:
     For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is
     used across all GPU index dimensions. A single active GPU on an 8-GPU instance
     would be averaged away using AVG, producing a misleadingly low reading.

     Confidence:
-    - HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days
-    - MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days
+    - HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days
+    - MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days

     IAM permissions:
     - ec2:DescribeInstances
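
The docstring above describes probing for per-GPU metric entries and taking the MAX across GPU indices, but the body of _get_max_gpu_utilization is not shown in this commit. A minimal sketch of how such a query could look with boto3; the helper name max_gpu_util_sketch and the daily 86400-second period are illustrative assumptions, not taken from the diff:

from datetime import timedelta

def max_gpu_util_sketch(cloudwatch, gpu_metrics, days, now):
    """Illustrative only: MAX GPU utilization across all GPU-index metric entries."""
    start = now - timedelta(days=days)
    max_util = None
    for metric in gpu_metrics:  # one ListMetrics entry per GPU index
        resp = cloudwatch.get_metric_statistics(
            Namespace="CWAgent",
            MetricName="nvidia_smi_utilization_gpu",
            Dimensions=metric["Dimensions"],  # carries InstanceId plus any per-GPU index dimension
            StartTime=start,
            EndTime=now,
            Period=86400,  # assumed period; the commit does not show the real value
            Statistics=["Maximum"],  # MAX so one busy GPU on an 8-GPU box is not averaged away
        )
        for dp in resp.get("Datapoints", []):
            if max_util is None or dp["Maximum"] > max_util:
                max_util = dp["Maximum"]
    return max_util  # None (no data) is treated by the caller as "not idle"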
@@ -208,53 +212,62 @@ def find_idle_gpu_instances(
         if not _is_gpu_instance(instance_type):
             continue

-        instance_id = inst["InstanceId"]
+        # Normalize InstanceId — skip if missing or empty (spec section 5)
+        instance_id = (inst.get("InstanceId") or "").strip()
+        if not instance_id:
+            continue
+
         tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
         # "spot" | "scheduled" | None (on-demand)
         instance_lifecycle = inst.get("InstanceLifecycle")
         purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
-        launch_time = inst.get("LaunchTime")
-
-        age_days: Optional[int] = None
-        if launch_time:
-            if launch_time.tzinfo is None:
-                launch_time = launch_time.replace(tzinfo=timezone.utc)
-            age_days = (now - launch_time).days

-        # Skip instances younger than idle_days — too new to classify
-        if age_days is not None and age_days < idle_days:
+        # Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3)
+        launch_time = inst.get("LaunchTime")
+        if not launch_time:
+            continue  # missing LaunchTime → SKIP ITEM
+        if launch_time.tzinfo is None:
+            continue  # naive timestamp is not tz-aware UTC → SKIP ITEM
+        age_days = (now - launch_time).days
+        if age_days < 0:
+            continue  # future LaunchTime → SKIP ITEM
+
+        # Skip instances younger than effective_idle_days — too new to classify
+        if age_days < idle_days:
             continue

         # Probe for GPU metrics — single ListMetrics call reused for stats
         gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id)

         if gpu_metrics:
-            max_gpu_util = _get_max_gpu_utilisation(
+            max_gpu_util = _get_max_gpu_utilization(
                 cloudwatch, gpu_metrics, idle_days, now
             )
             if max_gpu_util is None or max_gpu_util >= gpu_threshold:
                 continue
             confidence = ConfidenceLevel.HIGH
-            idle_signal = "gpu_utilisation"
+            idle_signal = "gpu_utilization"
             util_value = max_gpu_util
-            util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
+            util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
         else:
-            avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
-            if avg_cpu is None or avg_cpu >= cpu_threshold:
+            max_cpu = _get_max_daily_cpu_utilization(
+                cloudwatch, instance_id, idle_days, now
+            )
+            if max_cpu is None or max_cpu >= cpu_threshold:
                 continue
             # CPU fallback is a weak heuristic for GPU workloads:
-            # accelerator utilisation is invisible to CPU metrics, so a GPU
+            # accelerator utilization is invisible to CPU metrics, so a GPU
             # instance running a compute-bound model can show near-zero CPU
             # while doing real work. Confidence is capped at MEDIUM to reflect
             # this limitation. Absence of the CWAgent GPU metric is NOT proof
-            # that the GPU is idle — the agent may simply not be installed.
+            # that the GPU is idle — the agent may be absent or misconfigured.
             confidence = ConfidenceLevel.MEDIUM
-            idle_signal = "cpu_utilisation_fallback"
-            util_value = avg_cpu
+            idle_signal = "cpu_utilization_fallback"
+            util_value = max_cpu
             util_label = (
-                f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
+                f"Peak daily CPU utilization: {max_cpu:.1f}% "
                 f"(threshold: {cpu_threshold}%) — "
-                f"heuristic only; GPU/accelerator utilisation not directly measured"
+                f"heuristic only; GPU/accelerator utilization not directly measured"
             )

         monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
@@ -272,15 +285,14 @@ def find_idle_gpu_instances(
             f"Instance type: {instance_type} (GPU/accelerator family)",
             f"Purchasing model: {purchasing_model}",
             util_label,
+            f"Instance age: {age_days} days",
         ]
-        if age_days is not None:
-            signals.append(f"Instance age: {age_days} days")
         if not gpu_metrics:
             if _is_neuron_instance(instance_type):
                 signals.append(
                     "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
                     "applicable; CPU used as heuristic fallback; confidence MEDIUM. "
-                    "Neuron utilisation requires AWS Neuron SDK metrics."
+                    "Neuron utilization requires AWS Neuron SDK metrics."
                 )
             else:
                 signals.append(
@@ -289,11 +301,18 @@ def find_idle_gpu_instances(
                     "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
                 )

+        # signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1)
         not_checked = [
-            "GPU/accelerator utilisation (not directly measurable without CWAgent)",
             "Scheduled batch jobs that run outside the observation window",
             "Planned future use",
         ]
+        if not gpu_metrics:
+            not_checked.insert(
+                0,
+                "Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not "
+                "discoverable in CloudWatch (CWAgent may be absent or not configured with "
+                "InstanceId dimension); absence of the metric does not confirm the GPU is idle",
+            )
         if purchasing_model == "spot":
             not_checked.append(
                 "Spot interruption history — Spot instances may appear idle "
@@ -307,6 +326,17 @@ def find_idle_gpu_instances(
             )

         metric_label = "GPU" if gpu_metrics else "CPU (fallback)"
+        if gpu_metrics:
+            reason = (
+                f"GPU EC2 instance has low GPU utilization "
+                f"({util_value:.1f}%) over {idle_days} days"
+            )
+        else:
+            reason = (
+                f"GPU EC2 instance shows low CPU proxy signal "
+                f"({util_value:.1f}%) over {idle_days} days — "
+                f"GPU activity not directly measured"
+            )
         findings.append(
             Finding(
                 provider="aws",
@@ -316,22 +346,18 @@ def find_idle_gpu_instances(
                 region=region,
                 estimated_monthly_cost_usd=monthly_cost,
                 title=(
-                    f"Idle GPU EC2 Instance ({metric_label} utilisation "
+                    f"Idle GPU EC2 Instance ({metric_label} utilization "
                     f"<{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"over {idle_days} days)"
                 ),
                 summary=(
                     f"EC2 instance '{name_tag}' ({instance_type}) has had "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
+                    f"{'GPU' if gpu_metrics else 'CPU'} utilization below "
                     f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"for {idle_days} days while running, incurring "
                     f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
                 ),
-                reason=(
-                    f"GPU EC2 instance has low "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation "
-                    f"({util_value:.1f}%) for {idle_days} days"
-                ),
+                reason=reason,
                 risk=risk,
                 confidence=confidence,
                 detected_at=now,
@@ -340,11 +366,11 @@ def find_idle_gpu_instances(
                     "instance_id": instance_id,
                     "instance_type": instance_type,
                     "name": name_tag,
-                    "age_days": (age_days if age_days is not None else "unknown"),
+                    "age_days": age_days,
                     "idle_days_threshold": idle_days,
                     "idle_ratio": idle_ratio,
                     "idle_signal": idle_signal,
-                    "utilisation_pct": round(util_value, 2),
+                    "utilization_pct": round(util_value, 2),
                     "purchasing_model": purchasing_model,
                     "gpu_metric_available": bool(gpu_metrics),
                     "gpu_metric_note": (
@@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list:
     """
     Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace.

-    Returns the Metrics list (one entry per GPU index) so the caller can reuse it
+    Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to
+    500 results per call). Returns all Metrics entries so the caller can reuse them
     for GetMetricStatistics without a second ListMetrics call. Returns [] on any error.
     """
+    metrics: list = []
+    kwargs: dict = {
+        "Namespace": "CWAgent",
+        "MetricName": "nvidia_smi_utilization_gpu",
+        "Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
+    }
     try:
-        resp = cloudwatch.list_metrics(
-            Namespace="CWAgent",
-            MetricName="nvidia_smi_utilization_gpu",
-            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
-        )
-        return resp.get("Metrics", [])
+        while True:
+            resp = cloudwatch.list_metrics(**kwargs)
+            metrics.extend(resp.get("Metrics", []))
+            next_token = resp.get("NextToken")
+            if not next_token:
+                break
+            kwargs["NextToken"] = next_token
     except Exception:
         return []
+    return metrics


-def _get_max_gpu_utilisation(
+def _get_max_gpu_utilization(
     cloudwatch, gpu_metrics: list, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the maximum GPU utilisation across all GPU indices over the window.
+    Return the maximum GPU utilization across all GPU indices over the window.

     Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second
     ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU
@@ -440,17 +475,17 @@ def _get_max_gpu_utilisation(
     return max_util


-def _get_avg_cpu_utilisation(
+def _get_max_daily_cpu_utilization(
     cloudwatch, instance_id: str, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
+    Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization.

-    Uses Maximum statistic per day and returns the highest daily peak. This avoids
-    flagging burst workloads where a short but significant CPU spike would be averaged
-    away — if the max CPU across any day is below threshold, the instance is truly idle.
+    Uses Maximum statistic at daily (86400s) period and returns the highest value
+    across all returned datapoints (spec section 6.2). This avoids flagging burst workloads
+    where a short but significant CPU spike would be averaged away.

-    Returns None on error — caller treats None as "not idle" (safe default).
+    Returns None on error or no datapoints — caller treats None as "not idle" (safe default).
     """
     start = now - timedelta(days=days)
     try:
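
The diff is truncated inside _get_max_daily_cpu_utilization, so the rest of its body is not shown. A minimal sketch of the AWS/EC2 CPUUtilization query the updated docstring describes; the helper name max_daily_cpu_sketch is illustrative and the body is an assumption, not taken from the commit:

from datetime import timedelta

def max_daily_cpu_sketch(cloudwatch, instance_id, days, now):
    """Illustrative only: highest daily CPU peak over the window, per the docstring."""
    start = now - timedelta(days=days)
    try:
        resp = cloudwatch.get_metric_statistics(
            Namespace="AWS/EC2",
            MetricName="CPUUtilization",
            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
            StartTime=start,
            EndTime=now,
            Period=86400,  # daily buckets (86400s), as described
            Statistics=["Maximum"],  # peak within each day
        )
    except Exception:
        return None  # caller treats None as "not idle"
    datapoints = resp.get("Datapoints", [])
    if not datapoints:
        return None  # no datapoints also maps to None
    return max(dp["Maximum"] for dp in datapoints)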
