2424 "p4d." ,
2525 "p4de." ,
2626 "p5." ,
27+ "p5en." , # H100 + higher network bandwidth than p5
28+ "p6." , # NVIDIA B200 (Blackwell)
2729 "g4dn." ,
2830 "g4ad." ,
2931 "g5." ,
@@ -208,6 +210,9 @@ def find_idle_gpu_instances(
 
         instance_id = inst["InstanceId"]
         tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
+        # "spot" | "scheduled" | None (on-demand)
+        instance_lifecycle = inst.get("InstanceLifecycle")
+        purchasing_model = instance_lifecycle or "on-demand"
         launch_time = inst.get("LaunchTime")
 
         age_days: Optional[int] = None
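Worth noting for reviewers: `InstanceLifecycle` is simply absent from the describe_instances payload for on-demand instances, which is why the `or "on-demand"` fallback is needed. A toy illustration:

    # Abridged payloads; on-demand instances omit the key entirely.
    spot_inst = {"InstanceId": "i-0abc", "InstanceLifecycle": "spot"}
    ondemand_inst = {"InstanceId": "i-0def"}

    assert (spot_inst.get("InstanceLifecycle") or "on-demand") == "spot"
    assert (ondemand_inst.get("InstanceLifecycle") or "on-demand") == "on-demand"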
@@ -237,10 +242,20 @@ def find_idle_gpu_instances(
         avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
         if avg_cpu is None or avg_cpu >= cpu_threshold:
             continue
+        # CPU fallback is a weak heuristic for GPU workloads:
+        # accelerator utilisation is invisible to CPU metrics, so a GPU
+        # instance running a compute-bound model can show near-zero CPU
+        # while doing real work. Confidence is capped at MEDIUM to reflect
+        # this limitation. Absence of the CWAgent GPU metric is NOT proof
+        # that the GPU is idle — the agent may simply not be installed.
         confidence = ConfidenceLevel.MEDIUM
         idle_signal = "cpu_utilisation_fallback"
         util_value = avg_cpu
-        util_label = f"Avg CPU utilisation: {avg_cpu:.1f}% (threshold: {cpu_threshold}%, GPU metric unavailable)"
+        util_label = (
+            f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
+            f"(threshold: {cpu_threshold}%) — "
+            "heuristic only; GPU/accelerator utilisation not directly measured"
+        )
 
         monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
         idle_ratio = round(age_days / idle_days, 2) if (age_days and idle_days) else 0.0
@@ -255,29 +270,35 @@ def find_idle_gpu_instances(
         signals = [
             "Instance state: running",
             f"Instance type: {instance_type} (GPU/accelerator family)",
+            f"Purchasing model: {purchasing_model}",
             util_label,
         ]
         if age_days is not None:
             signals.append(f"Instance age: {age_days} days")
         if not gpu_metrics:
             if _is_neuron_instance(instance_type):
                 signals.append(
-                    "Neuron instance (Trainium/Inferentia) — no NVIDIA GPU metric "
-                    "available by design; CPU utilisation used as fallback signal; "
-                    "confidence capped at MEDIUM"
+                    "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
+                    "applicable; CPU used as heuristic fallback; confidence MEDIUM. "
+                    "Neuron utilisation requires AWS Neuron SDK metrics."
                 )
             else:
                 signals.append(
-                    "NVIDIA CloudWatch agent not detected — GPU metric unavailable; "
-                    "CPU utilisation used as fallback signal; confidence capped at MEDIUM"
+                    "CWAgent nvidia_smi_utilization_gpu metric not found — "
+                    "this may mean the CloudWatch agent is not installed, not that "
+                    "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
                 )
 
         not_checked = [
-            "GPU processes not visible without nvidia-smi or DCGM agent",
+            "GPU/accelerator utilisation (not directly measurable without CWAgent)",
             "Scheduled batch jobs that run outside the observation window",
             "Planned future use",
-            "Spot instance hibernation state",
         ]
+        if purchasing_model == "spot":
+            not_checked.append(
+                "Spot interruption history — Spot instances may appear idle "
+                "between interruption and relaunch"
+            )
 
         evidence = Evidence(
             signals_used=signals,
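The new Spot caveat could be firmed up in a follow-up by inspecting the Spot request status for the instance; a sketch of one possible check, not part of this PR (assumes the same boto3 `ec2` client used by the caller):

    # Hypothetical follow-up, not in this PR. Status codes such as
    # "marked-for-termination" hint at recent interruptions.
    resp = ec2.describe_spot_instance_requests(
        Filters=[{"Name": "instance-id", "Values": [instance_id]}]
    )
    for req in resp.get("SpotInstanceRequests", []):
        status = req.get("Status", {})
        print(status.get("Code"), status.get("UpdateTime"))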
@@ -304,7 +325,7 @@ def find_idle_gpu_instances(
304325 f"{ 'GPU' if gpu_metrics else 'CPU' } utilisation below "
305326 f"{ gpu_threshold if gpu_metrics else cpu_threshold } % "
306327 f"for { idle_days } days while running, incurring "
307- f"continuous charges (~${ monthly_cost :,.0f} /month)."
328+ f"continuous charges (~${ monthly_cost :,.0f} /month us-east-1 estimate )."
308329 ),
309330 reason = (
310331 f"GPU EC2 instance has low "
@@ -324,11 +345,16 @@ def find_idle_gpu_instances(
324345 "idle_ratio" : idle_ratio ,
325346 "idle_signal" : idle_signal ,
326347 "utilisation_pct" : round (util_value , 2 ),
348+ "purchasing_model" : purchasing_model ,
327349 "gpu_metric_available" : bool (gpu_metrics ),
350+ "gpu_metric_note" : (
351+ "agent-dependent (CWAgent nvidia_smi_utilization_gpu); "
352+ "absence does not confirm GPU is idle"
353+ ),
328354 "gpu_threshold_pct" : gpu_threshold ,
329355 "cpu_threshold_pct" : cpu_threshold ,
330356 "estimated_monthly_cost" : f"~${ monthly_cost :,.0f} /month" ,
331- "cost_basis" : "us-east-1 on-demand" ,
357+ "cost_basis" : "us-east-1 on-demand (region-dependent estimate) " ,
332358 "tags" : tags ,
333359 },
334360 )
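For context, the lookup that `gpu_metric_note` refers to is the agent-published metric in the CWAgent namespace. The actual query is outside this diff, but it presumably looks something like the sketch below (dimensions may vary with agent config):

    # Sketch only; the real GPU metric query lives outside this diff. An empty
    # Datapoints list is ambiguous: agent not installed vs. no activity recorded.
    resp = cloudwatch.get_metric_statistics(
        Namespace="CWAgent",
        MetricName="nvidia_smi_utilization_gpu",
        Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
        StartTime=start,
        EndTime=now,
        Period=86400,
        Statistics=["Maximum"],
    )
    gpu_metrics = resp.get("Datapoints", [])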
@@ -418,7 +444,12 @@ def _get_avg_cpu_utilisation(
     cloudwatch, instance_id: str, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the average CPU utilisation over the window using AWS/EC2 CPUUtilization.
+    Return the peak daily CPU utilisation over the window using AWS/EC2 CPUUtilization.
+
+    Uses the Maximum statistic per day and returns the highest daily peak, so a
+    short but significant CPU spike is not averaged away — if every day's max is
+    below threshold, the CPU was consistently quiet (GPU activity not ruled out).
+
     Returns None on error — caller treats None as "not idle" (safe default).
     """
     start = now - timedelta(days=days)
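A quick worked example of why Maximum is the right statistic here: a ten-minute nightly batch at 90% CPU all but disappears under a one-day Average but survives intact under Maximum:

    # Toy data: 288 five-minute samples in a day; a ~10-minute burst at 90%,
    # the rest near-idle at 1%.
    samples = [90.0, 90.0] + [1.0] * 286
    daily_average = sum(samples) / len(samples)  # ~1.6%, looks idle
    daily_maximum = max(samples)                 # 90.0, clearly not idle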
@@ -430,11 +461,11 @@ def _get_avg_cpu_utilisation(
             StartTime=start,
             EndTime=now,
             Period=86400,  # 1-day granularity
-            Statistics=["Average"],
+            Statistics=["Maximum"],
         )
         datapoints = resp.get("Datapoints", [])
         if not datapoints:
             return None
-        return sum(dp["Average"] for dp in datapoints) / len(datapoints)
+        return max(dp["Maximum"] for dp in datapoints)
     except ClientError:
         return None