 }
 _DEFAULT_MONTHLY_COST = 600.0

-# GPU utilisation thresholds
+# GPU utilization thresholds
 _GPU_UTIL_THRESHOLD_PCT = 5.0  # below this = idle (when GPU metric available)
 _CPU_UTIL_THRESHOLD_PCT = 10.0  # below this = idle (CPU fallback)

@@ -151,7 +151,7 @@ def find_idle_gpu_instances(
     cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT,
 ) -> List[Finding]:
     """
-    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation.
+    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization.

     GPU instances (raw EC2, outside SageMaker) incur continuous charges while running
     regardless of whether GPUs are being utilised. A p4d.24xlarge costs ~$23K/month
@@ -162,24 +162,28 @@ def find_idle_gpu_instances(
     - Instance state is running
     - Instance type is a known GPU/accelerator family
     - Instance is older than idle_days (avoids flagging newly launched instances)
-    - GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA
-      CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace)
-    - OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback,
+    - GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the
+      nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch)
+    - OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback,
       used when GPU metrics are not available — CPU alone is a weaker signal)

     GPU metric detection:
-        The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent
-        namespace with an InstanceId dimension. Availability is probed via ListMetrics per
-        instance — not assumed. Instances without the agent fall back to CPU utilisation.
+        The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent
+        namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent
+        being installed and configured to append EC2 instance dimensions (e.g. via
+        append_dimensions = {"InstanceId": ...}). AWS does not guarantee the InstanceId
+        dimension is present by default; its presence is implementation-dependent. If the
+        metric is absent or the agent is misconfigured, the rule falls back to CPU
+        utilization. Absence of the metric is NOT proof the GPU is idle.

     Multi-GPU handling:
         For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is
         used across all GPU index dimensions. A single active GPU on an 8-GPU instance
         would be averaged away using AVG, producing a misleadingly low reading.

     Confidence:
-        - HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days
-        - MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days
+        - HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days
+        - MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days

     IAM permissions:
     - ec2:DescribeInstances
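
For background on the agent requirement the docstring describes (not part of this diff): a
minimal amazon-cloudwatch-agent config that publishes nvidia_smi_utilization_gpu with an
InstanceId dimension might look like the sketch below. The exact keys are an assumption and
vary by agent version; shown as a Python dict mirroring the agent's JSON config.

    # Background sketch (assumption): agent config making the GPU metric discoverable.
    AGENT_CONFIG_SKETCH = {
        "metrics": {
            "namespace": "CWAgent",
            # append the EC2 instance ID as a dimension on every collected metric
            "append_dimensions": {"InstanceId": "${aws:InstanceId}"},
            "metrics_collected": {
                # measurements are published with the nvidia_smi_ prefix,
                # e.g. utilization_gpu -> nvidia_smi_utilization_gpu
                "nvidia_gpu": {"measurement": ["utilization_gpu"]},
            },
        }
    }
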
@@ -208,53 +212,62 @@ def find_idle_gpu_instances(
         if not _is_gpu_instance(instance_type):
             continue

-        instance_id = inst["InstanceId"]
+        # Normalize InstanceId — skip if missing or empty (spec section 5)
+        instance_id = (inst.get("InstanceId") or "").strip()
+        if not instance_id:
+            continue
+
         tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
         # "spot" | "scheduled" | None (on-demand)
         instance_lifecycle = inst.get("InstanceLifecycle")
         purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
-        launch_time = inst.get("LaunchTime")
-
-        age_days: Optional[int] = None
-        if launch_time:
-            if launch_time.tzinfo is None:
-                launch_time = launch_time.replace(tzinfo=timezone.utc)
-            age_days = (now - launch_time).days

-        # Skip instances younger than idle_days — too new to classify
-        if age_days is not None and age_days < idle_days:
+        # Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3)
+        launch_time = inst.get("LaunchTime")
+        if not launch_time:
+            continue  # missing LaunchTime → SKIP ITEM
+        if launch_time.tzinfo is None:
+            continue  # naive timestamp is not tz-aware UTC → SKIP ITEM
+        age_days = (now - launch_time).days
+        if age_days < 0:
+            continue  # future LaunchTime → SKIP ITEM
+
+        # Skip instances younger than the idle_days threshold — too new to classify
+        if age_days < idle_days:
             continue

         # Probe for GPU metrics — single ListMetrics call reused for stats
         gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id)

         if gpu_metrics:
-            max_gpu_util = _get_max_gpu_utilisation(
+            max_gpu_util = _get_max_gpu_utilization(
                 cloudwatch, gpu_metrics, idle_days, now
             )
             if max_gpu_util is None or max_gpu_util >= gpu_threshold:
                 continue
             confidence = ConfidenceLevel.HIGH
-            idle_signal = "gpu_utilisation"
+            idle_signal = "gpu_utilization"
             util_value = max_gpu_util
-            util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
+            util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
         else:
-            avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
-            if avg_cpu is None or avg_cpu >= cpu_threshold:
+            max_cpu = _get_max_daily_cpu_utilization(
+                cloudwatch, instance_id, idle_days, now
+            )
+            if max_cpu is None or max_cpu >= cpu_threshold:
                 continue
             # CPU fallback is a weak heuristic for GPU workloads:
-            # accelerator utilisation is invisible to CPU metrics, so a GPU
+            # accelerator utilization is invisible to CPU metrics, so a GPU
             # instance running a compute-bound model can show near-zero CPU
             # while doing real work. Confidence is capped at MEDIUM to reflect
             # this limitation. Absence of the CWAgent GPU metric is NOT proof
-            # that the GPU is idle — the agent may simply not be installed.
+            # that the GPU is idle — the agent may be absent or misconfigured.
             confidence = ConfidenceLevel.MEDIUM
-            idle_signal = "cpu_utilisation_fallback"
-            util_value = avg_cpu
+            idle_signal = "cpu_utilization_fallback"
+            util_value = max_cpu
             util_label = (
-                f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
+                f"Peak daily CPU utilization: {max_cpu:.1f}% "
                 f"(threshold: {cpu_threshold}%) — "
-                f"heuristic only; GPU/accelerator utilisation not directly measured"
+                f"heuristic only; GPU/accelerator utilization not directly measured"
             )

         monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
@@ -272,15 +285,14 @@ def find_idle_gpu_instances(
             f"Instance type: {instance_type} (GPU/accelerator family)",
             f"Purchasing model: {purchasing_model}",
             util_label,
+            f"Instance age: {age_days} days",
         ]
-        if age_days is not None:
-            signals.append(f"Instance age: {age_days} days")
         if not gpu_metrics:
             if _is_neuron_instance(instance_type):
                 signals.append(
                     "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
                     "applicable; CPU used as heuristic fallback; confidence MEDIUM. "
-                    "Neuron utilisation requires AWS Neuron SDK metrics."
+                    "Neuron utilization requires AWS Neuron SDK metrics."
                 )
             else:
                 signals.append(
@@ -289,11 +301,18 @@ def find_idle_gpu_instances(
                     "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
                 )

+        # signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1)
         not_checked = [
-            "GPU/accelerator utilisation (not directly measurable without CWAgent)",
             "Scheduled batch jobs that run outside the observation window",
             "Planned future use",
         ]
+        if not gpu_metrics:
+            not_checked.insert(
+                0,
+                "Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not "
+                "discoverable in CloudWatch (CWAgent may be absent or not configured with "
+                "InstanceId dimension); absence of the metric does not confirm the GPU is idle",
+            )
         if purchasing_model == "spot":
             not_checked.append(
                 "Spot interruption history — Spot instances may appear idle "
@@ -307,6 +326,17 @@ def find_idle_gpu_instances(
             )

         metric_label = "GPU" if gpu_metrics else "CPU (fallback)"
+        if gpu_metrics:
+            reason = (
+                f"GPU EC2 instance has low GPU utilization "
+                f"({util_value:.1f}%) over {idle_days} days"
+            )
+        else:
+            reason = (
+                f"GPU EC2 instance shows low CPU proxy signal "
+                f"({util_value:.1f}%) over {idle_days} days — "
+                f"GPU activity not directly measured"
+            )
         findings.append(
             Finding(
                 provider="aws",
@@ -316,22 +346,18 @@ def find_idle_gpu_instances(
                 region=region,
                 estimated_monthly_cost_usd=monthly_cost,
                 title=(
-                    f"Idle GPU EC2 Instance ({metric_label} utilisation "
+                    f"Idle GPU EC2 Instance ({metric_label} utilization "
                     f"<{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"over {idle_days} days)"
                 ),
                 summary=(
                     f"EC2 instance '{name_tag}' ({instance_type}) has had "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
+                    f"{'GPU' if gpu_metrics else 'CPU'} utilization below "
                     f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
                     f"for {idle_days} days while running, incurring "
                     f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
                 ),
-                reason=(
-                    f"GPU EC2 instance has low "
-                    f"{'GPU' if gpu_metrics else 'CPU'} utilisation "
-                    f"({util_value:.1f}%) for {idle_days} days"
-                ),
+                reason=reason,
                 risk=risk,
                 confidence=confidence,
                 detected_at=now,
@@ -340,11 +366,11 @@ def find_idle_gpu_instances(
                     "instance_id": instance_id,
                     "instance_type": instance_type,
                     "name": name_tag,
-                    "age_days": (age_days if age_days is not None else "unknown"),
+                    "age_days": age_days,
                     "idle_days_threshold": idle_days,
                     "idle_ratio": idle_ratio,
                     "idle_signal": idle_signal,
-                    "utilisation_pct": round(util_value, 2),
+                    "utilization_pct": round(util_value, 2),
                     "purchasing_model": purchasing_model,
                     "gpu_metric_available": bool(gpu_metrics),
                     "gpu_metric_note": (
@@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list:
     """
     Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace.

-    Returns the Metrics list (one entry per GPU index) so the caller can reuse it
+    Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to
+    500 results per call). Returns all Metrics entries so the caller can reuse them
     for GetMetricStatistics without a second ListMetrics call. Returns [] on any error.
     """
+    metrics: list = []
+    kwargs: dict = {
+        "Namespace": "CWAgent",
+        "MetricName": "nvidia_smi_utilization_gpu",
+        "Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
+    }
     try:
-        resp = cloudwatch.list_metrics(
-            Namespace="CWAgent",
-            MetricName="nvidia_smi_utilization_gpu",
-            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
-        )
-        return resp.get("Metrics", [])
+        while True:
+            resp = cloudwatch.list_metrics(**kwargs)
+            metrics.extend(resp.get("Metrics", []))
+            next_token = resp.get("NextToken")
+            if not next_token:
+                break
+            kwargs["NextToken"] = next_token
     except Exception:
         return []
+    return metrics


-def _get_max_gpu_utilisation(
+def _get_max_gpu_utilization(
     cloudwatch, gpu_metrics: list, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the maximum GPU utilisation across all GPU indices over the window.
+    Return the maximum GPU utilization across all GPU indices over the window.

     Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second
     ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU
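
The loop body of _get_max_gpu_utilization is elided between these hunks. As a sketch of the
MAX-across-GPU-indices approach the docstring describes (an assumption using boto3
GetMetricStatistics, not the PR's actual code):

    from datetime import timedelta

    def _sketch_max_gpu_utilization(cloudwatch, gpu_metrics, days, now):
        # One GetMetricStatistics call per discovered GPU-index metric; keep the
        # single highest daily Maximum so one busy GPU is never averaged away.
        start = now - timedelta(days=days)
        max_util = None
        for metric in gpu_metrics:  # one ListMetrics entry per GPU index dimension
            resp = cloudwatch.get_metric_statistics(
                Namespace="CWAgent",
                MetricName="nvidia_smi_utilization_gpu",
                Dimensions=metric.get("Dimensions", []),
                StartTime=start,
                EndTime=now,
                Period=86400,            # daily datapoints
                Statistics=["Maximum"],  # MAX, per the multi-GPU rationale above
            )
            for dp in resp.get("Datapoints", []):
                if max_util is None or dp["Maximum"] > max_util:
                    max_util = dp["Maximum"]
        return max_util
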
@@ -440,17 +475,17 @@ def _get_max_gpu_utilisation(
     return max_util


-def _get_avg_cpu_utilisation(
+def _get_max_daily_cpu_utilization(
     cloudwatch, instance_id: str, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
+    Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization.

-    Uses Maximum statistic per day and returns the highest daily peak. This avoids
-    flagging burst workloads where a short but significant CPU spike would be averaged
-    away — if the max CPU across any day is below threshold, the instance is truly idle.
+    Uses Maximum statistic at daily (86400s) period and returns the highest value
+    across all returned datapoints (spec section 6.2). This avoids flagging burst workloads
+    where a short but significant CPU spike would be averaged away.

-    Returns None on error — caller treats None as "not idle" (safe default).
+    Returns None on error or no datapoints — caller treats None as "not idle" (safe default).
     """
     start = now - timedelta(days=days)
     try:
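
The diff is truncated inside this final try block. Under the docstring's contract, the
remainder presumably resembles the following sketch (an assumption, not the PR's code):

        # Assumed continuation (sketch): daily Maximum over AWS/EC2 CPUUtilization.
        resp = cloudwatch.get_metric_statistics(
            Namespace="AWS/EC2",
            MetricName="CPUUtilization",
            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
            StartTime=start,
            EndTime=now,
            Period=86400,            # one datapoint per day
            Statistics=["Maximum"],  # highest daily peak, not an average
        )
    except Exception:
        return None
    datapoints = resp.get("Datapoints", [])
    if not datapoints:
        return None
    return max(dp["Maximum"] for dp in datapoints)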