|
1 | | -[ |
2 | | - { |
3 | | - "scenario_id": "node-not-ready", |
4 | | - "description": "Worker node becomes NotReady due to kubelet crash", |
5 | | - "node_status_check": "## Node Status\n- control-plane-1: Ready, SchedulingDisabled\n- worker-1: Ready, 6 vCPU, 30Gi RAM\n- worker-2: NotReady, 6 vCPU, 20Gi RAM — last heartbeat 8m ago\n- worker-3: Ready, 6 vCPU, 20Gi RAM\n\n## Node Resource Usage\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 380m 9% 3200Mi 40%\nworker-1 2100m 35% 18500Mi 62%\nworker-2 <unknown> <unknown> <unknown> <unknown>\nworker-3 1800m 30% 9600Mi 48%", |
6 | | - "pod_health_check": "## Unhealthy Pods\nmonitoring prometheus-node-exporter-abc12 0/1 NodeAffinity 0 worker-2\nproduction cache-redis-0 0/1 Terminating 0 worker-2\nproduction api-server-5f8b9c7d4-x2k9p 0/1 Pending 0 <none>\n\n## High Restart Pods\n monitoring/alertmanager-main-0: 12 restarts", |
7 | | - "event_collector": "## Warning Events (6 most recent)\nmonitoring Warning NodeNotReady Node/worker-2 Node worker-2 status is now: NodeNotReady\nproduction Warning FailedScheduling Pod/api-server-5f8b9c7d4-x2k9p 0/3 nodes are available: 1 node had untolerated taint node.kubernetes.io/not-ready\nmonitoring Warning Unhealthy Pod/prometheus-node-exporter-abc12 Readiness probe failed: connection refused\nproduction Warning Evicted Pod/cache-redis-0 The node was low on resource: ephemeral-storage\nkube-system Warning NodeNotReady Node/worker-2 Controller detected that node worker-2 is not ready\nmonitoring Warning BackOff Pod/alertmanager-main-0 Back-off restarting failed container\n\n## Recent Events (10 most recent)\nkube-system Normal NodeHasSufficientMemory Node/worker-1 Node worker-1 status is now: NodeHasSufficientMemory\nkube-system Normal Starting Node/worker-3 Starting kubelet\nmonitoring Normal Pulled Pod/grafana-6d8f9c8b7-abc Container image already present\nmonitoring Normal Created Pod/grafana-6d8f9c8b7-abc Created container grafana", |
8 | | - "resource_pressure_check": "## Node Pressure Conditions\n- worker-2: MemoryPressure=Unknown DiskPressure=Unknown PIDPressure=Unknown Ready=False\n\n## Node Resource Utilization\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 380m 9% 3200Mi 40%\nworker-1 2100m 35% 18500Mi 62%\nworker-3 1800m 30% 9600Mi 48%\nNote: worker-2 metrics unavailable (node NotReady)\n\n## Nodes Exceeding Thresholds\nNone (excluding offline worker-2)" |
9 | | - }, |
10 | | - { |
11 | | - "scenario_id": "memory-pressure", |
12 | | - "description": "Multiple pods OOMKilled due to memory pressure on a worker node", |
13 | | - "node_status_check": "## Node Status\nAll 4 nodes are present.\n- control-plane-1: Ready, SchedulingDisabled\n- worker-1: Ready, 8 vCPU, 64Gi RAM, MemoryPressure=True\n- worker-2: Ready, 6 vCPU, 20Gi RAM\n- worker-3: Ready, 6 vCPU, 20Gi RAM\n\n## Node Resource Usage\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 420m 10% 3100Mi 39%\nworker-1 5800m 72% 59800Mi 93%\nworker-2 1200m 20% 12400Mi 62%\nworker-3 900m 15% 8200Mi 41%", |
14 | | - "pod_health_check": "## Unhealthy Pods\nml-serving inference-server-7f9b8c6d5-kl2m3 0/1 OOMKilled 3 worker-1\nproduction api-gateway-5d7b8a9c2-pq4r5 0/1 CrashLoopBackOff 7 worker-1\nmonitoring metrics-collector-6c8d9e7f1-st6u7 0/1 OOMKilled 2 worker-1\n\n## High Restart Pods\n ml-serving/inference-server-7f9b8c6d5-kl2m3: 8 restarts\n production/api-gateway-5d7b8a9c2-pq4r5: 12 restarts\n monitoring/metrics-collector-6c8d9e7f1-st6u7: 6 restarts", |
15 | | - "event_collector": "## Warning Events (8 most recent)\nml-serving Warning OOMKilling Pod/inference-server-7f9b8c6d5-kl2m3 Memory cgroup out of memory: Killed process 4521 (inference-server)\nproduction Warning OOMKilling Pod/api-gateway-5d7b8a9c2-pq4r5 Memory cgroup out of memory: Killed process 3892 (python)\nmonitoring Warning OOMKilling Pod/metrics-collector-6c8d9e7f1-st6u7 Memory cgroup out of memory: Killed process 5123\nml-serving Warning BackOff Pod/inference-server-7f9b8c6d5-kl2m3 Back-off restarting failed container\nproduction Warning BackOff Pod/api-gateway-5d7b8a9c2-pq4r5 Back-off restarting failed container\nkube-system Warning SystemOOM Node/worker-1 System OOM encountered, victim process: inference-server\nkube-system Warning EvictionThresholdMet Node/worker-1 Attempting to reclaim memory\nml-serving Warning Unhealthy Pod/inference-server-7f9b8c6d5-kl2m3 Liveness probe failed: connection refused", |
16 | | - "resource_pressure_check": "## Nodes Under Pressure\n- worker-1: MemoryPressure=True DiskPressure=False PIDPressure=False Ready=True\n\n## Node Resource Utilization\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 420m 10% 3100Mi 39%\nworker-1 5800m 72% 59800Mi 93%\nworker-2 1200m 20% 12400Mi 62%\nworker-3 900m 15% 8200Mi 41%\n\n## Nodes Exceeding Thresholds\n - worker-1: Memory at 93% (threshold: 85%)" |
17 | | - }, |
18 | | - { |
19 | | - "scenario_id": "healthy-cluster", |
20 | | - "description": "Normal cluster operations with no issues detected", |
21 | | - "node_status_check": "## Node Status\nAll 4 nodes are in Ready state.\n- control-plane-1: Ready, SchedulingDisabled (control-plane taint)\n- worker-1: Ready, 6 vCPU, 30Gi RAM\n- worker-2: Ready, 6 vCPU, 20Gi RAM\n- worker-3: Ready, 6 vCPU, 20Gi RAM\n\n## Node Resource Usage\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 350m 8% 3000Mi 37%\nworker-1 1500m 25% 16000Mi 53%\nworker-2 900m 15% 9200Mi 46%\nworker-3 1100m 18% 8800Mi 44%", |
22 | | - "pod_health_check": "## Pod Health Summary\nAll pods are in Running or Succeeded state across all namespaces.\n68 pods running across 12 namespaces.\n\n## High Restart Pods\nNo pods with excessive restart counts detected.", |
23 | | - "event_collector": "## Warning Events\nNo warning events found.\n\n## Recent Events (5 most recent)\nml-serving Normal Pulled Pod/inference-server-7f9b8c6d5-abc Container image already present on machine\nmonitoring Normal Started Pod/grafana-6d8f9c8b7-def Started container grafana\ndefault Normal Scheduled Pod/test-job-xyz-123 Successfully assigned to worker-2\nkube-system Normal Starting Node/worker-1 Starting kubelet\nkube-system Normal NodeReady Node/worker-3 Node worker-3 status is now: NodeReady", |
24 | | - "resource_pressure_check": "## Node Pressure Conditions\nNo nodes reporting pressure conditions (MemoryPressure, DiskPressure, PIDPressure all False).\n\n## Node Resource Utilization\nNAME CPU(cores) CPU% MEMORY(bytes) MEMORY%\ncontrol-plane-1 350m 8% 3000Mi 37%\nworker-1 1500m 25% 16000Mi 53%\nworker-2 900m 15% 9200Mi 46%\nworker-3 1100m 18% 8800Mi 44%\n\n## Nodes Exceeding Thresholds\nNone." |
25 | | - } |
26 | | -] |
| 1 | +version https://git-lfs.github.com/spec/v1 |
| 2 | +oid sha256:29429c8bec6bf001536a2b9e5549378064b12f9482cc036ae40aaacea2c80411 |
| 3 | +size 7780 |
0 commit comments