Skip to content

Commit 2993c99

Browse files
authored
Add missing hosts info to the prometheus exporter output. (#8328)
Sometimes the hostStats object of an agent becomes null in the management server. This is rare, and we haven't found the root cause yet, but it occurs occasionally in our CloudStack deployments with many hosts. The hostStats object is null even though the agent is UP and hosting multiple VMs; it is still possible to access the VM consoles and execute tasks on them. This pull request doesn't address the root cause directly; rather, it exposes those hosts in the Prometheus output so that we can restart the agent and get the necessary information.
1 parent c599011 commit 2993c99

File tree

1 file changed

+62
-23
lines changed

1 file changed

+62
-23
lines changed

plugins/integrations/prometheus/src/main/java/org/apache/cloudstack/metrics/PrometheusExporterImpl.java

Lines changed: 62 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,24 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
8282
private static final String ONLINE = "online";
8383
private static final String OFFLINE = "offline";
8484

85+
/**
 * Identifies which piece of per-host information could not be obtained while
 * host metrics were being collected.
 *
 * <p>Each constant carries the label text that is written as the value of the
 * {@code filter} label of the {@code cloudstack_host_missing_info} metric;
 * {@link #toString()} returns that label text rather than the constant name.
 */
enum MissingInfoFilter {
    // NOTE(review): Host_Stats does not follow UPPER_SNAKE_CASE like its
    // siblings, but it is referenced by name from the host-metrics code, so
    // the identifier is kept as-is here.
    Host_Stats("hostStats"),
    CPU_CAPACITY("cpuCapacity"),
    MEM_CAPACITY("memCapacity"),
    CORE_CAPACITY("coreCapacity");

    /** Label text emitted in the metric output (distinct from {@link #name()}). */
    private final String label;

    /**
     * @param label text returned by {@link #toString()} for this constant
     */
    MissingInfoFilter(String label) {
        this.label = label;
    }

    /**
     * @return the label text for this constant, e.g. {@code "hostStats"}
     */
    @Override
    public String toString() {
        return label;
    }
}
102+
85103
private static List<Item> metricsItems = new ArrayList<>();
86104

87105
@Inject
@@ -129,8 +147,6 @@ private void addHostMetrics(final List<Item> metricsList, final long dcId, final
129147
Map<String, Integer> upHosts = new HashMap<>();
130148
Map<String, Integer> downHosts = new HashMap<>();
131149

132-
HostStats hostStats;
133-
134150
for (final HostVO host : hostDao.listAll()) {
135151
if (host == null || host.getType() != Host.Type.Routing || host.getDataCenterId() != dcId) {
136152
continue;
@@ -147,8 +163,6 @@ private void addHostMetrics(final List<Item> metricsList, final long dcId, final
147163
int isDedicated = (dr != null) ? 1 : 0;
148164
metricsList.add(new ItemHostIsDedicated(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), isDedicated));
149165

150-
String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts);
151-
hostStats = ApiDBUtils.getHostStatistics(host.getId());
152166

153167
// Get account, domain details for dedicated hosts
154168
if (isDedicated == 1) {
@@ -160,16 +174,22 @@ private void addHostMetrics(final List<Item> metricsList, final long dcId, final
160174
metricsList.add(new ItemHostDedicatedToAccount(zoneName, host.getName(), accountName, domain.getPath(), isDedicated));
161175
}
162176

177+
String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts);
178+
HostStats hostStats = ApiDBUtils.getHostStatistics(host.getId());
179+
180+
if (hostStats == null){
181+
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.Host_Stats));
182+
}
183+
163184
final String cpuFactor = String.valueOf(CapacityManager.CpuOverprovisioningFactor.valueIn(host.getClusterId()));
164185
final CapacityVO cpuCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU);
165-
final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ;
166186

167-
if (host.isInMaintenanceStates()) {
168-
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, 0L, isDedicated, hostTags));
169-
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, 0L, isDedicated, hostTags));
170-
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, 0L, isDedicated, hostTags));
187+
if (cpuCapacity == null && !host.isInMaintenanceStates()){
188+
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CPU_CAPACITY));
171189
}
172-
else if (cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) {
190+
191+
if (hostStats != null && cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) {
192+
final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ;
173193
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, cpuCapacity.getUsedCapacity(), isDedicated, hostTags));
174194
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, cpuUsedMhz, isDedicated, hostTags));
175195
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, cpuCapacity.getTotalCapacity(), isDedicated, hostTags));
@@ -181,12 +201,12 @@ else if (cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.
181201

182202
final String memoryFactor = String.valueOf(CapacityManager.MemOverprovisioningFactor.valueIn(host.getClusterId()));
183203
final CapacityVO memCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_MEMORY);
184-
if (host.isInMaintenanceStates()) {
185-
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, 0L, isDedicated, hostTags));
186-
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, 0, isDedicated, hostTags));
187-
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, 0L, isDedicated, hostTags));
204+
205+
if (memCapacity == null && !host.isInMaintenanceStates()){
206+
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.MEM_CAPACITY));
188207
}
189-
else if (memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) {
208+
209+
if (hostStats != null && memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) {
190210
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, memCapacity.getUsedCapacity(), isDedicated, hostTags));
191211
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, hostStats.getUsedMemory(), isDedicated, hostTags));
192212
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, memCapacity.getTotalCapacity(), isDedicated, hostTags));
@@ -197,13 +217,13 @@ else if (memCapacity != null && memCapacity.getCapacityState() == CapacityState.
197217
}
198218

199219
metricsList.add(new ItemHostVM(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), vmDao.listByHostId(host.getId()).size()));
200-
201220
final CapacityVO coreCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU_CORE);
202-
if (host.isInMaintenanceStates()) {
203-
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, 0L, isDedicated, hostTags));
204-
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, 0L, isDedicated, hostTags));
221+
222+
if (coreCapacity == null && !host.isInMaintenanceStates()){
223+
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CORE_CAPACITY));
205224
}
206-
else if (coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) {
225+
226+
if (hostStats != null && coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) {
207227
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, coreCapacity.getUsedCapacity(), isDedicated, hostTags));
208228
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, coreCapacity.getTotalCapacity(), isDedicated, hostTags));
209229
} else {
@@ -213,17 +233,17 @@ else if (coreCapacity != null && coreCapacity.getCapacityState() == CapacityStat
213233
}
214234

215235
final List<CapacityDaoImpl.SummedCapacity> cpuCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU, dcId, null, null);
216-
if (cpuCapacity != null && cpuCapacity.size() > 0) {
236+
if (cpuCapacity != null && !cpuCapacity.isEmpty()) {
217237
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, null, null, null, null, ALLOCATED, cpuCapacity.get(0).getAllocatedCapacity() != null ? cpuCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
218238
}
219239

220240
final List<CapacityDaoImpl.SummedCapacity> memCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_MEMORY, dcId, null, null);
221-
if (memCapacity != null && memCapacity.size() > 0) {
241+
if (memCapacity != null && !memCapacity.isEmpty()) {
222242
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, null, null, null, null, ALLOCATED, memCapacity.get(0).getAllocatedCapacity() != null ? memCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
223243
}
224244

225245
final List<CapacityDaoImpl.SummedCapacity> coreCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU_CORE, dcId, null, null);
226-
if (coreCapacity != null && coreCapacity.size() > 0) {
246+
if (coreCapacity != null && !coreCapacity.isEmpty()) {
227247
metricsList.add(new ItemVMCore(zoneName, zoneUuid, null, null, null, ALLOCATED, coreCapacity.get(0).getAllocatedCapacity() != null ? coreCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
228248
}
229249

@@ -626,6 +646,25 @@ public String toMetricsString() {
626646
}
627647
}
628648

649+
class MissingHostInfo extends Item {
650+
651+
String zoneName;
652+
String hostName;
653+
MissingInfoFilter filter;
654+
655+
public MissingHostInfo(String zoneName, String hostname, MissingInfoFilter filter) {
656+
super("cloudstack_host_missing_info");
657+
this.zoneName = zoneName;
658+
this.hostName = hostname;
659+
this.filter = filter;
660+
}
661+
662+
@Override
663+
public String toMetricsString() {
664+
return String.format("%s{zone=\"%s\",hostname=\"%s\",filter=\"%s\"} -1", name, zoneName, hostName, filter);
665+
}
666+
}
667+
629668
class ItemHostCpu extends Item {
630669
String zoneName;
631670
String zoneUuid;

0 commit comments

Comments
 (0)