Skip to content

Commit 6f00ed7

Browse files
committed
CKS scaling reverts to Running on failure with VM cleanup and affinity checks
1 parent 6ee30eb commit 6f00ed7

File tree

4 files changed

+151
-24
lines changed

4 files changed

+151
-24
lines changed

api/src/main/java/com/cloud/kubernetes/cluster/KubernetesCluster.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ enum State {
9898
s_fsm.addTransition(State.Running, Event.ScaleDownRequested, State.Scaling);
9999
s_fsm.addTransition(State.Stopped, Event.ScaleUpRequested, State.ScalingStoppedCluster);
100100
s_fsm.addTransition(State.Scaling, Event.OperationSucceeded, State.Running);
101-
s_fsm.addTransition(State.Scaling, Event.OperationFailed, State.Alert);
101+
s_fsm.addTransition(State.Scaling, Event.OperationFailed, State.Running);
102102
s_fsm.addTransition(State.ScalingStoppedCluster, Event.OperationSucceeded, State.Stopped);
103103
s_fsm.addTransition(State.ScalingStoppedCluster, Event.OperationFailed, State.Alert);
104104

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterActionWorker.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,11 @@
2828
import java.util.Arrays;
2929
import java.util.Collections;
3030
import java.util.HashMap;
31+
import java.util.HashSet;
3132
import java.util.List;
3233
import java.util.Map;
3334
import java.util.Objects;
35+
import java.util.Set;
3436
import java.util.concurrent.atomic.AtomicInteger;
3537
import java.util.stream.Collectors;
3638

@@ -63,7 +65,9 @@
6365
import com.cloud.vm.dao.NicDao;
6466
import com.cloud.vm.UserVmManager;
6567
import org.apache.cloudstack.affinity.AffinityGroupVO;
68+
import org.apache.cloudstack.affinity.AffinityProcessorBase;
6669
import org.apache.cloudstack.affinity.dao.AffinityGroupDao;
70+
import org.apache.cloudstack.affinity.dao.AffinityGroupVMMapDao;
6771
import org.apache.cloudstack.api.ApiCommandResourceType;
6872
import org.apache.cloudstack.api.ApiConstants;
6973
import org.apache.cloudstack.api.command.user.firewall.CreateFirewallRuleCmd;
@@ -125,10 +129,12 @@
125129
import com.cloud.utils.fsm.StateMachine2;
126130
import com.cloud.utils.ssh.SshHelper;
127131
import com.cloud.vm.VMInstanceDetailVO;
132+
import com.cloud.vm.VMInstanceVO;
128133
import com.cloud.vm.UserVmService;
129134
import com.cloud.vm.UserVmVO;
130135
import com.cloud.vm.VmDetailConstants;
131136
import com.cloud.vm.dao.UserVmDao;
137+
import com.cloud.vm.dao.VMInstanceDao;
132138
import com.cloud.vm.dao.VMInstanceDetailsDao;
133139

134140
import static com.cloud.kubernetes.cluster.KubernetesServiceHelper.KubernetesClusterNodeType.CONTROL;
@@ -214,6 +220,10 @@ public class KubernetesClusterActionWorker {
214220
private NicDao nicDao;
215221
@Inject
216222
protected AffinityGroupDao affinityGroupDao;
223+
@Inject
224+
protected AffinityGroupVMMapDao affinityGroupVMMapDao;
225+
@Inject
226+
protected VMInstanceDao vmInstanceDao;
217227

218228
protected KubernetesClusterDao kubernetesClusterDao;
219229
protected KubernetesClusterVmMapDao kubernetesClusterVmMapDao;
@@ -1133,4 +1143,58 @@ protected List<Long> getMergedAffinityGroupIds(KubernetesClusterNodeType nodeTyp
11331143
}
11341144
return affinityGroupIds.isEmpty() ? null : affinityGroupIds;
11351145
}
1146+
1147+
private Set<Long> getRunningVmHostIds(Long affinityGroupId) {
1148+
return affinityGroupVMMapDao.listVmIdsByAffinityGroup(affinityGroupId).stream()
1149+
.map(vmInstanceDao::findById)
1150+
.filter(vm -> Objects.nonNull(vm) && Objects.nonNull(vm.getHostId()) && VirtualMachine.State.Running.equals(vm.getState()))
1151+
.map(VMInstanceVO::getHostId)
1152+
.collect(Collectors.toSet());
1153+
}
1154+
1155+
protected AffinityConstraints resolveAffinityConstraints(KubernetesClusterNodeType nodeType, Long domainId, Long accountId) {
1156+
Set<Long> antiAffinityOccupiedHosts = new HashSet<>();
1157+
Long requiredHostId = null;
1158+
boolean hasHostAntiAffinity = false;
1159+
boolean hasHostAffinity = false;
1160+
1161+
if (Objects.nonNull(nodeType)) {
1162+
List<Long> affinityGroupIds = getMergedAffinityGroupIds(nodeType, domainId, accountId);
1163+
if (CollectionUtils.isNotEmpty(affinityGroupIds)) {
1164+
for (Long affinityGroupId : affinityGroupIds) {
1165+
AffinityGroupVO affinityGroup = affinityGroupDao.findById(affinityGroupId);
1166+
if (Objects.isNull(affinityGroup)) {
1167+
continue;
1168+
}
1169+
if (AffinityProcessorBase.AFFINITY_TYPE_HOST_ANTI.equals(affinityGroup.getType())) {
1170+
hasHostAntiAffinity = true;
1171+
antiAffinityOccupiedHosts.addAll(getRunningVmHostIds(affinityGroupId));
1172+
} else if (AffinityProcessorBase.AFFINITY_TYPE_HOST.equals(affinityGroup.getType())) {
1173+
hasHostAffinity = true;
1174+
Set<Long> hostIds = getRunningVmHostIds(affinityGroupId);
1175+
if (CollectionUtils.isNotEmpty(hostIds)) {
1176+
requiredHostId = hostIds.iterator().next();
1177+
}
1178+
}
1179+
}
1180+
}
1181+
}
1182+
1183+
return new AffinityConstraints(hasHostAntiAffinity, hasHostAffinity, antiAffinityOccupiedHosts, requiredHostId);
1184+
}
1185+
1186+
protected static class AffinityConstraints {
1187+
final boolean hasHostAntiAffinity;
1188+
final boolean hasHostAffinity;
1189+
final Set<Long> antiAffinityOccupiedHosts;
1190+
final Long requiredHostId;
1191+
1192+
AffinityConstraints(boolean hasHostAntiAffinity, boolean hasHostAffinity,
1193+
Set<Long> antiAffinityOccupiedHosts, Long requiredHostId) {
1194+
this.hasHostAntiAffinity = hasHostAntiAffinity;
1195+
this.hasHostAffinity = hasHostAffinity;
1196+
this.antiAffinityOccupiedHosts = antiAffinityOccupiedHosts;
1197+
this.requiredHostId = requiredHostId;
1198+
}
1199+
}
11361200
}

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterResourceModifierActionWorker.java

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@
115115
import com.cloud.vm.UserVmManager;
116116
import com.cloud.vm.VirtualMachine;
117117
import com.cloud.vm.VmDetailConstants;
118-
import com.cloud.vm.dao.VMInstanceDao;
119118
import org.apache.cloudstack.api.ApiCommandResourceType;
120119
import org.apache.cloudstack.context.CallContext;
121120
import org.apache.logging.log4j.Level;
@@ -151,8 +150,6 @@ public class KubernetesClusterResourceModifierActionWorker extends KubernetesClu
151150
@Inject
152151
protected LoadBalancerDao loadBalancerDao;
153152
@Inject
154-
protected VMInstanceDao vmInstanceDao;
155-
@Inject
156153
protected UserVmManager userVmManager;
157154
@Inject
158155
protected LaunchPermissionDao launchPermissionDao;
@@ -176,8 +173,33 @@ protected void init() {
176173
kubernetesClusterNodeNamePrefix = getKubernetesClusterNodeNamePrefix();
177174
}
178175

176+
protected List<HostVO> filterHostsByAffinityConstraints(List<HostVO> hosts, AffinityConstraints constraints, DataCenter zone)
177+
throws InsufficientServerCapacityException {
178+
if (constraints.hasHostAffinity && Objects.nonNull(constraints.requiredHostId)) {
179+
hosts = hosts.stream().filter(host -> host.getId() == constraints.requiredHostId.longValue()).collect(Collectors.toList());
180+
if (CollectionUtils.isEmpty(hosts)) {
181+
String msg = String.format("Cannot find capacity for Kubernetes cluster: host affinity requires all VMs on host %d but it is not available in zone %s",
182+
constraints.requiredHostId, zone.getName());
183+
throw new InsufficientServerCapacityException(msg, DataCenter.class, zone.getId());
184+
}
185+
}
186+
187+
if (constraints.hasHostAntiAffinity) {
188+
hosts = hosts.stream().filter(host -> !constraints.antiAffinityOccupiedHosts.contains(host.getId())).collect(Collectors.toList());
189+
if (CollectionUtils.isEmpty(hosts)) {
190+
String msg = String.format("Cannot find capacity for Kubernetes cluster: host anti-affinity requires each VM on a separate host, " +
191+
"but all %d available hosts in zone %s are already occupied by existing cluster VMs",
192+
constraints.antiAffinityOccupiedHosts.size(), zone.getName());
193+
throw new InsufficientServerCapacityException(msg, DataCenter.class, zone.getId());
194+
}
195+
}
196+
197+
return hosts;
198+
}
199+
179200
protected DeployDestination plan(final long nodesCount, final DataCenter zone, final ServiceOffering offering,
180-
final Long domainId, final Long accountId, final Hypervisor.HypervisorType hypervisorType, CPU.CPUArch arch) throws InsufficientServerCapacityException {
201+
final Long domainId, final Long accountId, final Hypervisor.HypervisorType hypervisorType,
202+
CPU.CPUArch arch, KubernetesClusterNodeType nodeType) throws InsufficientServerCapacityException {
181203
final int cpu_requested = offering.getCpu() * offering.getSpeed();
182204
final long ram_requested = offering.getRamSize() * 1024L * 1024L;
183205
boolean useDedicatedHosts = false;
@@ -198,18 +220,22 @@ protected DeployDestination plan(final long nodesCount, final DataCenter zone, f
198220
if (hosts.isEmpty()) {
199221
hosts = resourceManager.listAllHostsInOneZoneByType(Host.Type.Routing, zone.getId());
200222
}
201-
if (hypervisorType != null) {
223+
if (Objects.nonNull(hypervisorType)) {
202224
hosts = hosts.stream().filter(x -> x.getHypervisorType() == hypervisorType).collect(Collectors.toList());
203225
}
204-
if (arch != null) {
226+
if (Objects.nonNull(arch)) {
205227
hosts = hosts.stream().filter(x -> x.getArch().equals(arch)).collect(Collectors.toList());
206228
}
207229
if (CollectionUtils.isEmpty(hosts)) {
208230
String msg = String.format("Cannot find enough capacity for Kubernetes cluster(requested cpu=%d memory=%s) with offering: %s hypervisor: %s and arch: %s",
209-
cpu_requested * nodesCount, toHumanReadableSize(ram_requested * nodesCount), offering.getName(), clusterTemplate.getHypervisorType().toString(), arch.getType());
231+
cpu_requested * nodesCount, toHumanReadableSize(ram_requested * nodesCount), offering.getName(), clusterTemplate.getHypervisorType().toString(),
232+
Objects.nonNull(arch) ? arch.getType() : "null");
210233
logAndThrow(Level.WARN, msg, new InsufficientServerCapacityException(msg, DataCenter.class, zone.getId()));
211234
}
212235

236+
AffinityConstraints affinityConstraints = resolveAffinityConstraints(nodeType, domainId, accountId);
237+
hosts = filterHostsByAffinityConstraints(hosts, affinityConstraints, zone);
238+
213239
final Map<String, Pair<HostVO, Integer>> hosts_with_resevered_capacity = new ConcurrentHashMap<String, Pair<HostVO, Integer>>();
214240
for (HostVO h : hosts) {
215241
hosts_with_resevered_capacity.put(h.getUuid(), new Pair<HostVO, Integer>(h, 0));
@@ -229,6 +255,9 @@ protected DeployDestination plan(final long nodesCount, final DataCenter zone, f
229255
continue;
230256
}
231257
int reserved = hp.second();
258+
if (affinityConstraints.hasHostAntiAffinity && reserved > 0) {
259+
continue;
260+
}
232261
reserved++;
233262
ClusterVO cluster = clusterDao.findById(h.getClusterId());
234263
ClusterDetailsVO cluster_detail_cpu = clusterDetailsDao.findDetail(cluster.getId(), "cpuOvercommitRatio");
@@ -263,10 +292,17 @@ protected DeployDestination plan(final long nodesCount, final DataCenter zone, f
263292
}
264293
return new DeployDestination(zone, null, null, null);
265294
}
266-
String msg = String.format("Cannot find enough capacity for Kubernetes cluster(requested cpu=%d memory=%s) with offering: %s hypervisor: %s and arch: %s",
267-
cpu_requested * nodesCount, toHumanReadableSize(ram_requested * nodesCount), offering.getName(), clusterTemplate.getHypervisorType().toString(), arch.getType());
268-
269-
logger.warn(msg);
295+
String msg;
296+
if (affinityConstraints.hasHostAntiAffinity) {
297+
msg = String.format("Cannot find enough capacity for Kubernetes cluster (requested cpu=%d memory=%s) with offering: %s. " +
298+
"Host anti-affinity requires %d separate hosts but not enough suitable hosts are available in zone %s",
299+
cpu_requested * nodesCount, toHumanReadableSize(ram_requested * nodesCount), offering.getName(),
300+
nodesCount, zone.getName());
301+
} else {
302+
msg = String.format("Cannot find enough capacity for Kubernetes cluster(requested cpu=%d memory=%s) with offering: %s hypervisor: %s and arch: %s",
303+
cpu_requested * nodesCount, toHumanReadableSize(ram_requested * nodesCount), offering.getName(), clusterTemplate.getHypervisorType().toString(),
304+
Objects.nonNull(arch) ? arch.getType() : "null");
305+
}
270306
throw new InsufficientServerCapacityException(msg, DataCenter.class, zone.getId());
271307
}
272308

@@ -295,7 +331,7 @@ protected Map<String, DeployDestination> planKubernetesCluster(Long domainId, Lo
295331
if (logger.isDebugEnabled()) {
296332
logger.debug("Checking deployment destination for {} nodes on Kubernetes cluster : {} in zone : {}", nodeType.name(), kubernetesCluster.getName(), zone.getName());
297333
}
298-
DeployDestination planForNodeType = plan(nodes, zone, nodeOffering, domainId, accountId, hypervisorType, arch);
334+
DeployDestination planForNodeType = plan(nodes, zone, nodeOffering, domainId, accountId, hypervisorType, arch, nodeType);
299335
destinationMap.put(nodeType.name(), planForNodeType);
300336
}
301337
return destinationMap;

0 commit comments

Comments
 (0)