Skip to content

Commit a844a11

Browse files
committed
fix bugs
1 parent 2b6f53a commit a844a11

File tree

19 files changed

+455
-45
lines changed

19 files changed

+455
-45
lines changed

api/src/main/java/org/apache/cloudstack/gpu/GpuService.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,15 @@ public interface GpuService extends Manager {
8181
*
8282
* @param vmId The ID of the VM to deallocate GPU devices for.
8383
*/
84-
void deallocateGpuDevicesForVmOnHost(long vmId);
84+
void deallocateAllGpuDevicesForVm(long vmId);
85+
86+
87+
/**
88+
* Deallocate the GPU devices allocated to a VM on a specific host (identified by hostId).
89+
*
90+
* @param vmId The ID of the VM to deallocate GPU devices for.
91+
*/
92+
void deallocateGpuDevicesForVmOnHost(long vmId, long hostId);
8593

8694
/**
8795
* Deallocate existing GPU devices for a VM on a host and allocate new GPU devices to the VM.
@@ -122,7 +130,7 @@ public interface GpuService extends Manager {
122130
* @param gpuCount The number of GPU devices to get.
123131
* @return The GPU devices.
124132
*/
125-
GPUDeviceTO getGPUDevice(VirtualMachine vm, VgpuProfile vgpuProfile, int gpuCount);
133+
GPUDeviceTO getGPUDevice(VirtualMachine vm, long hostId, VgpuProfile vgpuProfile, int gpuCount);
126134

127135
/**
128136
* Gets the GPU group details from the GPU devices on a host.

engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ public interface ResourceManager extends ResourceService, Configurable {
208208
* @param gpuCount
209209
* @return GPUDeviceTO[]
210210
*/
211-
GPUDeviceTO getGPUDevice(VirtualMachine vm, VgpuProfileVO vgpuProfile, int gpuCount);
211+
GPUDeviceTO getGPUDevice(VirtualMachine vm, long hostId, VgpuProfileVO vgpuProfile, int gpuCount);
212212

213213
/**
214214
* Get available GPU device

engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2356,7 +2356,7 @@ public void destroy(final String vmUuid, final boolean expunge) throws AgentUnav
23562356

23572357
deleteVMSnapshots(vm, expunge);
23582358

2359-
gpuService.deallocateGpuDevicesForVmOnHost(vm.getId());
2359+
gpuService.deallocateAllGpuDevicesForVm(vm.getId());
23602360

23612361
Transaction.execute(new TransactionCallbackWithExceptionNoReturn<CloudRuntimeException>() {
23622362
@Override
@@ -2928,6 +2928,8 @@ protected void migrate(final VMInstanceVO vm, final long srcHostId, final Deploy
29282928
logger.info("Migration was unsuccessful. Cleaning up: {}", vm);
29292929
_networkMgr.rollbackNicForMigration(vmSrc, profile);
29302930
volumeMgr.release(vm.getId(), dstHostId);
2931+
// migration failed: deallocate the GPU devices for the VM on the destination host as part of the cleanup
2932+
gpuService.deallocateGpuDevicesForVmOnHost(vm.getId(), dstHostId);
29312933

29322934
_alertMgr.sendAlert(alertType, fromHost.getDataCenterId(), fromHost.getPodId(),
29332935
"Unable to migrate vm " + vm.getInstanceName() + " from host " + fromHost.getName() + " in zone " + dest.getDataCenter().getName() + " and pod " +
@@ -2946,6 +2948,8 @@ protected void migrate(final VMInstanceVO vm, final long srcHostId, final Deploy
29462948
} else {
29472949
_networkMgr.commitNicForMigration(vmSrc, profile);
29482950
volumeMgr.release(vm.getId(), srcHostId);
2951+
// deallocate GPU devices for the VM on the source host after migration is complete
2952+
gpuService.deallocateGpuDevicesForVmOnHost(vm.getId(), srcHostId);
29492953
_networkMgr.setHypervisorHostname(profile, dest, true);
29502954
recreateCheckpointsKvmOnVmAfterMigration(vm, dstHostId);
29512955

engine/schema/src/main/java/com/cloud/gpu/dao/GpuDeviceDaoImpl.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ public GpuDeviceDaoImpl() {
6060
allFieldSearch = createSearchBuilder();
6161
allFieldSearch.and(IDS, allFieldSearch.entity().getId(), SearchCriteria.Op.IN);
6262
allFieldSearch.and(HOST_ID, allFieldSearch.entity().getHostId(), SearchCriteria.Op.EQ);
63+
allFieldSearch.and(CARD_ID, allFieldSearch.entity().getCardId(), SearchCriteria.Op.EQ);
6364
allFieldSearch.and(BUS_ADDRESS, allFieldSearch.entity().getBusAddress(), SearchCriteria.Op.EQ);
6465
allFieldSearch.and(STATE, allFieldSearch.entity().getState(), SearchCriteria.Op.EQ);
6566
allFieldSearch.and(VGPU_PROFILE_ID, allFieldSearch.entity().getVgpuProfileId(), SearchCriteria.Op.EQ);

engine/schema/src/main/java/com/cloud/gpu/dao/VgpuProfileDao.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ public interface VgpuProfileDao extends GenericDao<VgpuProfileVO, Long> {
3434

3535
VgpuProfileVO findByNameAndCardId(String name, long cardId);
3636

37+
int removeByCardId(long cardId);
38+
3739
Pair<List<VgpuProfileVO>, Integer> searchAndCountVgpuProfiles(Long id, String name, String keyword, Long gpuCardId,
3840
boolean activeOnly, Long startIndex, Long pageSize);
3941
}

engine/schema/src/main/java/com/cloud/gpu/dao/VgpuProfileDaoImpl.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ public VgpuProfileVO findByNameAndCardId(String name, long cardId) {
5757
return findOneBy(sc);
5858
}
5959

60+
@Override
61+
public int removeByCardId(long cardId) {
62+
SearchCriteria<VgpuProfileVO> sc = allFieldSearch.create();
63+
sc.setParameters("cardId", cardId);
64+
return remove(sc);
65+
}
66+
6067
@Override
6168
public Pair<List<VgpuProfileVO>, Integer> searchAndCountVgpuProfiles(Long id, String name, String keyword,
6269
Long gpuCardId, boolean activeOnly, Long startIndex, Long pageSize) {

engine/schema/src/main/resources/META-INF/db/schema-42010to42100.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,11 +238,11 @@ CREATE TABLE IF NOT EXISTS `cloud`.`gui_themes_details` (
238238
CREATE TABLE IF NOT EXISTS `cloud`.`gpu_card` (
239239
`id` bigint unsigned NOT NULL AUTO_INCREMENT COMMENT 'id',
240240
`uuid` varchar(40) NOT NULL UNIQUE,
241-
`device_id` varchar(255) NOT NULL COMMENT 'device id of the GPU card',
241+
`device_id` varchar(4) NOT NULL COMMENT 'device id of the GPU card',
242242
`device_name` varchar(255) NOT NULL COMMENT 'device name of the GPU card',
243243
`name` varchar(255) NOT NULL COMMENT 'name of the GPU card',
244244
`vendor_name` varchar(255) NOT NULL COMMENT 'vendor name of the GPU card',
245-
`vendor_id` varchar(255) NOT NULL COMMENT 'vendor id of the GPU card',
245+
`vendor_id` varchar(4) NOT NULL COMMENT 'vendor id of the GPU card',
246246
`created` datetime NOT NULL COMMENT 'date created',
247247
PRIMARY KEY (`id`),
248248
UNIQUE KEY (`vendor_id`, `device_id`)

plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/wrapper/LibvirtMigrateCommandWrapper.java

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.io.InputStream;
2424
import java.net.URISyntaxException;
2525
import java.nio.charset.StandardCharsets;
26+
import java.util.ArrayList;
2627
import java.util.HashSet;
2728
import java.util.List;
2829
import java.util.Map;
@@ -40,9 +41,13 @@
4041
import javax.xml.parsers.ParserConfigurationException;
4142
import javax.xml.transform.TransformerException;
4243

44+
import com.cloud.agent.api.VgpuTypesInfo;
45+
import com.cloud.agent.api.to.GPUDeviceTO;
46+
import com.cloud.hypervisor.kvm.resource.LibvirtGpuDef;
4347
import com.cloud.hypervisor.kvm.resource.LibvirtXMLParser;
4448
import org.apache.cloudstack.utils.security.ParserUtils;
4549
import org.apache.commons.collections.MapUtils;
50+
import org.apache.commons.collections4.CollectionUtils;
4651
import org.apache.commons.io.FilenameUtils;
4752
import org.apache.commons.io.IOUtils;
4853
import org.apache.commons.lang3.StringUtils;
@@ -226,6 +231,8 @@ Use VIR_DOMAIN_XML_SECURE (value = 1) prior to v1.0.0.
226231

227232
xmlDesc = updateVmSharesIfNeeded(command, xmlDesc, libvirtComputingResource);
228233

234+
xmlDesc = updateGpuDevicesIfNeeded(command, xmlDesc, libvirtComputingResource);
235+
229236
dconn = libvirtUtilitiesHelper.retrieveQemuConnection(destinationUri);
230237

231238
if (to.getType() == VirtualMachine.Type.User) {
@@ -417,6 +424,116 @@ protected Set<String> getMigrateStorageDeviceLabels(List<DiskDef> diskDefinition
417424
return setOfLabels;
418425
}
419426

427+
String updateGpuDevicesIfNeeded(MigrateCommand migrateCommand, String xmlDesc, LibvirtComputingResource libvirtComputingResource)
428+
throws ParserConfigurationException, IOException, SAXException, TransformerException {
429+
GPUDeviceTO gpuDevice = migrateCommand.getVirtualMachine().getGpuDevice();
430+
if (gpuDevice == null || CollectionUtils.isEmpty(gpuDevice.getGpuDevices())) {
431+
logger.debug("No GPU device to update for VM [{}].", migrateCommand.getVmName());
432+
return xmlDesc;
433+
}
434+
435+
List<VgpuTypesInfo> devices = gpuDevice.getGpuDevices();
436+
logger.info("Updating GPU devices for VM [{}] during migration. Number of devices: {}",
437+
migrateCommand.getVmName(), devices.size());
438+
439+
// Parse XML and find devices element
440+
DocumentBuilderFactory docFactory = ParserUtils.getSaferDocumentBuilderFactory();
441+
DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
442+
Document document;
443+
try (InputStream inputStream = IOUtils.toInputStream(xmlDesc, StandardCharsets.UTF_8)) {
444+
document = docBuilder.parse(inputStream);
445+
}
446+
447+
NodeList devicesList = document.getElementsByTagName("devices");
448+
if (devicesList.getLength() == 0) {
449+
logger.warn("No devices section found in XML for VM [{}]", migrateCommand.getVmName());
450+
return xmlDesc;
451+
}
452+
453+
Element devicesElement = (Element) devicesList.item(0);
454+
455+
// Remove existing GPU hostdev elements and add new ones
456+
removeExistingGpuHostdevElements(devicesElement);
457+
addNewGpuHostdevElements(document, devicesElement, devices);
458+
459+
String newXmlDesc = LibvirtXMLParser.getXml(document);
460+
logger.debug("Updated XML configuration for VM [{}] with new GPU devices", migrateCommand.getVmName());
461+
462+
return newXmlDesc;
463+
}
464+
465+
/**
466+
* Removes existing GPU hostdev elements from the devices section.
467+
* GPU devices are identified as hostdev elements with mode='subsystem' and type='pci' or type='mdev'.
468+
*/
469+
private void removeExistingGpuHostdevElements(Element devicesElement) {
470+
NodeList hostdevNodes = devicesElement.getElementsByTagName("hostdev");
471+
List<Node> nodesToRemove = new ArrayList<>();
472+
473+
for (int i = 0; i < hostdevNodes.getLength(); i++) {
474+
Node hostdevNode = hostdevNodes.item(i);
475+
if (hostdevNode.getNodeType() == Node.ELEMENT_NODE) {
476+
Element hostdevElement = (Element) hostdevNode;
477+
String hostdevType = hostdevElement.getAttribute("type");
478+
479+
// Remove hostdev elements that represent GPU devices (type='pci' or type='mdev')
480+
if ("pci".equals(hostdevType) || "mdev".equals(hostdevType)) {
481+
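// Additional check: only remove the element when mode='subsystem'. NOTE(review): this alone does not prove the device is a GPU; other passthrough hostdev devices may match too — confirm.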
482+
String mode = hostdevElement.getAttribute("mode");
483+
if ("subsystem".equals(mode)) {
484+
nodesToRemove.add(hostdevNode);
485+
}
486+
}
487+
}
488+
}
489+
490+
// Remove the nodes
491+
for (Node node : nodesToRemove) {
492+
devicesElement.removeChild(node);
493+
}
494+
495+
logger.debug("Removed {} existing GPU hostdev elements", nodesToRemove.size());
496+
}
497+
498+
/**
499+
* Adds new GPU hostdev elements to the devices section based on the GPU devices
500+
* allocated on the destination host.
501+
*/
502+
private void addNewGpuHostdevElements(Document document, Element devicesElement, List<VgpuTypesInfo> devices)
503+
throws ParserConfigurationException, IOException, SAXException {
504+
if (devices.isEmpty()) {
505+
return;
506+
}
507+
508+
// Reuse parser for efficiency
509+
DocumentBuilderFactory factory = ParserUtils.getSaferDocumentBuilderFactory();
510+
DocumentBuilder builder = factory.newDocumentBuilder();
511+
512+
for (VgpuTypesInfo deviceInfo : devices) {
513+
Element hostdevElement = createGpuHostdevElement(document, deviceInfo, builder);
514+
devicesElement.appendChild(hostdevElement);
515+
logger.debug("Added new GPU hostdev element for device: {} (type: {}, busAddress: {})",
516+
deviceInfo.getDeviceName(), deviceInfo.getDeviceType(), deviceInfo.getBusAddress());
517+
}
518+
}
519+
520+
/**
521+
* Creates a hostdev element for a GPU device: renders it to XML with LibvirtGpuDef, parses that XML, and imports the root element into the target document.
522+
*/
523+
private Element createGpuHostdevElement(Document document, VgpuTypesInfo deviceInfo, DocumentBuilder builder)
524+
throws IOException, SAXException {
525+
// Generate GPU XML using LibvirtGpuDef
526+
LibvirtGpuDef gpuDef = new LibvirtGpuDef();
527+
gpuDef.defGpu(deviceInfo);
528+
String gpuXml = gpuDef.toString();
529+
530+
// Parse and import into target document
531+
try (InputStream xmlStream = IOUtils.toInputStream(gpuXml, StandardCharsets.UTF_8)) {
532+
Document gpuDocument = builder.parse(xmlStream);
533+
Element hostdevElement = gpuDocument.getDocumentElement();
534+
return (Element) document.importNode(hostdevElement, true);
535+
}
536+
}
420537

421538
/**
422539
* Checks if the CPU shares are equal in the source host and destination host.

0 commit comments

Comments
 (0)