Skip to content

Commit 4d37986

Browse files
committed
fix(thunderbolt): add nvInvalidateDeviceReferences to fix module unload crash
1 parent 91eb137 commit 4d37986

File tree

4 files changed

+88
-53
lines changed

4 files changed

+88
-53
lines changed

src/nvidia-modeset/include/nvkms-private.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ struct NvKmsPerOpenDev *nvAllocPerOpenDev(struct NvKmsPerOpen *pOpen,
3535

3636
void nvRevokeDevice(NVDevEvoPtr pDevEvo);
3737

38+
void nvInvalidateDeviceReferences(NVDevEvoPtr pDevEvo);
39+
3840
void nvFreePerOpenDev(struct NvKmsPerOpen *pOpen,
3941
struct NvKmsPerOpenDev *pOpenDev);
4042

@@ -73,8 +75,6 @@ const NVEvoApiHandlesRec *nvGetSurfaceHandlesFromOpenDevConst(
7375

7476
void nvKmsServiceNonStallInterrupt(void *dataPtr, NvU32 dataU32);
7577

76-
void nvKmsReinitializeGlobalClient(void);
77-
7878
#ifdef __cplusplus
7979
};
8080
#endif

src/nvidia-modeset/kapi/src/nvkms-kapi.c

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -633,8 +633,13 @@ static void FreeDevice(struct NvKmsKapiDevice *device)
633633
* faults or hangs when trying to access unmapped GPU memory.
634634
*
635635
* We only:
636-
* 1. Release the GPU reference count (nvkms_close_gpu)
637-
* 2. Free kernel memory resources (semaphore, device struct)
636+
* 1. Mark GPU as lost to prevent hardware access
637+
* 2. Release the GPU reference count (nvkms_close_gpu)
638+
* 3. Clean up kernel memory resources (handle allocator, semaphore, device struct)
639+
*
640+
* We skip:
641+
* - KmsFreeDevice() - would call nvkms_ioctl_from_kapi() which accesses hardware
642+
* - RmFreeDevice() - would call nvRmApiFree() which accesses hardware
638643
*
639644
* The hardware resources will be cleaned up when the GPU is physically
640645
* removed from the system.
@@ -653,17 +658,39 @@ static void FreeDeviceForSurpriseRemoval(struct NvKmsKapiDevice *device)
653658
nvkms_gpu_lost(device->gpuId);
654659

655660
/*
656-
* Skip KmsFreeDevice() and RmFreeDevice() - these try to access
657-
* GPU hardware via ioctls and RM API calls, which will crash
658-
* since the GPU memory is unmapped after surprise removal.
661+
* Clear device handles to prevent any stale references.
662+
* Don't call nvRmApiFree() as that would access hardware.
663+
*/
664+
device->hKmsDevice = 0;
665+
device->hKmsDisp = 0;
666+
device->hRmSubDevice = 0;
667+
device->hRmDevice = 0;
668+
device->hRmClient = 0;
669+
device->smgGpuInstSubscriptionHdl = 0;
670+
device->smgComputeInstSubscriptionHdl = 0;
671+
672+
/*
673+
* Tear down the handle allocator - this only frees kernel memory
674+
* (bitmaps), no hardware access.
675+
*/
676+
nvTearDownUnixRmHandleAllocator(&device->handleAllocator);
677+
device->deviceInstance = 0;
678+
679+
/*
680+
* Clear pKmsOpen - we can't call nvkms_close_from_kapi() as that
681+
* would try to access hardware through nvKmsClose(). The popen
682+
* structure will be leaked, but this only happens during surprise
683+
* removal which is an abnormal condition.
659684
*/
685+
device->pKmsOpen = NULL;
660686

661687
/* Lower the reference count of gpu - this is safe, no hardware access */
662688
nvkms_close_gpu(device->gpuId);
663689

664690
/* Free kernel memory resources */
665691
if (device->pSema != NULL) {
666692
nvkms_sema_free(device->pSema);
693+
device->pSema = NULL;
667694
}
668695

669696
nvKmsKapiFree(device);

src/nvidia-modeset/src/nvkms-evo.c

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9050,14 +9050,21 @@ NvBool nvFreeDevEvo(NVDevEvoPtr pDevEvo)
90509050
/*
90519051
* If the GPU was lost (surprise removal), skip all hardware-related
90529052
* cleanup. Just free software resources and remove from device list.
9053+
*
9054+
* NOTE: We do NOT call nvFreePerOpenDev() here because the pNvKmsOpenDev
9055+
* is still in the global open list. It will be properly cleaned up during
9056+
* module unload when nvKmsClose iterates through all open handles.
9057+
* Calling nvFreePerOpenDev here would cause a double-free crash.
90539058
*/
90549059
if (pDevEvo->gpuLost) {
90559060
nvEvoLogDev(pDevEvo, EVO_LOG_INFO,
90569061
"Freeing device after GPU lost, skipping hardware cleanup");
90579062

9058-
/* Still need to free the per-open data (software resources only) */
9059-
nvFreePerOpenDev(nvEvoGlobal.nvKmsPerOpen, pDevEvo->pNvKmsOpenDev);
9060-
pDevEvo->pNvKmsOpenDev = NULL;
9063+
/*
9064+
* Invalidate all pOpenDev references to this device before freeing it.
9065+
* This ensures nvKmsClose won't try to access the freed pDevEvo.
9066+
*/
9067+
nvInvalidateDeviceReferences(pDevEvo);
90619068

90629069
goto free_software_resources;
90639070
}
@@ -9125,12 +9132,16 @@ NvBool nvFreeDevEvo(NVDevEvoPtr pDevEvo)
91259132
nvFree(pDevEvo);
91269133

91279134
/*
9128-
* If the GPU was lost and the device list is now empty, reinitialize
9129-
* the global RM client so that newly attached GPUs can be used.
9135+
* NOTE: We intentionally do NOT call nvKmsReinitializeGlobalClient()
9136+
* here even if the device list is empty. The global client handle
9137+
* is still referenced by open handles (pNvKmsOpenDev) that will be
9138+
* cleaned up during module unload by nvKmsClose(). Reinitializing
9139+
* the client here would corrupt those handles and cause a crash.
9140+
*
9141+
* If the user reconnects the GPU before unloading the module, it will
9142+
* work because AllocDevice checks for stale gpuLost devices and cleans
9143+
* them up. The global client doesn't need to be reinitialized for that.
91309144
*/
9131-
if (wasGpuLost && nvListIsEmpty(&nvEvoGlobal.devList)) {
9132-
nvKmsReinitializeGlobalClient();
9133-
}
91349145

91359146
return TRUE;
91369147
}

src/nvidia-modeset/src/nvkms.c

Lines changed: 35 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1635,6 +1635,16 @@ static void DisableRemainingVblankSemControls(
16351635
static void FreeDeviceReference(struct NvKmsPerOpen *pOpen,
16361636
struct NvKmsPerOpenDev *pOpenDev)
16371637
{
1638+
/*
1639+
* If pDevEvo is NULL, the device was already freed due to GPU loss
1640+
* (surprise removal). In this case, skip all hardware-related cleanup
1641+
* and just free the software structures.
1642+
*/
1643+
if (pOpenDev->pDevEvo == NULL) {
1644+
nvFreePerOpenDev(pOpen, pOpenDev);
1645+
return;
1646+
}
1647+
16381648
/* Disable all client-owned vblank sync objects that still exist. */
16391649
DisableRemainingVblankSyncObjects(pOpen, pOpenDev);
16401650

@@ -5327,6 +5337,31 @@ void nvRevokeDevice(NVDevEvoPtr pDevEvo)
53275337
}
53285338
}
53295339

5340+
/*
5341+
* Invalidate all pOpenDev references to a device.
5342+
* Called when GPU is lost to ensure nvKmsClose doesn't access freed pDevEvo.
5343+
* This sets pOpenDev->pDevEvo to NULL for all open handles.
5344+
*/
5345+
void nvInvalidateDeviceReferences(NVDevEvoPtr pDevEvo)
5346+
{
5347+
struct NvKmsPerOpen *pOpen;
5348+
struct NvKmsPerOpenDev *pOpenDev;
5349+
NvKmsGenericHandle dev;
5350+
5351+
if (pDevEvo == NULL) {
5352+
return;
5353+
}
5354+
5355+
nvListForEachEntry(pOpen, &perOpenIoctlList, perOpenIoctlListEntry) {
5356+
FOR_ALL_POINTERS_IN_EVO_API_HANDLES(&pOpen->ioctl.devHandles,
5357+
pOpenDev, dev) {
5358+
if (pOpenDev->pDevEvo == pDevEvo) {
5359+
pOpenDev->pDevEvo = NULL;
5360+
}
5361+
}
5362+
}
5363+
}
5364+
53305365
/*!
53315366
* Open callback.
53325367
*
@@ -6318,44 +6353,6 @@ static void FreeGlobalState(void)
63186353
nvClearDpyOverrides();
63196354
}
63206355

6321-
/*
6322-
* Reinitialize the global RM client after a GPU surprise removal.
6323-
* When a GPU is removed, the RM client handle may become invalid.
6324-
* This function re-creates the client handle so that newly attached
6325-
* GPUs can be used.
6326-
*/
6327-
void nvKmsReinitializeGlobalClient(void)
6328-
{
6329-
NvU32 ret;
6330-
6331-
/*
6332-
* First, try to free the old client handle. This may fail if RM
6333-
* already invalidated it, but that's OK.
6334-
*/
6335-
if (nvEvoGlobal.clientHandle != 0) {
6336-
nvRmApiFree(nvEvoGlobal.clientHandle, nvEvoGlobal.clientHandle,
6337-
nvEvoGlobal.clientHandle);
6338-
nvEvoGlobal.clientHandle = 0;
6339-
}
6340-
6341-
/* Allocate a new root client */
6342-
ret = nvRmApiAlloc(NV01_NULL_OBJECT,
6343-
NV01_NULL_OBJECT,
6344-
NV01_NULL_OBJECT,
6345-
NV01_ROOT,
6346-
&nvEvoGlobal.clientHandle);
6347-
6348-
if (ret != NVOS_STATUS_SUCCESS) {
6349-
nvEvoLog(EVO_LOG_ERROR, "Failed to reinitialize client after GPU removal");
6350-
return;
6351-
}
6352-
6353-
/* Update the RM context */
6354-
nvEvoGlobal.rmSmgContext.clientHandle = nvEvoGlobal.clientHandle;
6355-
6356-
nvEvoLog(EVO_LOG_INFO, "Reinitialized global client after GPU surprise removal");
6357-
}
6358-
63596356
/*
63606357
* Wrappers to help SMG access NvKmsKAPI's RM context.
63616358
*/

0 commit comments

Comments
 (0)