Skip to content

Commit 91eb137

Browse files
committed
Fix: Thunderbolt eGPU hot-plug/unplug kernel crash support
1 parent 2af9f1f commit 91eb137

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1099
-110
lines changed

kernel-open/common/inc/nvkms-kapi.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,17 @@ struct NvKmsKapiFunctionsTable {
660660
*/
661661
void (*freeDevice)(struct NvKmsKapiDevice *device);
662662

663+
/*!
664+
* Frees a device during surprise removal (e.g., Thunderbolt eGPU unplug).
665+
* This skips all hardware access and only releases kernel resources.
666+
* Use this instead of freeDevice() when the GPU hardware is no longer
667+
* accessible to avoid page faults and hangs.
668+
*
669+
* \param [in] device A device returned by allocateDevice().
670+
* This function is a no-op if device is not valid.
671+
*/
672+
void (*freeDeviceForSurpriseRemoval)(struct NvKmsKapiDevice *device);
673+
663674
/*!
664675
* Grab ownership of device, ownership is required to do modeset.
665676
*

kernel-open/nvidia-drm/nvidia-drm-drv.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -852,6 +852,43 @@ static void nv_drm_dev_unload(struct drm_device *dev)
852852
return;
853853
}
854854

855+
/*
856+
* During surprise removal (e.g., Thunderbolt eGPU hot-unplug),
857+
* the GPU hardware is no longer accessible. Skip NVKMS calls that
858+
* would access hardware to prevent page faults and crashes.
859+
* Use freeDeviceForSurpriseRemoval which only releases kernel resources
860+
* without attempting any hardware access.
861+
*/
862+
if (nv_dev->inSurpriseRemoval) {
863+
NV_DRM_DEV_LOG_INFO(nv_dev,
864+
"Surprise removal detected, skipping hardware access");
865+
866+
cancel_delayed_work_sync(&nv_dev->hotplug_event_work);
867+
mutex_lock(&nv_dev->lock);
868+
869+
atomic_set(&nv_dev->enable_event_handling, false);
870+
drm_kms_helper_poll_fini(dev);
871+
drm_mode_config_cleanup(dev);
872+
873+
pDevice = nv_dev->pDevice;
874+
nv_dev->pDevice = NULL;
875+
876+
mutex_unlock(&nv_dev->lock);
877+
878+
/*
879+
* Use freeDeviceForSurpriseRemoval instead of freeDevice.
880+
* This skips KmsFreeDevice() and RmFreeDevice() which would try
881+
* to access GPU hardware via ioctls/RM API calls and cause
882+
* page faults since the GPU memory is unmapped.
883+
* It only calls nvkms_close_gpu() to release the GPU reference
884+
* count, allowing the eGPU to be re-initialized when reconnected.
885+
*/
886+
if (pDevice != NULL) {
887+
nvKms->freeDeviceForSurpriseRemoval(pDevice);
888+
}
889+
return;
890+
}
891+
855892
/* Release modeset ownership if fbdev is enabled */
856893

857894
#if defined(NV_DRM_FBDEV_AVAILABLE)
@@ -2160,6 +2197,28 @@ static void nv_drm_dev_destroy(struct nv_drm_device *nv_dev)
21602197
nv_drm_free(nv_dev);
21612198
}
21622199

2200+
/*
2201+
* Return the PCI device backing a DRM device, on kernels with or without drm_device.pdev.
2202+
* Returns NULL if dev is NULL or, when drm_device.pdev is unavailable, if the parent device is missing or is not on the PCI bus.
2203+
*/
2204+
static struct pci_dev *nv_drm_get_pci_dev(struct drm_device *dev)
2205+
{
2206+
if (dev == NULL) {
2207+
return NULL;
2208+
}
2209+
2210+
#if defined(NV_DRM_DEVICE_HAS_PDEV)
2211+
return dev->pdev;
2212+
#else
2213+
/* drm_device.pdev is not available here (the original note says it was removed in kernels 5.14+),
2214+
* so derive the PCI device from the parent struct device, but only when that device sits on the PCI bus. */
2215+
if (dev->dev != NULL && dev->dev->bus == &pci_bus_type) {
2216+
return to_pci_dev(dev->dev);
2217+
}
2218+
return NULL;
2219+
#endif
2220+
}
2221+
21632222
/*
21642223
* Unregister a single NVIDIA DRM device.
21652224
*/
@@ -2168,7 +2227,26 @@ void nv_drm_remove(NvU32 gpuId)
21682227
struct nv_drm_device *nv_dev = nv_drm_find_and_remove_device(gpuId);
21692228

21702229
if (nv_dev) {
2230+
struct pci_dev *pdev;
2231+
21712232
NV_DRM_DEV_LOG_INFO(nv_dev, "Removing device");
2233+
2234+
/*
2235+
* Check if this is a surprise removal (hot-unplug) by testing
2236+
* if the PCI channel is offline. This happens when:
2237+
* - Thunderbolt eGPU is physically disconnected
2238+
* - GPU falls off the bus unexpectedly
2239+
*
2240+
* For normal driver unload (rmmod), the PCI channel remains online.
2241+
* We only skip NVKMS hardware access during surprise removal.
2242+
*/
2243+
pdev = nv_drm_get_pci_dev(nv_dev->dev);
2244+
if (pdev != NULL && pci_channel_offline(pdev)) {
2245+
NV_DRM_DEV_LOG_INFO(nv_dev,
2246+
"PCI channel offline - surprise removal detected");
2247+
nv_dev->inSurpriseRemoval = NV_TRUE;
2248+
}
2249+
21722250
drm_dev_unplug(nv_dev->dev);
21732251
nv_drm_dev_destroy(nv_dev);
21742252
}

kernel-open/nvidia-drm/nvidia-drm-priv.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,21 @@ struct nv_drm_device {
146146
NvBool subOwnershipGranted;
147147
NvBool hasFramebufferConsole;
148148

149+
/*
150+
* Set to NV_TRUE for external GPUs (e.g., Thunderbolt/USB4 eGPU).
151+
* External GPUs use the fast removal path to avoid hangs during
152+
* both surprise removal and "safe" software-initiated disconnect.
153+
*/
154+
NvBool isExternalGpu;
155+
156+
/*
157+
* Set to NV_TRUE when the device is being removed due to
158+
* surprise removal (e.g., Thunderbolt eGPU hot-unplug).
159+
* When set, NVKMS operations that would access GPU hardware
160+
* are skipped to prevent crashes from accessing unmapped memory.
161+
*/
162+
NvBool inSurpriseRemoval;
163+
149164
struct drm_property *nv_out_fence_property;
150165
struct drm_property *nv_input_colorspace_property;
151166

kernel-open/nvidia-modeset/nvidia-modeset-linux.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,27 @@ void nvkms_close_gpu(NvU32 gpuId)
12041204
__rm_ops.free_stack(stack);
12051205
}
12061206

1207+
void nvkms_gpu_lost(NvU32 gpuId)
1208+
{
1209+
/*
1210+
* Mark the GPU identified by gpuId as lost by forwarding to nvKmsGpuLost().
1211+
* Per the original author this prevents hardware access and cancels pending timers; nvKmsGpuLost() is defined outside this view, so confirm there.
1212+
*
1213+
* NOTE: We intentionally do NOT take nvkms_lock here because this function
1214+
* may be called from contexts that already hold the lock (e.g., during
1215+
* module unload). The author's stated assumption is that the gpuLost flag is a
1216+
* plain boolean written without a lock, so any racing operation will either:
1217+
* 1. See gpuLost=TRUE and bail out early
1218+
* 2. See gpuLost=FALSE but hit the 0xFFFFFFFF check when reading hardware
1219+
*
1220+
* NOTE(review): smp_wmb() only orders this CPU's earlier stores before its later stores. It does not make the write visible sooner, and readers need a paired read barrier - confirm against the flag's readers.
1221+
*/
1222+
nvKmsGpuLost(gpuId);
1223+
1224+
/* Order the gpuLost store ahead of any later stores issued by this CPU */
1225+
smp_wmb();
1226+
}
1227+
12071228
NvU32 nvkms_enumerate_gpus(nv_gpu_info_t *gpu_info)
12081229
{
12091230
return __rm_ops.enumerate_gpus(gpu_info);

kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,12 @@ void* nvkms_get_per_open_data(int fd);
309309
NvBool nvkms_open_gpu(NvU32 gpuId);
310310
void nvkms_close_gpu(NvU32 gpuId);
311311

312+
/*!
313+
* Mark a GPU as lost (surprise removal, e.g., Thunderbolt eGPU unplug).
314+
* This prevents hardware access and cancels pending timers.
315+
*/
316+
void nvkms_gpu_lost(NvU32 gpuId);
317+
312318

313319
/*!
314320
* Enumerate nvidia gpus.

kernel-open/nvidia-modeset/nvkms.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ void nvKmsModuleUnload(void);
8888
void nvKmsSuspend(NvU32 gpuId);
8989
void nvKmsResume(NvU32 gpuId);
9090

91+
void nvKmsGpuLost(NvU32 gpuId);
92+
9193
void nvKmsGetProcFiles(const nvkms_procfs_file_t **ppProcFiles);
9294

9395
NvBool nvKmsReadConf(const char *buff, size_t size,

kernel-open/nvidia/nv-acpi.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,12 +261,28 @@ static void nv_acpi_notify_event(acpi_handle handle, u32 event_type, void *data)
261261
{
262262
nv_acpi_t *pNvAcpiObject = data;
263263
nv_state_t *nvl = pNvAcpiObject->notifier_data;
264+
nv_state_t *nv;
265+
266+
if (nvl == NULL)
267+
return;
268+
269+
nv = NV_STATE_PTR(nvl);
270+
if (nv == NULL)
271+
return;
272+
273+
/*
274+
* Check if we're in surprise removal before processing ACPI events.
275+
* This can happen during Thunderbolt eGPU hot-unplug where the device
276+
* is being removed but ACPI events are still being delivered.
277+
*/
278+
if (nv->flags & NV_FLAG_IN_SURPRISE_REMOVAL)
279+
return;
264280

265281
/*
266282
* Function to handle device specific ACPI events such as display hotplug,
267283
* GPS and D-notifier events.
268284
*/
269-
rm_acpi_notify(pNvAcpiObject->sp, NV_STATE_PTR(nvl), event_type);
285+
rm_acpi_notify(pNvAcpiObject->sp, nv, event_type);
270286
}
271287

272288
void nv_acpi_register_notifier(nv_linux_state_t *nvl)

kernel-open/nvidia/nv-i2c.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@ static int nv_i2c_algo_master_xfer(struct i2c_adapter *adapter, struct i2c_msg m
4444
#endif
4545
;
4646

47+
/*
48+
* Check if the GPU is in surprise removal (e.g., Thunderbolt unplug).
49+
* If so, return immediately to avoid hanging on RPC calls to GSP.
50+
*/
51+
if (nv_check_gpu_state(nv) != NV_OK)
52+
{
53+
return -ENODEV;
54+
}
55+
4756
rc = nv_kmem_cache_alloc_stack(&sp);
4857
if (rc != 0)
4958
{
@@ -93,6 +102,15 @@ static int nv_i2c_algo_smbus_xfer(
93102
NV_STATUS rmStatus = NV_OK;
94103
nvidia_stack_t *sp = NULL;
95104

105+
/*
106+
* Check if the GPU is in surprise removal (e.g., Thunderbolt unplug).
107+
* If so, return immediately to avoid hanging on RPC calls to GSP.
108+
*/
109+
if (nv_check_gpu_state(nv) != NV_OK)
110+
{
111+
return -ENODEV;
112+
}
113+
96114
rc = nv_kmem_cache_alloc_stack(&sp);
97115
if (rc != 0)
98116
{
@@ -196,6 +214,15 @@ static u32 nv_i2c_algo_functionality(struct i2c_adapter *adapter)
196214
u32 ret = I2C_FUNC_I2C;
197215
nvidia_stack_t *sp = NULL;
198216

217+
/*
218+
* Check if the GPU is in surprise removal (e.g., Thunderbolt unplug).
219+
* If so, return 0 to indicate no functionality available.
220+
*/
221+
if (nv_check_gpu_state(nv) != NV_OK)
222+
{
223+
return 0;
224+
}
225+
199226
if (nv_kmem_cache_alloc_stack(&sp) != 0)
200227
{
201228
return 0;

kernel-open/nvidia/nv-pci.c

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "nv-msi.h"
2828
#include "nv-hypervisor.h"
2929
#include "nv-reg.h"
30+
#include "nv-rsync.h"
3031

3132
#if defined(NV_VGPU_KVM_BUILD)
3233
#include "nv-vgpu-vfio-interface.h"
@@ -1736,6 +1737,13 @@ nv_pci_remove(struct pci_dev *pci_dev)
17361737

17371738
nv = NV_STATE_PTR(nvl);
17381739

1740+
/*
1741+
* Note: For external GPUs (eGPU via Thunderbolt), the NV_FLAG_IN_SURPRISE_REMOVAL
1742+
* flag is set later in the removal process - either when the wait for the usage
1743+
* count times out, or when an actual surprise removal is detected. Setting it too early
1744+
* can interfere with normal cleanup operations that need to acquire GPU locks.
1745+
*/
1746+
17391747
#if NV_IS_EXPORT_SYMBOL_GPL_iommu_dev_disable_feature
17401748
#if defined(CONFIG_IOMMU_SVA) && \
17411749
(defined(NV_IOASID_GET_PRESENT) || defined(NV_MM_PASID_DROP_PRESENT))
@@ -1773,22 +1781,34 @@ nv_pci_remove(struct pci_dev *pci_dev)
17731781
/*
17741782
* Sanity check: A removed device shouldn't have a non-zero usage_count.
17751783
* For an eGPU, falling off the bus while clients are still active is a valid scenario.
1776-
* Hence skipping the sanity check for eGPU.
1784+
* We still wait for a short time to allow in-progress close operations
1785+
* to complete, but with a timeout to prevent hangs.
17771786
*/
1778-
if ((NV_ATOMIC_READ(nvl->usage_count) != 0) && !(nv->is_external_gpu))
1787+
if (NV_ATOMIC_READ(nvl->usage_count) != 0)
17791788
{
1789+
/*
1790+
* For external GPU: wait up to 5 seconds (10 iterations * 500ms)
1791+
* For internal GPU: wait up to 60 seconds (120 iterations * 500ms)
1792+
* This prevents indefinite hangs while still allowing time for
1793+
* graceful cleanup of in-progress operations.
1794+
*/
1795+
int max_wait_iterations = nv->is_external_gpu ? 10 : 120;
1796+
int wait_iterations = 0;
1797+
17801798
nv_printf(NV_DBG_ERRORS,
1781-
"NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count!\n",
1799+
"NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count (%d)%s\n",
17821800
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
1783-
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
1801+
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn),
1802+
NV_ATOMIC_READ(nvl->usage_count),
1803+
nv->is_external_gpu ? " (external GPU)" : "");
17841804

17851805
/*
17861806
* We can't return from this function without corrupting state, so we wait for
1787-
* the usage count to go to zero.
1807+
* the usage count to go to zero, but with a timeout.
17881808
*/
1789-
while (NV_ATOMIC_READ(nvl->usage_count) != 0)
1809+
while ((NV_ATOMIC_READ(nvl->usage_count) != 0) &&
1810+
(wait_iterations < max_wait_iterations))
17901811
{
1791-
17921812
/*
17931813
* While waiting, release the locks so that other threads can make
17941814
* forward progress.
@@ -1797,6 +1817,7 @@ nv_pci_remove(struct pci_dev *pci_dev)
17971817
UNLOCK_NV_LINUX_DEVICES();
17981818

17991819
os_delay(500);
1820+
wait_iterations++;
18001821

18011822
/* Re-acquire the locks before checking again */
18021823
LOCK_NV_LINUX_DEVICES();
@@ -1815,10 +1836,32 @@ nv_pci_remove(struct pci_dev *pci_dev)
18151836
down(&nvl->ldata_lock);
18161837
}
18171838

1818-
nv_printf(NV_DBG_ERRORS,
1819-
"NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n",
1820-
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
1821-
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
1839+
if (NV_ATOMIC_READ(nvl->usage_count) != 0)
1840+
{
1841+
nv_printf(NV_DBG_ERRORS,
1842+
"NVRM: Timeout waiting for usage count on device %04x:%02x:%02x.%x (remaining: %d). Forcing removal.\n",
1843+
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
1844+
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn),
1845+
NV_ATOMIC_READ(nvl->usage_count));
1846+
/*
1847+
* Force the surprise removal flag so that any remaining
1848+
* close operations will take the fast-path.
1849+
*/
1850+
nv->flags |= NV_FLAG_IN_SURPRISE_REMOVAL;
1851+
1852+
/*
1853+
* Mark that we had a surprise removal so rsync cleanup
1854+
* warnings are suppressed during module unload.
1855+
*/
1856+
nv_set_rsync_had_surprise_removal();
1857+
}
1858+
else
1859+
{
1860+
nv_printf(NV_DBG_ERRORS,
1861+
"NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n",
1862+
NV_PCI_DOMAIN_NUMBER(pci_dev), NV_PCI_BUS_NUMBER(pci_dev),
1863+
NV_PCI_SLOT_NUMBER(pci_dev), PCI_FUNC(pci_dev->devfn));
1864+
}
18221865
}
18231866

18241867
rm_check_for_gpu_surprise_removal(sp, nv);

0 commit comments

Comments
 (0)