2727#include "nv-msi.h"
2828#include "nv-hypervisor.h"
2929#include "nv-reg.h"
30+ #include "nv-rsync.h"
3031
3132#if defined(NV_VGPU_KVM_BUILD )
3233#include "nv-vgpu-vfio-interface.h"
@@ -1736,6 +1737,13 @@ nv_pci_remove(struct pci_dev *pci_dev)
17361737
17371738 nv = NV_STATE_PTR (nvl );
17381739
1740+ /*
1741+ * Note: For external GPUs (eGPU via Thunderbolt), the NV_FLAG_IN_SURPRISE_REMOVAL
1742+ * flag is set later in the removal process - either when the wait for the usage
1743+ * count times out, or when an actual surprise removal is detected. Setting it too early
1744+ * can interfere with normal cleanup operations that need to acquire GPU locks.
1745+ */
1746+
17391747#if NV_IS_EXPORT_SYMBOL_GPL_iommu_dev_disable_feature
17401748#if defined(CONFIG_IOMMU_SVA ) && \
17411749 (defined(NV_IOASID_GET_PRESENT ) || defined(NV_MM_PASID_DROP_PRESENT ))
@@ -1773,22 +1781,34 @@ nv_pci_remove(struct pci_dev *pci_dev)
17731781 /*
17741782 * Sanity check: A removed device shouldn't have a non-zero usage_count.
17751783 * For eGPU, falling off the bus while clients are still active is a valid scenario.
1776- * Hence skipping the sanity check for eGPU.
1784+ * We still wait for a short time to allow in-progress close operations
1785+ * to complete, but with a timeout to prevent hangs.
17771786 */
1778- if (( NV_ATOMIC_READ (nvl -> usage_count ) != 0 ) && !( nv -> is_external_gpu ) )
1787+ if (NV_ATOMIC_READ (nvl -> usage_count ) != 0 )
17791788 {
1789+ /*
1790+ * For external GPU: wait up to 5 seconds (10 iterations * 500ms)
1791+ * For internal GPU: wait up to 60 seconds (120 iterations * 500ms)
1792+ * This prevents indefinite hangs while still allowing time for
1793+ * graceful cleanup of in-progress operations.
1794+ */
1795+ int max_wait_iterations = nv -> is_external_gpu ? 10 : 120 ;
1796+ int wait_iterations = 0 ;
1797+
17801798 nv_printf (NV_DBG_ERRORS ,
1781- "NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count! \n" ,
1799+ "NVRM: Attempting to remove device %04x:%02x:%02x.%x with non-zero usage count (%d)%s \n" ,
17821800 NV_PCI_DOMAIN_NUMBER (pci_dev ), NV_PCI_BUS_NUMBER (pci_dev ),
1783- NV_PCI_SLOT_NUMBER (pci_dev ), PCI_FUNC (pci_dev -> devfn ));
1801+ NV_PCI_SLOT_NUMBER (pci_dev ), PCI_FUNC (pci_dev -> devfn ),
1802+ NV_ATOMIC_READ (nvl -> usage_count ),
1803+ nv -> is_external_gpu ? " (external GPU)" : "" );
17841804
17851805 /*
17861806 * We can't return from this function without corrupting state, so we wait for
1787- * the usage count to go to zero.
1807+ * the usage count to go to zero, giving up after a bounded timeout (removal is then forced).
17881808 */
1789- while (NV_ATOMIC_READ (nvl -> usage_count ) != 0 )
1809+ while ((NV_ATOMIC_READ (nvl -> usage_count ) != 0 ) &&
1810+ (wait_iterations < max_wait_iterations ))
17901811 {
1791-
17921812 /*
17931813 * While waiting, release the locks so that other threads can make
17941814 * forward progress.
@@ -1797,6 +1817,7 @@ nv_pci_remove(struct pci_dev *pci_dev)
17971817 UNLOCK_NV_LINUX_DEVICES ();
17981818
17991819 os_delay (500 );
1820+ wait_iterations ++ ;
18001821
18011822 /* Re-acquire the locks before checking again */
18021823 LOCK_NV_LINUX_DEVICES ();
@@ -1815,10 +1836,32 @@ nv_pci_remove(struct pci_dev *pci_dev)
18151836 down (& nvl -> ldata_lock );
18161837 }
18171838
1818- nv_printf (NV_DBG_ERRORS ,
1819- "NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n" ,
1820- NV_PCI_DOMAIN_NUMBER (pci_dev ), NV_PCI_BUS_NUMBER (pci_dev ),
1821- NV_PCI_SLOT_NUMBER (pci_dev ), PCI_FUNC (pci_dev -> devfn ));
1839+ if (NV_ATOMIC_READ (nvl -> usage_count ) != 0 )
1840+ {
1841+ nv_printf (NV_DBG_ERRORS ,
1842+ "NVRM: Timeout waiting for usage count on device %04x:%02x:%02x.%x (remaining: %d). Forcing removal.\n" ,
1843+ NV_PCI_DOMAIN_NUMBER (pci_dev ), NV_PCI_BUS_NUMBER (pci_dev ),
1844+ NV_PCI_SLOT_NUMBER (pci_dev ), PCI_FUNC (pci_dev -> devfn ),
1845+ NV_ATOMIC_READ (nvl -> usage_count ));
1846+ /*
1847+ * Force the surprise removal flag so that any remaining
1848+ * close operations will take the fast-path.
1849+ */
1850+ nv -> flags |= NV_FLAG_IN_SURPRISE_REMOVAL ;
1851+
1852+ /*
1853+ * Mark that we had a surprise removal so rsync cleanup
1854+ * warnings are suppressed during module unload.
1855+ */
1856+ nv_set_rsync_had_surprise_removal ();
1857+ }
1858+ else
1859+ {
1860+ nv_printf (NV_DBG_ERRORS ,
1861+ "NVRM: Continuing with GPU removal for device %04x:%02x:%02x.%x\n" ,
1862+ NV_PCI_DOMAIN_NUMBER (pci_dev ), NV_PCI_BUS_NUMBER (pci_dev ),
1863+ NV_PCI_SLOT_NUMBER (pci_dev ), PCI_FUNC (pci_dev -> devfn ));
1864+ }
18221865 }
18231866
18241867 rm_check_for_gpu_surprise_removal (sp , nv );
0 commit comments