Skip to content

Commit 0224667

Browse files
ankita-nv authored and nirmoy committed
NVIDIA: SAUCE: vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
Add a CXL DVSEC-based readiness check for Blackwell-Next GPUs alongside the existing legacy BAR0 polling path.

On probe and after reset, the driver reads the CXL Device DVSEC capability to determine whether the GPU memory is valid. This is checked by polling the Memory_Active bit for up to the Memory_Active_Timeout. The driver also checks that MEM_INFO_VALID is set within 1 second, per CXL spec 4.0 Tables 8-13; if it is not, an error is returned.

A static inline wrapper dispatches to the appropriate readiness check based on whether the CXL DVSEC capability is present.

Add PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT to pci_regs.h for the timeout field encoding.

Cc: Kevin Tian <kevin.tian@intel.com>
Suggested-by: Alex Williamson <alex@shazbot.org>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
(backported from https://lore.kernel.org/all/20260416014504.63067-1-ankita@nvidia.com/)
[nirmoy: kept both egm_node (existing EGM SAUCE) and cxl_dvsec in struct to avoid conflict with EGM backport]
Signed-off-by: Nirmoy Das <nirmoyd@nvidia.com>
1 parent b2b7ddf commit 0224667

2 files changed

Lines changed: 89 additions & 8 deletions

File tree

drivers/vfio/pci/nvgrace-gpu/main.c

Lines changed: 88 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ struct nvgrace_gpu_pci_core_device {
6666
/* GPU has just been reset */
6767
bool reset_done;
6868
int egm_node;
69+
/* CXL Device DVSEC offset; 0 if not present (legacy GB path) */
70+
int cxl_dvsec;
6971
};
7072

7173
static bool egm_enabled;
@@ -246,7 +248,7 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
246248
vfio_pci_core_close_device(core_vdev);
247249
}
248250

249-
static int nvgrace_gpu_wait_device_ready(void __iomem *io)
251+
static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io)
250252
{
251253
unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
252254

@@ -260,6 +262,76 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io)
260262
return -ETIME;
261263
}
262264

265+
/*
266+
* Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low
267+
* (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2:
268+
* 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s,
269+
* 101b-111b = reserved (clamped to 256s).
270+
*/
271+
static inline unsigned long cxl_mem_active_timeout_ms(u8 timeout)
272+
{
273+
return 1000UL << (2 * min_t(u8, timeout, 4));
274+
}
275+
276+
/*
277+
* Check if CXL DVSEC reports memory as valid and active.
278+
*/
279+
static inline bool cxl_dvsec_mem_is_active(u32 status)
280+
{
281+
return (status & PCI_DVSEC_CXL_MEM_INFO_VALID) &&
282+
(status & PCI_DVSEC_CXL_MEM_ACTIVE);
283+
}
284+
285+
static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev)
286+
{
287+
struct pci_dev *pdev = nvdev->core_device.pdev;
288+
int cxl_dvsec = nvdev->cxl_dvsec;
289+
unsigned long mem_info_valid_deadline;
290+
unsigned long timeout;
291+
u32 dvsec_memory_status;
292+
u8 mem_active_timeout;
293+
294+
pci_read_config_dword(pdev, cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
295+
&dvsec_memory_status);
296+
297+
if (cxl_dvsec_mem_is_active(dvsec_memory_status))
298+
return 0;
299+
300+
mem_active_timeout = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT,
301+
dvsec_memory_status);
302+
303+
timeout = jiffies +
304+
msecs_to_jiffies(cxl_mem_active_timeout_ms(mem_active_timeout));
305+
306+
mem_info_valid_deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS);
307+
308+
do {
309+
pci_read_config_dword(pdev,
310+
cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0),
311+
&dvsec_memory_status);
312+
313+
if (cxl_dvsec_mem_is_active(dvsec_memory_status))
314+
return 0;
315+
316+
/* Bail early if MEM_INFO_VALID is not set within 1 second */
317+
if (!(dvsec_memory_status & PCI_DVSEC_CXL_MEM_INFO_VALID) &&
318+
time_after(jiffies, mem_info_valid_deadline))
319+
return -ETIME;
320+
321+
msleep(POLL_QUANTUM_MS);
322+
} while (!time_after(jiffies, timeout));
323+
324+
return -ETIME;
325+
}
326+
327+
static inline int nvgrace_gpu_wait_device_ready(struct nvgrace_gpu_pci_core_device *nvdev,
328+
void __iomem *io)
329+
{
330+
return nvdev->cxl_dvsec ?
331+
nvgrace_gpu_wait_device_ready_cxl(nvdev) :
332+
nvgrace_gpu_wait_device_ready_legacy(io);
333+
}
334+
263335
/*
264336
* If the GPU memory is accessed by the CPU while the GPU is not ready
265337
* after reset, it can cause harmless corrected RAS events to be logged.
@@ -279,7 +351,7 @@ nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
279351
if (!__vfio_pci_memory_enabled(vdev))
280352
return -EIO;
281353

282-
ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]);
354+
ret = nvgrace_gpu_wait_device_ready(nvdev, vdev->barmap[0]);
283355
if (ret)
284356
return ret;
285357

@@ -1157,11 +1229,16 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
11571229
* Ensure that the BAR0 region is enabled before accessing the
11581230
* registers.
11591231
*/
1160-
static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
1232+
static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev)
11611233
{
1234+
struct pci_dev *pdev = nvdev->core_device.pdev;
11621235
void __iomem *io;
11631236
int ret;
11641237

1238+
/* CXL path only reads PCI config space; no need to map BAR0. */
1239+
if (nvdev->cxl_dvsec)
1240+
return nvgrace_gpu_wait_device_ready_cxl(nvdev);
1241+
11651242
ret = pci_enable_device(pdev);
11661243
if (ret)
11671244
return ret;
@@ -1176,7 +1253,7 @@ static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev)
11761253
goto iomap_exit;
11771254
}
11781255

1179-
ret = nvgrace_gpu_wait_device_ready(io);
1256+
ret = nvgrace_gpu_wait_device_ready_legacy(io);
11801257

11811258
pci_iounmap(pdev, io);
11821259
iomap_exit:
@@ -1195,10 +1272,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
11951272
u64 egmpxm;
11961273
int ret;
11971274

1198-
ret = nvgrace_gpu_probe_check_device_ready(pdev);
1199-
if (ret)
1200-
return ret;
1201-
12021275
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
12031276
if (!ret) {
12041277
ops = &nvgrace_gpu_pci_ops;
@@ -1215,6 +1288,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
12151288

12161289
dev_set_drvdata(&pdev->dev, &nvdev->core_device);
12171290

1291+
nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
1292+
PCI_DVSEC_CXL_DEVICE);
1293+
1294+
ret = nvgrace_gpu_probe_check_device_ready(nvdev);
1295+
if (ret)
1296+
goto out_put_vdev;
1297+
12181298
if (ops == &nvgrace_gpu_pci_ops) {
12191299
nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
12201300

include/uapi/linux/pci_regs.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,6 +1371,7 @@
13711371
#define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10))
13721372
#define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0)
13731373
#define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1)
1374+
#define PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT __GENMASK(15, 13)
13741375
#define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28)
13751376
#define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10))
13761377
#define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10))

0 commit comments

Comments
 (0)