Skip to content

Commit 61e3d30

Browse files
sourabgupta3nvidia-bfigg
authored andcommitted
NVIDIA: SAUCE: Patch NVMe/NVMeoF driver to support GDS on Linux 7.0 Kernel
BugLink: https://bugs.launchpad.net/bugs/2150289 BugLink: https://bugs.launchpad.net/bugs/2134960 This change enables the NVMe and NVMeoF drivers to support GPUDirect Storage (GDS). The NVMe driver introduced a way to DMA map requests with the blk_rq_dma_map API instead of scatter-gather lists. With these changes, the GDS path adopts a similar framework: it introduces blk-based APIs (nvfs_blk_rq_dma_map_iter_start and nvfs_blk_rq_dma_map_iter_next) to DMA map a request. The NVMeoF path remains the same as in previous releases. Signed-off-by: Sourab Gupta <sougupta@nvidia.com> Reviewed-by: Kiran Modukuri <kmodukuri@nvidia.com> Acked-by: Matthew R. Ochs <mochs@nvidia.com> Acked-by: Nirmoy Das <nirmoyd@nvidia.com> Acked-by: Jamie Nguyen <jamien@nvidia.com> Acked-by: Carol L Soto <csoto@nvidia.com> Signed-off-by: Brad Figg <bfigg@nvidia.com>
1 parent 53c2516 commit 61e3d30

8 files changed

Lines changed: 676 additions & 7 deletions

File tree

drivers/nvme/host/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# SPDX-License-Identifier: GPL-2.0
22

33
ccflags-y += -I$(src)
4-
4+
ccflags-y += -DCONFIG_NVFS
55
obj-$(CONFIG_NVME_CORE) += nvme-core.o
66
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
77
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@@ -20,10 +20,11 @@ nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
2020
nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o
2121

2222
nvme-y += pci.o
23-
23+
nvme-y += nvfs-dma.o
2424
nvme-fabrics-y += fabrics.o
2525

2626
nvme-rdma-y += rdma.o
27+
nvme-rdma-y += nvfs-rdma.o
2728

2829
nvme-fc-y += fc.o
2930

drivers/nvme/host/nvfs-dma.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
*
5+
* This program is free software; you can redistribute it and/or modify it
6+
* under the terms and conditions of the GNU General Public License,
7+
* version 2, as published by the Free Software Foundation.
8+
*
9+
* This program is distributed in the hope it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12+
* more details.
13+
*/
14+
#ifdef CONFIG_NVFS
15+
#define NVFS_USE_DMA_ITER_API
16+
#define MODULE_PREFIX nvme_v2
17+
#include "nvfs.h"
18+
19+
struct nvfs_dma_rw_blk_iter_ops *nvfs_ops = NULL;
20+
21+
atomic_t nvfs_shutdown = ATOMIC_INIT(1);
22+
23+
DEFINE_PER_CPU(long, nvfs_n_ops);
24+
25+
#define NVIDIA_FS_COMPAT_FT(ops) \
26+
(NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_START(ops) && NVIDIA_FS_CHECK_FT_BLK_DMA_MAP_ITER_NEXT(ops))
27+
28+
// protected via nvfs_module_mutex
29+
int REGISTER_FUNC(struct nvfs_dma_rw_blk_iter_ops *ops)
30+
{
31+
if (NVIDIA_FS_COMPAT_FT(ops)) {
32+
nvfs_ops = ops;
33+
atomic_set(&nvfs_shutdown, 0);
34+
return 0;
35+
} else
36+
return -EOPNOTSUPP;
37+
38+
}
39+
EXPORT_SYMBOL_GPL(REGISTER_FUNC);
40+
41+
// protected via nvfs_module_mutex
42+
void UNREGISTER_FUNC(void)
43+
{
44+
(void) atomic_cmpxchg(&nvfs_shutdown, 0, 1);
45+
do {
46+
msleep(NVFS_HOLD_TIME_MS);
47+
} while(nvfs_count_ops());
48+
nvfs_ops = NULL;
49+
}
50+
EXPORT_SYMBOL_GPL(UNREGISTER_FUNC);
51+
#endif

drivers/nvme/host/nvfs-dma.h

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
*
5+
* This program is free software; you can redistribute it and/or modify it
6+
* under the terms and conditions of the GNU General Public License,
7+
* version 2, as published by the Free Software Foundation.
8+
*
9+
* This program is distributed in the hope it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12+
* more details.
13+
*/
14+
15+
#ifndef NVFS_DMA_H
16+
#define NVFS_DMA_H
17+
18+
/* Forward declarations for functions from pci.c that we need */
19+
static blk_status_t nvme_pci_setup_data_prp(struct request *req,
20+
struct blk_dma_iter *iter);
21+
static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
22+
struct blk_dma_iter *iter);
23+
static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
24+
struct nvme_iod *iod);
25+
static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd);
26+
27+
static inline bool nvme_nvfs_unmap_sgls(struct request *req)
28+
{
29+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
30+
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
31+
struct device *dma_dev = nvmeq->dev->dev;
32+
unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
33+
struct nvme_sgl_desc *sg_list = iod->descriptors[0];
34+
enum dma_data_direction dir = rq_dma_dir(req);
35+
36+
/*
37+
* nr_descriptors == 0 means dma_pool_alloc failed before any SGL
38+
* entries were recorded; the first iter mapping is handled by
39+
* nvme_nvfs_map_data() directly, so nothing to unmap here.
40+
*/
41+
if (iod->nr_descriptors) {
42+
unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
43+
44+
for (i = 0; i < nr_entries; i++) {
45+
nvfs_ops->nvfs_dma_unmap_page(dma_dev,
46+
iod->nvfs_cookie,
47+
le64_to_cpu(sg_list[i].addr),
48+
le32_to_cpu(sg_list[i].length),
49+
dir);
50+
}
51+
}
52+
53+
return true;
54+
}
55+
56+
static inline bool nvme_nvfs_unmap_prps(struct request *req)
57+
{
58+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
59+
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
60+
struct device *dma_dev = nvmeq->dev->dev;
61+
enum dma_data_direction dma_dir = rq_dma_dir(req);
62+
unsigned int i;
63+
64+
/* Check if dma_vecs was allocated - if setup failed early, it might be NULL */
65+
if (!iod->dma_vecs)
66+
return true;
67+
68+
/* Unmap all DMA vectors - pass page pointer from dma_vecs */
69+
for (i = 0; i < iod->nr_dma_vecs; i++) {
70+
nvfs_ops->nvfs_dma_unmap_page(dma_dev,
71+
iod->nvfs_cookie,
72+
iod->dma_vecs[i].addr,
73+
iod->dma_vecs[i].len,
74+
dma_dir);
75+
}
76+
77+
/* Free the dma_vecs mempool allocation */
78+
mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
79+
iod->dma_vecs = NULL;
80+
iod->nr_dma_vecs = 0;
81+
82+
return true;
83+
}
84+
85+
static inline void nvme_nvfs_free_descriptors(struct request *req)
86+
{
87+
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
88+
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
89+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
90+
dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
91+
int i;
92+
93+
if (iod->nr_descriptors == 1) {
94+
dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
95+
dma_addr);
96+
return;
97+
}
98+
99+
for (i = 0; i < iod->nr_descriptors; i++) {
100+
__le64 *prp_list = iod->descriptors[i];
101+
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
102+
103+
dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
104+
dma_addr);
105+
dma_addr = next_dma_addr;
106+
}
107+
}
108+
109+
static inline bool nvme_nvfs_unmap_data(struct request *req)
110+
{
111+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
112+
bool ret;
113+
114+
/* Check if this was an NVFS I/O by checking the IOD_NVFS_IO flag */
115+
if (!(iod->flags & IOD_NVFS_IO))
116+
return false;
117+
118+
/* Clear the NVFS flag */
119+
iod->flags &= ~IOD_NVFS_IO;
120+
121+
/* Call appropriate unmap function based on command type */
122+
if (nvme_pci_cmd_use_sgl(&iod->cmd))
123+
ret = nvme_nvfs_unmap_sgls(req);
124+
else
125+
ret = nvme_nvfs_unmap_prps(req);
126+
127+
if (iod->nr_descriptors)
128+
nvme_nvfs_free_descriptors(req);
129+
130+
nvfs_put_ops();
131+
return ret;
132+
}
133+
134+
static inline blk_status_t nvme_nvfs_map_data(struct request *req,
135+
bool *is_nvfs_io)
136+
{
137+
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
138+
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
139+
struct nvme_dev *dev = nvmeq->dev;
140+
struct device *dma_dev = nvmeq->dev->dev;
141+
enum nvme_use_sgl use_sgl = nvme_pci_use_sgls(dev, req);
142+
struct blk_dma_iter iter;
143+
blk_status_t ret = BLK_STS_RESOURCE;
144+
145+
*is_nvfs_io = false;
146+
147+
/* Check integrity and try to get nvfs_ops */
148+
if (blk_integrity_rq(req) || !nvfs_get_ops()) {
149+
return ret;
150+
}
151+
152+
/* Initialize total_len for this request */
153+
iod->total_len = 0;
154+
155+
if (!nvfs_ops->nvfs_blk_rq_dma_map_iter_start(req, dma_dev,
156+
&iod->dma_state, &iter, &iod->nvfs_cookie)) {
157+
nvfs_put_ops();
158+
if (iter.status == BLK_STS_IOERR) {
159+
/* GPU DMA error — do not fall through to CPU path */
160+
*is_nvfs_io = true;
161+
ret = iter.status;
162+
}
163+
/* else: CPU page, let caller fall through to CPU path */
164+
return ret;
165+
}
166+
167+
/* NVFS can handle this request, set the flag */
168+
*is_nvfs_io = true;
169+
iod->flags |= IOD_NVFS_IO;
170+
171+
if (use_sgl == SGL_FORCED ||
172+
(use_sgl == SGL_SUPPORTED &&
173+
(sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
174+
ret = nvme_pci_setup_data_sgl(req, &iter);
175+
else
176+
ret = nvme_pci_setup_data_prp(req, &iter);
177+
178+
/* If setup failed, cleanup: unmap DMA, clear flag, release ops */
179+
if (ret != BLK_STS_OK) {
180+
/*
181+
* If setup failed before any mappings were tracked (dma_vecs is
182+
* NULL for PRP, or nr_descriptors is 0 for SGL), the first page
183+
* mapped by nvfs_blk_rq_dma_map_iter_start() won't be covered by
184+
* nvme_nvfs_unmap_data(). Unmap it directly using iter.
185+
*/
186+
bool early_fail = nvme_pci_cmd_use_sgl(&iod->cmd) ?
187+
!iod->nr_descriptors : !iod->dma_vecs;
188+
if (early_fail)
189+
nvfs_ops->nvfs_dma_unmap_page(dma_dev, iod->nvfs_cookie,
190+
iter.addr, iter.len, rq_dma_dir(req));
191+
nvme_nvfs_unmap_data(req);
192+
}
193+
194+
return ret;
195+
}
196+
197+
#endif /* NVFS_DMA_H */

drivers/nvme/host/nvfs-rdma.c

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
*
5+
* This program is free software; you can redistribute it and/or modify it
6+
* under the terms and conditions of the GNU General Public License,
7+
* version 2, as published by the Free Software Foundation.
8+
*
9+
* This program is distributed in the hope it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12+
* more details.
13+
*/
14+
15+
#ifdef CONFIG_NVFS
16+
#define MODULE_PREFIX nvme_rdma_v1
17+
#include "nvfs.h"
18+
19+
struct nvfs_dma_rw_ops *nvfs_ops;
20+
21+
atomic_t nvfs_shutdown = ATOMIC_INIT(1);
22+
23+
DEFINE_PER_CPU(long, nvfs_n_ops);
24+
25+
// must have for compatability
26+
#define NVIDIA_FS_COMPAT_FT(ops) \
27+
(NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops))
28+
29+
// protected via nvfs_module_mutex
30+
int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
31+
{
32+
if (NVIDIA_FS_COMPAT_FT(ops)) {
33+
nvfs_ops = ops;
34+
atomic_set(&nvfs_shutdown, 0);
35+
return 0;
36+
} else
37+
return -EOPNOTSUPP;
38+
39+
}
40+
EXPORT_SYMBOL_GPL(REGISTER_FUNC);
41+
42+
// protected via nvfs_module_mutex
43+
void UNREGISTER_FUNC(void)
44+
{
45+
(void) atomic_cmpxchg(&nvfs_shutdown, 0, 1);
46+
do {
47+
msleep(NVFS_HOLD_TIME_MS);
48+
} while(nvfs_count_ops());
49+
nvfs_ops = NULL;
50+
}
51+
EXPORT_SYMBOL_GPL(UNREGISTER_FUNC);
52+
#endif

0 commit comments

Comments
 (0)