Skip to content

Commit f52b5c9

Browse files
P33Mpelwell
authored and committed
nvme-pci: manually allocate Host Memory Buffer segments on arm64
The Host Memory Buffer allocation algorithm interacts badly with arm64 platforms with no IOMMU for PCIe devices, such as BCM2711/BCM2712. The discontiguous VA allocation in nvme_host_mem_alloc_single() always fails, so nvme_alloc_host_mem_multi() falls back to the DMA coherent allocation scheme. On arm64, this will come out of CMA by default. Recent DRAM-less SSDs will request significant amounts of host memory - up to 128MB. As NVMe devices are set up early in boot, CMA is mostly-free so it ends up being claimed by a driver using it for opaque device-exclusive buffers. The divide-and-conquer allocation strategy also paradoxically results in increased CMA pressure if portions are already reserved. PCIe NVMe controllers implement a variably-sized HMB descriptor table, typically ranging from 32 to 256 entries in size. Therefore, aside from implementation-specific costs in the controller doing more granular look-ups, providing smaller orders is acceptable. Failing to provide a HMB does not prevent the controller from functioning. Create an alternate implementation for arm64 that creates a scatterlist and directly assigns contiguous pages from the buddy allocator, retrying with smaller orders on failure. This will avoid CMA by default. Signed-off-by: Jonathan Bell <jonathan@raspberrypi.com>
1 parent 9706f4d commit f52b5c9

1 file changed

Lines changed: 98 additions & 0 deletions

File tree

drivers/nvme/host/pci.c

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,11 @@ struct nvme_dev {
183183
/* host memory buffer support: */
184184
u64 host_mem_size;
185185
u32 nr_host_mem_descs;
186+
u32 nr_sgl_ents;
186187
u32 host_mem_descs_size;
187188
dma_addr_t host_mem_descs_dma;
188189
struct nvme_host_mem_buf_desc *host_mem_descs;
190+
struct scatterlist *host_mem_sgl;
189191
void **host_mem_desc_bufs;
190192
unsigned int nr_allocated_queues;
191193
unsigned int nr_write_queues;
@@ -2300,6 +2302,13 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
23002302
return ret;
23012303
}
23022304

2305+
#if IS_ENABLED(CONFIG_ARM64)
2306+
static void nvme_free_host_mem_multi(struct nvme_dev *dev)
2307+
{
2308+
dma_unmap_sg(dev->dev, dev->host_mem_sgl, dev->nr_host_mem_descs, DMA_FROM_DEVICE);
2309+
sgl_free(dev->host_mem_sgl);
2310+
}
2311+
#else
23032312
static void nvme_free_host_mem_multi(struct nvme_dev *dev)
23042313
{
23052314
int i;
@@ -2316,6 +2325,7 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev)
23162325
kfree(dev->host_mem_desc_bufs);
23172326
dev->host_mem_desc_bufs = NULL;
23182327
}
2328+
#endif
23192329

23202330
static void nvme_free_host_mem(struct nvme_dev *dev)
23212331
{
@@ -2358,6 +2368,93 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
23582368
return 0;
23592369
}
23602370

2371+
#if IS_ENABLED(CONFIG_ARM64)
2372+
static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
2373+
u32 chunk_size)
2374+
{
2375+
struct nvme_host_mem_buf_desc *descs;
2376+
u32 max_entries, len, descs_size;
2377+
dma_addr_t descs_dma;
2378+
struct scatterlist *slist;
2379+
struct page *page;
2380+
int i = 0, mapped_nents;
2381+
u64 size, tmp;
2382+
2383+
tmp = (preferred + chunk_size - 1);
2384+
do_div(tmp, chunk_size);
2385+
max_entries = tmp;
2386+
2387+
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
2388+
max_entries = dev->ctrl.hmmaxd;
2389+
2390+
descs_size = max_entries * sizeof(*descs);
2391+
/*
2392+
* Allocate the descriptor table from coherent memory -
2393+
* usually occupies less than/up to a single page.
2394+
*/
2395+
descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
2396+
GFP_KERNEL);
2397+
if (!descs)
2398+
goto out;
2399+
2400+
slist = kcalloc(max_entries, sizeof(struct scatterlist), GFP_KERNEL);
2401+
if (!slist)
2402+
goto out_free_descs;
2403+
2404+
sg_init_table(slist, max_entries);
2405+
2406+
dev_dbg(dev->dev, "Allocating HMB pref = %llu max_entries = %u\n",
2407+
preferred, max_entries);
2408+
2409+
for (size = 0; size < preferred && i < max_entries; size += len) {
2410+
int order;
2411+
2412+
len = min_t(u64, chunk_size, preferred - size);
2413+
order = get_order(len);
2414+
page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, order);
2415+
if (!page)
2416+
break;
2417+
sg_set_page(&slist[i], page, len, 0);
2418+
i++;
2419+
}
2420+
if (size < preferred)
2421+
goto out_free_sgl;
2422+
2423+
mapped_nents = dma_map_sg(dev->dev, slist, i, DMA_FROM_DEVICE);
2424+
if (mapped_nents <= 0)
2425+
goto out_free_pages;
2426+
2427+
/* Flush in case the CPU has cached any parts of the DMA buffers */
2428+
dma_sync_sg_for_device(dev->dev, slist, i, DMA_FROM_DEVICE);
2429+
2430+
i = dev->nr_host_mem_descs = mapped_nents;
2431+
2432+
while (--i >= 0) {
2433+
descs[i].addr = sg_dma_address(&slist[i]);
2434+
WARN_ON_ONCE(sg_dma_len(&slist[i]) & (NVME_CTRL_PAGE_SIZE - 1));
2435+
descs[i].size = sg_dma_len(&slist[i]) / NVME_CTRL_PAGE_SIZE;
2436+
}
2437+
2438+
dev->host_mem_size = size;
2439+
dev->host_mem_descs = descs;
2440+
dev->host_mem_descs_dma = descs_dma;
2441+
dev->host_mem_descs_size = descs_size;
2442+
dev->host_mem_sgl = slist;
2443+
return 0;
2444+
2445+
out_free_pages:
2446+
/* Don't use mapped_nents here as it could be incomplete */
2447+
while (--i >= 0)
2448+
__free_pages(sg_page(&slist[i]), get_order(slist[i].length));
2449+
out_free_sgl:
2450+
kfree(slist);
2451+
out_free_descs:
2452+
dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
2453+
out:
2454+
dev->host_mem_descs = NULL;
2455+
return -ENOMEM;
2456+
}
2457+
#else
23612458
static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
23622459
u32 chunk_size)
23632460
{
@@ -2418,6 +2515,7 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
24182515
dev->host_mem_descs = NULL;
24192516
return -ENOMEM;
24202517
}
2518+
#endif
24212519

24222520
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
24232521
{

0 commit comments

Comments
 (0)