Skip to content

Commit 582cb2a

Browse files
P33M authored and pelwell committed
nvme-pci: manually allocate Host Memory Buffer segments on arm64
The Host Memory Buffer allocation algorithm interacts badly with arm64 platforms with no IOMMU for PCIe devices, such as BCM2711/BCM2712. The discontiguous VA allocation in nvme_host_mem_alloc_single() always fails, so nvme_alloc_host_mem_multi() falls back to the DMA coherent allocation scheme. On arm64, this will come out of CMA by default. Recent DRAM-less SSDs will request significant amounts of host memory - up to 128MB. As NVMe devices are set up early in boot, CMA is mostly-free so it ends up being claimed by a driver using it for opaque device-exclusive buffers. The divide-and-conquer allocation strategy also paradoxically results in increased CMA pressure if portions are already reserved. PCIe NVMe controllers implement a variably-sized HMB descriptor table, typically ranging from 32 to 256 entries in size. Therefore, aside from implementation-specific costs in the controller doing more granular look-ups, providing smaller orders is acceptable. Failing to provide a HMB does not prevent the controller from functioning. Create an alternate implementation for arm64 that creates a scatterlist and directly assigns contiguous pages from the buddy allocator, retrying with smaller orders on failure. This will avoid CMA by default. Signed-off-by: Jonathan Bell <jonathan@raspberrypi.com>
1 parent 50e6e32 commit 582cb2a

1 file changed

Lines changed: 98 additions & 0 deletions

File tree

drivers/nvme/host/pci.c

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,11 @@ struct nvme_dev {
183183
/* host memory buffer support: */
184184
u64 host_mem_size;
185185
u32 nr_host_mem_descs;
186+
u32 nr_sgl_ents;
186187
u32 host_mem_descs_size;
187188
dma_addr_t host_mem_descs_dma;
188189
struct nvme_host_mem_buf_desc *host_mem_descs;
190+
struct scatterlist *host_mem_sgl;
189191
void **host_mem_desc_bufs;
190192
unsigned int nr_allocated_queues;
191193
unsigned int nr_write_queues;
@@ -2371,6 +2373,13 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
23712373
return ret;
23722374
}
23732375

2376+
#if IS_ENABLED(CONFIG_ARM64)
2377+
static void nvme_free_host_mem_multi(struct nvme_dev *dev)
2378+
{
2379+
dma_unmap_sg(dev->dev, dev->host_mem_sgl, dev->nr_host_mem_descs, DMA_FROM_DEVICE);
2380+
sgl_free(dev->host_mem_sgl);
2381+
}
2382+
#else
23742383
static void nvme_free_host_mem_multi(struct nvme_dev *dev)
23752384
{
23762385
int i;
@@ -2387,6 +2396,7 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev)
23872396
kfree(dev->host_mem_desc_bufs);
23882397
dev->host_mem_desc_bufs = NULL;
23892398
}
2399+
#endif
23902400

23912401
static void nvme_free_host_mem(struct nvme_dev *dev)
23922402
{
@@ -2429,6 +2439,93 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
24292439
return 0;
24302440
}
24312441

2442+
#if IS_ENABLED(CONFIG_ARM64)
2443+
static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
2444+
u32 chunk_size)
2445+
{
2446+
struct nvme_host_mem_buf_desc *descs;
2447+
u32 max_entries, len, descs_size;
2448+
dma_addr_t descs_dma;
2449+
struct scatterlist *slist;
2450+
struct page *page;
2451+
int i = 0, mapped_nents;
2452+
u64 size, tmp;
2453+
2454+
tmp = (preferred + chunk_size - 1);
2455+
do_div(tmp, chunk_size);
2456+
max_entries = tmp;
2457+
2458+
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
2459+
max_entries = dev->ctrl.hmmaxd;
2460+
2461+
descs_size = max_entries * sizeof(*descs);
2462+
/*
2463+
* Allocate the descriptor table from coherent memory -
2464+
* usually occupies less than/up to a single page.
2465+
*/
2466+
descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
2467+
GFP_KERNEL);
2468+
if (!descs)
2469+
goto out;
2470+
2471+
slist = kcalloc(max_entries, sizeof(struct scatterlist), GFP_KERNEL);
2472+
if (!slist)
2473+
goto out_free_descs;
2474+
2475+
sg_init_table(slist, max_entries);
2476+
2477+
dev_dbg(dev->dev, "Allocating HMB pref = %llu max_entries = %u\n",
2478+
preferred, max_entries);
2479+
2480+
for (size = 0; size < preferred && i < max_entries; size += len) {
2481+
int order;
2482+
2483+
len = min_t(u64, chunk_size, preferred - size);
2484+
order = get_order(len);
2485+
page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, order);
2486+
if (!page)
2487+
break;
2488+
sg_set_page(&slist[i], page, len, 0);
2489+
i++;
2490+
}
2491+
if (size < preferred)
2492+
goto out_free_sgl;
2493+
2494+
mapped_nents = dma_map_sg(dev->dev, slist, i, DMA_FROM_DEVICE);
2495+
if (mapped_nents <= 0)
2496+
goto out_free_pages;
2497+
2498+
/* Flush in case the CPU has cached any parts of the DMA buffers */
2499+
dma_sync_sg_for_device(dev->dev, slist, i, DMA_FROM_DEVICE);
2500+
2501+
i = dev->nr_host_mem_descs = mapped_nents;
2502+
2503+
while (--i >= 0) {
2504+
descs[i].addr = sg_dma_address(&slist[i]);
2505+
WARN_ON_ONCE(sg_dma_len(&slist[i]) & (NVME_CTRL_PAGE_SIZE - 1));
2506+
descs[i].size = sg_dma_len(&slist[i]) / NVME_CTRL_PAGE_SIZE;
2507+
}
2508+
2509+
dev->host_mem_size = size;
2510+
dev->host_mem_descs = descs;
2511+
dev->host_mem_descs_dma = descs_dma;
2512+
dev->host_mem_descs_size = descs_size;
2513+
dev->host_mem_sgl = slist;
2514+
return 0;
2515+
2516+
out_free_pages:
2517+
/* Don't use mapped_nents here as it could be incomplete */
2518+
while (--i >= 0)
2519+
__free_pages(sg_page(&slist[i]), get_order(slist[i].length));
2520+
out_free_sgl:
2521+
kfree(slist);
2522+
out_free_descs:
2523+
dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
2524+
out:
2525+
dev->host_mem_descs = NULL;
2526+
return -ENOMEM;
2527+
}
2528+
#else
24322529
static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
24332530
u32 chunk_size)
24342531
{
@@ -2489,6 +2586,7 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
24892586
dev->host_mem_descs = NULL;
24902587
return -ENOMEM;
24912588
}
2589+
#endif
24922590

24932591
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
24942592
{

0 commit comments

Comments
 (0)