Skip to content

Commit 3eb5ca2

Browse files
yhuang-intel opsiff
authored and committed
mm: tune PCP high automatically
[ Upstream commit 51a755c ] Conflict: none The targets of tuning PCP high automatically are as follows, - Minimize allocation/freeing from/to the shared zone - Minimize idle pages in PCP - Minimize pages in PCP if the system has too few free pages To reach these targets, a tuning algorithm is designed as follows, - When we refill PCP via allocating from the zone, increase PCP high, because if we had a larger PCP, we could avoid allocating from the zone. - In the periodic vmstat updating kworker (via refresh_cpu_vm_stats()), decrease PCP high to try to free possible idle PCP pages. - When page reclaiming is active for the zone, stop increasing PCP high in the allocating path, and decrease PCP high and free some pages in the freeing path. So, the PCP high can eventually be tuned to the page allocating/freeing depth of workloads. One issue with the algorithm is that if the number of pages allocated is much larger than the number of pages freed on a CPU, the PCP high may reach the maximal value even if the allocating/freeing depth is small. But this isn't a severe issue, because there are no idle pages in this case. One alternative choice is to increase PCP high when we drain PCP via trying to free pages to the zone, but not to increase PCP high during PCP refilling. This can avoid the issue above. But if the number of pages allocated is much smaller than the number of pages freed on a CPU, there will be many idle pages in PCP and it is hard to free these idle pages. PCP high will be decreased by 1/8 (>> 3) periodically. The value 1/8 is somewhat arbitrary; it is just to make sure that the idle PCP pages will be freed eventually. On a 2-socket Intel server with 224 logical CPUs, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroups. This simulates the kbuild server that is used by the 0-Day kbuild service. With the patch, the build time decreases by 3.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 11.0% to 0.5%. 
The number of PCP drains for high-order page freeing (free_high) decreases by 65.6%. The number of pages allocated from the zone (instead of from PCP) decreases by 83.9%. Intel-SIG: commit 51a755c mm: tune PCP high automatically. Backport Auto-tune per-CPU pageset size. Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" <ying.huang@intel.com> Suggested-by: Mel Gorman <mgorman@techsingularity.net> Suggested-by: Michal Hocko <mhocko@suse.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: David Hildenbrand <david@redhat.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Christoph Lameter <cl@linux.com> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: Sudeep Holla <sudeep.holla@arm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> [ Aubrey Li: amend commit log ] Signed-off-by: Aubrey Li <aubrey.li@linux.intel.com>
1 parent 4173a42 commit 3eb5ca2

3 files changed

Lines changed: 99 additions & 29 deletions

File tree

include/linux/gfp.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -320,6 +320,7 @@ extern void page_frag_free(void *addr);
320320
#define free_page(addr) free_pages((addr), 0)
321321

322322
void page_alloc_init_cpuhp(void);
323+
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
323324
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
324325
void drain_all_pages(struct zone *zone);
325326
void drain_local_pages(struct zone *zone);

mm/page_alloc.c

Lines changed: 94 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -2175,6 +2175,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
21752175
return i;
21762176
}
21772177

2178+
/*
2179+
* Called from the vmstat counter updater to decay the PCP high.
2180+
* Return whether there is additional work to do.
2181+
*/
2182+
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
2183+
{
2184+
int high_min, to_drain, batch;
2185+
int todo = 0;
2186+
2187+
high_min = READ_ONCE(pcp->high_min);
2188+
batch = READ_ONCE(pcp->batch);
2189+
/*
2190+
* Decrease pcp->high periodically to try to free possible
2191+
* idle PCP pages. Also, avoid freeing too many pages to
2192+
* control latency. This caps pcp->high decrement too.
2193+
*/
2194+
if (pcp->high > high_min) {
2195+
pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2196+
pcp->high - (pcp->high >> 3), high_min);
2197+
if (pcp->high > high_min)
2198+
todo++;
2199+
}
2200+
2201+
to_drain = pcp->count - pcp->high;
2202+
if (to_drain > 0) {
2203+
spin_lock(&pcp->lock);
2204+
free_pcppages_bulk(zone, to_drain, pcp, 0);
2205+
spin_unlock(&pcp->lock);
2206+
todo++;
2207+
}
2208+
2209+
return todo;
2210+
}
2211+
21782212
#ifdef CONFIG_NUMA
21792213
/*
21802214
* Called from the vmstat counter updater to drain pagesets of this
@@ -2343,14 +2377,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
23432377
return true;
23442378
}
23452379

2346-
static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
2380+
static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high)
23472381
{
23482382
int min_nr_free, max_nr_free;
2349-
int batch = READ_ONCE(pcp->batch);
23502383

2351-
/* Free everything if batch freeing high-order pages. */
2384+
/* Free as much as possible if batch freeing high-order pages. */
23522385
if (unlikely(free_high))
2353-
return pcp->count;
2386+
return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX);
23542387

23552388
/* Check for PCP disabled or boot pageset */
23562389
if (unlikely(high < batch))
@@ -2365,36 +2398,56 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high)
23652398
* freeing of pages without any allocation.
23662399
*/
23672400
batch <<= pcp->free_factor;
2368-
if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
2401+
if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX)
23692402
pcp->free_factor++;
23702403
batch = clamp(batch, min_nr_free, max_nr_free);
23712404

23722405
return batch;
23732406
}
23742407

23752408
static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
2376-
bool free_high)
2409+
int batch, bool free_high)
23772410
{
2378-
int high = READ_ONCE(pcp->high_min);
2411+
int high, high_min, high_max;
2412+
2413+
high_min = READ_ONCE(pcp->high_min);
2414+
high_max = READ_ONCE(pcp->high_max);
2415+
high = pcp->high = clamp(pcp->high, high_min, high_max);
23792416

2380-
if (unlikely(!high || free_high))
2417+
if (unlikely(!high))
23812418
return 0;
23822419

2383-
if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
2384-
return high;
2420+
if (unlikely(free_high)) {
2421+
pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
2422+
high_min);
2423+
return 0;
2424+
}
23852425

23862426
/*
23872427
* If reclaim is active, limit the number of pages that can be
23882428
* stored on pcp lists
23892429
*/
2390-
return min(READ_ONCE(pcp->batch) << 2, high);
2430+
if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) {
2431+
pcp->high = max(high - (batch << pcp->free_factor), high_min);
2432+
return min(batch << 2, pcp->high);
2433+
}
2434+
2435+
if (pcp->count >= high && high_min != high_max) {
2436+
int need_high = (batch << pcp->free_factor) + batch;
2437+
2438+
/* pcp->high should be large enough to hold batch freed pages */
2439+
if (pcp->high < need_high)
2440+
pcp->high = clamp(need_high, high_min, high_max);
2441+
}
2442+
2443+
return high;
23912444
}
23922445

23932446
static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
23942447
struct page *page, int migratetype,
23952448
unsigned int order)
23962449
{
2397-
int high;
2450+
int high, batch;
23982451
int pindex;
23992452
bool free_high = false;
24002453

@@ -2409,6 +2462,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
24092462
list_add(&page->pcp_list, &pcp->lists[pindex]);
24102463
pcp->count += 1 << order;
24112464

2465+
batch = READ_ONCE(pcp->batch);
24122466
/*
24132467
* As high-order pages other than THP's stored on PCP can contribute
24142468
* to fragmentation, limit the number stored when PCP is heavily
@@ -2419,14 +2473,15 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
24192473
free_high = (pcp->free_factor &&
24202474
(pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
24212475
(!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
2422-
pcp->count >= READ_ONCE(pcp->batch)));
2476+
pcp->count >= READ_ONCE(batch)));
24232477
pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
24242478
} else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
24252479
pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
24262480
}
2427-
high = nr_pcp_high(pcp, zone, free_high);
2481+
high = nr_pcp_high(pcp, zone, batch, free_high);
24282482
if (pcp->count >= high) {
2429-
free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex);
2483+
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
2484+
pcp, pindex);
24302485
}
24312486
}
24322487

@@ -2710,24 +2765,38 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
27102765
return page;
27112766
}
27122767

2713-
static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
2768+
static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order)
27142769
{
2715-
int high, batch, max_nr_alloc;
2770+
int high, base_batch, batch, max_nr_alloc;
2771+
int high_max, high_min;
27162772

2717-
high = READ_ONCE(pcp->high_min);
2718-
batch = READ_ONCE(pcp->batch);
2773+
base_batch = READ_ONCE(pcp->batch);
2774+
high_min = READ_ONCE(pcp->high_min);
2775+
high_max = READ_ONCE(pcp->high_max);
2776+
high = pcp->high = clamp(pcp->high, high_min, high_max);
27192777

27202778
/* Check for PCP disabled or boot pageset */
2721-
if (unlikely(high < batch))
2779+
if (unlikely(high < base_batch))
27222780
return 1;
27232781

2782+
if (order)
2783+
batch = base_batch;
2784+
else
2785+
batch = (base_batch << pcp->alloc_factor);
2786+
27242787
/*
2725-
* Double the number of pages allocated each time there is subsequent
2726-
* allocation of order-0 pages without any freeing.
2788+
* If we had a larger pcp->high, we could avoid allocating from
2789+
* the zone.
27272790
*/
2791+
if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
2792+
high = pcp->high = min(high + batch, high_max);
2793+
27282794
if (!order) {
2729-
max_nr_alloc = max(high - pcp->count - batch, batch);
2730-
batch <<= pcp->alloc_factor;
2795+
max_nr_alloc = max(high - pcp->count - base_batch, base_batch);
2796+
/*
2797+
* Double the number of pages allocated each time there is
2798+
* subsequent allocation of order-0 pages without any freeing.
2799+
*/
27312800
if (batch <= max_nr_alloc &&
27322801
pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
27332802
pcp->alloc_factor++;
@@ -2758,7 +2827,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
27582827

27592828
do {
27602829
if (list_empty(list)) {
2761-
int batch = nr_pcp_alloc(pcp, order);
2830+
int batch = nr_pcp_alloc(pcp, zone, order);
27622831
int alloced;
27632832

27642833
alloced = rmqueue_bulk(zone, order,

mm/vmstat.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -814,9 +814,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
814814

815815
for_each_populated_zone(zone) {
816816
struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
817-
#ifdef CONFIG_NUMA
818817
struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
819-
#endif
820818

821819
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
822820
int v;
@@ -832,10 +830,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
832830
#endif
833831
}
834832
}
835-
#ifdef CONFIG_NUMA
836833

837834
if (do_pagesets) {
838835
cond_resched();
836+
837+
changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
838+
#ifdef CONFIG_NUMA
839839
/*
840840
* Deal with draining the remote pageset of this
841841
* processor
@@ -862,8 +862,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
862862
drain_zone_pages(zone, this_cpu_ptr(pcp));
863863
changes++;
864864
}
865-
}
866865
#endif
866+
}
867867
}
868868

869869
for_each_online_pgdat(pgdat) {

0 commit comments

Comments
 (0)