Skip to content

Commit f30f6ed

Browse files
arighi authored and sforshee committed
sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
BugLink: https://bugs.launchpad.net/bugs/2150671

On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting different per-core frequencies), the wakeup path uses select_idle_capacity() and prioritizes idle CPUs with higher capacity for better task placement. However, when those CPUs belong to SMT cores, their effective capacity can be much lower than the nominal capacity when the sibling thread is busy: SMT siblings compete for shared resources, so a "high capacity" CPU that is idle but whose sibling is busy does not deliver its full capacity. This effective capacity reduction cannot be modeled by the static capacity value alone.

Introduce SMT awareness in the asym-capacity idle selection policy: when SMT is active, always prefer fully-idle SMT cores over partially-idle ones.

Prioritizing fully-idle SMT cores yields better task placement because the effective capacity of partially-idle SMT cores is reduced; always preferring fully-idle cores when available leads to more accurate capacity usage on task wakeup.

On an SMT system with asymmetric CPU capacities (NVIDIA Vera Rubin), SMT-aware idle selection has been shown to improve throughput by around 15-18% over NO_ASYM mainline and by around 60% over ASYM mainline, for CPU-bound workloads (NVBLAS) running a number of tasks equal to the number of SMT cores.

Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260511142502.3873984-1-arighi@nvidia.com
(cherry picked from commit 25a32e4 linux-next)
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Seth Forshee <sforshee@nvidia.com>
Acked-by: Nirmoy Das <nirmoyd@nvidia.com>
Acked-by: Matthew R. Ochs <mochs@nvidia.com>
Signed-off-by: Seth Forshee <sforshee@nvidia.com>
1 parent 1b42a98 commit f30f6ed

1 file changed

Lines changed: 114 additions & 6 deletions

File tree

kernel/sched/fair.c

Lines changed: 114 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -7805,6 +7805,54 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
78057805
return idle_cpu;
78067806
}
78077807

7808+
/*
7809+
* Idle-capacity scan converts util_fits_cpu() outcomes into preference ranks,
7810+
* where lower values indicate a better fit - see select_idle_capacity().
7811+
*
7812+
* A CPU that both fits the task and sits on a fully-idle SMT core is returned
7813+
* immediately and is never assigned one of these ranks. On !SMT every CPU is
7814+
* its own "core", so the early return covers all fits-and-idle cases and only
7815+
* the core-tier ranks below are assigned; the thread-tier ranks are unreachable.
7816+
*
7817+
* Rank Val Tier Meaning
7818+
* ------------------------------ --- ------ ---------------------------
7819+
* ASYM_IDLE_UCLAMP_MISFIT -4 core Idle core; capacity fits
7820+
* util but uclamp_min misses.
7821+
* ASYM_IDLE_COMPLETE_MISFIT -3 core Idle core; capacity does
7822+
* not fit. Still beats every
7823+
* thread-tier rank: a busy
7824+
* sibling cuts effective
7825+
* capacity more than a
7826+
* misfit hurts a quiet core.
7827+
* ASYM_IDLE_THREAD_FITS -2 thread Busy SMT sibling; capacity
7828+
* fits util + uclamp.
7829+
* ASYM_IDLE_THREAD_UCLAMP_MISFIT -1 thread Busy SMT sibling; capacity
7830+
* fits but uclamp_min misses
7831+
* (native util_fits_cpu()
7832+
* return value).
7833+
* ASYM_IDLE_THREAD_MISFIT 0 thread Busy SMT sibling; capacity
7834+
* does not fit.
7835+
*
7836+
* ASYM_IDLE_CORE_BIAS (-3) is an offset, not a state. On an idle core,
7837+
* fits += ASYM_IDLE_CORE_BIAS rebases thread-tier ranks into the core tier:
7838+
*
7839+
* ASYM_IDLE_THREAD_UCLAMP_MISFIT (-1) + BIAS -> ASYM_IDLE_UCLAMP_MISFIT (-4)
7840+
* ASYM_IDLE_THREAD_MISFIT (0) + BIAS -> ASYM_IDLE_COMPLETE_MISFIT (-3)
7841+
*
7842+
* ASYM_IDLE_THREAD_FITS (-2) is never rebased because a fully-fitting idle-core
7843+
* candidate early-returns from select_idle_capacity().
7844+
*/
7845+
enum asym_fits_state {
7846+
ASYM_IDLE_UCLAMP_MISFIT = -4, /* idle core; capacity fits util, uclamp_min misses */
7847+
ASYM_IDLE_COMPLETE_MISFIT, /* -3 (implicit): idle core; capacity does not fit */
7848+
ASYM_IDLE_THREAD_FITS, /* -2 (implicit): busy sibling; fits util + uclamp */
7849+
ASYM_IDLE_THREAD_UCLAMP_MISFIT, /* -1 (implicit): busy sibling; uclamp_min misses */
7850+
ASYM_IDLE_THREAD_MISFIT, /* 0 (implicit): busy sibling; capacity does not fit */
7851+
7852+
/* util_fits_cpu() bias for idle core: an offset added to fits, not a rank (same value as ASYM_IDLE_COMPLETE_MISFIT) */
7853+
ASYM_IDLE_CORE_BIAS = -3,
7854+
};
7855+
78087856
/*
78097857
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
78107858
* the task fits. If no CPU is big enough, but there are idle ones, try to
@@ -7813,8 +7861,14 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
78137861
static int
78147862
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
78157863
{
7864+
/*
7865+
* On !SMT systems, has_idle_core is always false and preferred_core
7866+
* is always true (CPU == core), so the SMT preference logic below
7867+
* collapses to the plain capacity scan.
7868+
*/
7869+
bool has_idle_core = sched_smt_active() && test_idle_cores(target);
78167870
unsigned long task_util, util_min, util_max, best_cap = 0;
7817-
int fits, best_fits = 0;
7871+
int fits, best_fits = ASYM_IDLE_THREAD_MISFIT;
78187872
int cpu, best_cpu = -1;
78197873
struct cpumask *cpus;
78207874

@@ -7826,25 +7880,56 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
78267880
util_max = uclamp_eff_value(p, UCLAMP_MAX);
78277881

78287882
for_each_cpu_wrap(cpu, cpus, target) {
7883+
bool preferred_core = !has_idle_core || is_core_idle(cpu);
78297884
unsigned long cpu_cap = capacity_of(cpu);
78307885

78317886
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
78327887
continue;
78337888

78347889
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
78357890

7836-
/* This CPU fits with all requirements */
7837-
if (fits > 0)
7891+
/*
7892+
* Perfect fit: capacity satisfies util + uclamp and the CPU
7893+
* sits on a fully-idle SMT core, this is a !SMT system, or
7894+
* there is no idle core to find.
7895+
* Short-circuit the rank-based selection and return
7896+
* immediately.
7897+
*/
7898+
if (fits > 0 && preferred_core)
78387899
return cpu;
78397900
/*
78407901
* Only the min performance hint (i.e. uclamp_min) doesn't fit.
78417902
* Look for the CPU with best capacity.
78427903
*/
78437904
else if (fits < 0)
78447905
cpu_cap = get_actual_cpu_capacity(cpu);
7906+
/*
7907+
* fits > 0 implies we are not on a preferred core, but the util
7908+
* fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
7909+
* so the effective range becomes
7910+
* [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_THREAD_MISFIT], where:
7911+
* ASYM_IDLE_THREAD_MISFIT - does not fit
7912+
* ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
7913+
* ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
7914+
*/
7915+
else if (fits > 0)
7916+
fits = ASYM_IDLE_THREAD_FITS;
78457917

78467918
/*
7847-
* First, select CPU which fits better (-1 being better than 0).
7919+
* If we are on a preferred core, translate the range of fits
7920+
* of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_THREAD_MISFIT] to
7921+
* [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT].
7922+
* This ensures that an idle core is always given priority over
7923+
* (partially) busy core.
7924+
*
7925+
* A fully fitting idle core would have returned early and hence
7926+
* fits > 0 for preferred_core need not be dealt with.
7927+
*/
7928+
if (preferred_core)
7929+
fits += ASYM_IDLE_CORE_BIAS;
7930+
7931+
/*
7932+
* First, select CPU which fits better (lower is more preferred).
78487933
* Then, select the one with best capacity at same level.
78497934
*/
78507935
if ((fits < best_fits) ||
@@ -7855,6 +7940,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
78557940
}
78567941
}
78577942

7943+
/*
7944+
* A value in the [ASYM_IDLE_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT]
7945+
* range means the chosen CPU is in a fully idle SMT core. Values above
7946+
* ASYM_IDLE_COMPLETE_MISFIT mean we never ranked such a CPU best.
7947+
*
7948+
* The asym-capacity wakeup path returns from select_idle_sibling()
7949+
* after this function and never runs select_idle_cpu(), so the usual
7950+
* select_idle_cpu() tail that clears idle cores must live here when the
7951+
* idle-core preference did not win.
7952+
*/
7953+
if (has_idle_core && best_fits > ASYM_IDLE_COMPLETE_MISFIT)
7954+
set_idle_cores(target, false);
7955+
78587956
return best_cpu;
78597957
}
78607958

@@ -7863,12 +7961,22 @@ static inline bool asym_fits_cpu(unsigned long util,
78637961
unsigned long util_max,
78647962
int cpu)
78657963
{
7866-
if (sched_asym_cpucap_active())
7964+
if (sched_asym_cpucap_active()) {
78677965
/*
78687966
* Return true only if the cpu fully fits the task requirements
78697967
* which include the utilization and the performance hints.
7968+
*
7969+
* When SMT is active, also require that the core has no busy
7970+
* siblings.
7971+
*
7972+
* Note: gating on is_core_idle() also makes the early-bailout
7973+
* candidates in select_idle_sibling() (target, prev,
7974+
* recent_used_cpu) idle-core-aware on ASYM+SMT, which the
7975+
* NO_ASYM path does not do.
78707976
*/
7871-
return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7977+
return (!sched_smt_active() || is_core_idle(cpu)) &&
7978+
(util_fits_cpu(util, util_min, util_max, cpu) > 0);
7979+
}
78727980

78737981
return true;
78747982
}

0 commit comments

Comments
 (0)