Skip to content

Commit f1c92bd

Browse files
committed
NVIDIA: SAUCE: sched/fair: Prefer fully-idle SMT cores in asym-capacity idle selection
On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting different per-core frequencies), the wakeup path uses select_idle_capacity() and prioritizes idle CPUs with higher capacity for better task placement.

However, when those CPUs belong to SMT cores, their effective capacity can be much lower than the nominal capacity when the sibling thread is busy: SMT siblings compete for shared resources, so a "high capacity" CPU that is idle but whose sibling is busy does not deliver its full capacity. This effective capacity reduction cannot be modeled by the static capacity value alone.

Introduce SMT awareness in the asym-capacity idle selection policy: when SMT is active, always prefer fully-idle SMT cores over partially-idle ones.

Prioritizing fully-idle SMT cores yields better task placement because the effective capacity of partially-idle SMT cores is reduced; always preferring fully-idle cores when they are available leads to more accurate capacity usage on task wakeup.

On an SMT system with asymmetric CPU capacities, SMT-aware idle selection has been shown to improve throughput by around 15-18% for CPU-bound workloads when running a number of tasks equal to the number of SMT cores.

Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Koba Ko <kobak@nvidia.com>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
(cherry picked from https://lore.kernel.org/all/20260428051720.3180182-1-arighi@nvidia.com)
Signed-off-by: Andrea Righi <arighi@nvidia.com>
1 parent be61197 commit f1c92bd

1 file changed

Lines changed: 65 additions & 5 deletions

File tree

kernel/sched/fair.c

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7615,6 +7615,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
76157615
return idle_cpu;
76167616
}
76177617

7618+
/*
7619+
* Idle-capacity scan ranks transformed util_fits_cpu() outcomes; lower values
7620+
* are more preferred (see select_idle_capacity()).
7621+
*/
7622+
enum asym_fits_state {
7623+
/* Ranks in descending order of preference: the lowest value is the most preferred. */
7624+
ASYM_IDLE_CORE_UCLAMP_MISFIT = -4, /* preferred core; only uclamp_min does not fit */
7625+
ASYM_IDLE_CORE_COMPLETE_MISFIT, /* -3: preferred core; task does not fit */
7626+
ASYM_IDLE_THREAD_FITS, /* -2: task fits, but the CPU is not on a preferred core */
7627+
ASYM_IDLE_THREAD_UCLAMP_MISFIT, /* -1: not a preferred core; only uclamp_min does not fit */
7628+
ASYM_IDLE_COMPLETE_MISFIT, /* 0: not a preferred core; task does not fit (initial best_fits) */
7629+
7630+
/*
 * Not a rank: the bias select_idle_capacity() adds to fits for a CPU on a
 * preferred core, shifting the [-1, 0] misfit range to [-4, -3]. It is also
 * the threshold above which the best CPU found was not on a preferred core.
 */
7631+
ASYM_IDLE_CORE_BIAS = -3,
7632+
};
7633+
76187634
/*
76197635
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
76207636
* the task fits. If no CPU is big enough, but there are idle ones, try to
@@ -7623,8 +7639,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
76237639
static int
76247640
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
76257641
{
7642+
bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
76267643
unsigned long task_util, util_min, util_max, best_cap = 0;
7627-
int fits, best_fits = 0;
7644+
int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT;
76287645
int cpu, best_cpu = -1;
76297646
struct cpumask *cpus;
76307647

@@ -7636,6 +7653,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
76367653
util_max = uclamp_eff_value(p, UCLAMP_MAX);
76377654

76387655
for_each_cpu_wrap(cpu, cpus, target) {
7656+
bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
76397657
unsigned long cpu_cap = capacity_of(cpu);
76407658

76417659
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7644,17 +7662,41 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
76447662
fits = util_fits_cpu(task_util, util_min, util_max, cpu);
76457663

76467664
/* This CPU fits with all requirements */
7647-
if (fits > 0)
7665+
if (fits > 0 && preferred_core)
76487666
return cpu;
76497667
/*
76507668
* Only the min performance hint (i.e. uclamp_min) doesn't fit.
76517669
* Look for the CPU with best capacity.
76527670
*/
76537671
else if (fits < 0)
76547672
cpu_cap = get_actual_cpu_capacity(cpu);
7673+
/*
7674+
* fits > 0 implies we are not on a preferred core
7675+
* but the util fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
7676+
* so the effective range becomes [ASYM_IDLE_THREAD_FITS,
7677+
* ASYM_IDLE_COMPLETE_MISFIT] where:
7678+
* ASYM_IDLE_COMPLETE_MISFIT - does not fit
7679+
* ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
7680+
* ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
7681+
*/
7682+
else if (fits > 0)
7683+
fits = ASYM_IDLE_THREAD_FITS;
7684+
7685+
/*
7686+
* If we are on a preferred core, translate the range of fits
7687+
* of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] to
7688+
* [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT].
7689+
* This ensures that an idle core is always given priority over
7690+
* a (partially) busy core.
7691+
*
7692+
* A fully fitting idle core would have returned early and hence
7693+
* fits > 0 for preferred_core need not be dealt with.
7694+
*/
7695+
if (preferred_core)
7696+
fits += ASYM_IDLE_CORE_BIAS;
76557697

76567698
/*
7657-
* First, select CPU which fits better (-1 being better than 0).
7699+
* First, select CPU which fits better (lower is more preferred).
76587700
* Then, select the one with best capacity at same level.
76597701
*/
76607702
if ((fits < best_fits) ||
@@ -7665,6 +7707,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
76657707
}
76667708
}
76677709

7710+
/*
7711+
* A value in the [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_BIAS]
7712+
* range means the chosen CPU is in a fully idle SMT core. Values above
7713+
* ASYM_IDLE_CORE_BIAS mean we never ranked such a CPU best.
7714+
*
7715+
* The asym-capacity wakeup path returns from select_idle_sibling()
7716+
* after this function and never runs select_idle_cpu(), so the usual
7717+
* select_idle_cpu() tail that clears idle cores must live here when the
7718+
* idle-core preference did not win.
7719+
*/
7720+
if (prefers_idle_core && best_fits > ASYM_IDLE_CORE_BIAS)
7721+
set_idle_cores(target, false);
7722+
76687723
return best_cpu;
76697724
}
76707725

@@ -7673,12 +7728,17 @@ static inline bool asym_fits_cpu(unsigned long util,
76737728
unsigned long util_max,
76747729
int cpu)
76757730
{
7676-
if (sched_asym_cpucap_active())
7731+
if (sched_asym_cpucap_active()) {
76777732
/*
76787733
* Return true only if the cpu fully fits the task requirements
76797734
* which include the utilization and the performance hints.
7735+
*
7736+
* When SMT is active, also require that the core has no busy
7737+
* siblings.
76807738
*/
7681-
return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7739+
return (!sched_smt_active() || is_core_idle(cpu)) &&
7740+
(util_fits_cpu(util, util_min, util_max, cpu) > 0);
7741+
}
76827742

76837743
return true;
76847744
}

0 commit comments

Comments
 (0)