From 55ee0d2c62b38fb8075105600be2be2d397c5933 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Mon, 20 Apr 2026 09:24:01 +0200
Subject: [PATCH 1/4] NVIDIA: VR: SAUCE: sched/fair: Attach sched_domain_shared
 to sd_asym_cpucapacity

BugLink: https://bugs.launchpad.net/bugs/2150671

On asymmetric CPU capacity systems, the wakeup path uses
select_idle_capacity(), which scans the span of sd_asym_cpucapacity
rather than sd_llc.

The has_idle_cores hint however lives on sd_llc->shared, so the
wakeup-time read of has_idle_cores operates on an LLC-scoped blob while
the actual scan/decision spans the wider asym domain; nr_busy_cpus also
lives in the same shared sched_domain data, but it's never used in the
asym CPU capacity scenario.

Therefore, move the sched_domain_shared object to sd_asym_cpucapacity
whenever the CPU has a SD_ASYM_CPUCAPACITY_FULL ancestor and that
ancestor is non-overlapping (i.e., not built from SD_NUMA). In that case
the scope of has_idle_cores matches the scope of the wakeup scan.

Fall back to attaching the shared object to sd_llc in three cases:

  1) plain symmetric systems (no SD_ASYM_CPUCAPACITY_FULL anywhere);

  2) CPUs in an exclusive cpuset that carves out a symmetric capacity
     island: has_asym is system-wide but those CPUs have no
     SD_ASYM_CPUCAPACITY_FULL ancestor in their hierarchy and follow
     the symmetric LLC path in select_idle_sibling();

  3) exotic topologies where SD_ASYM_CPUCAPACITY_FULL lands on an
     SD_NUMA-built domain. init_sched_domain_shared() keys the shared
     blob off cpumask_first(span), which on overlapping NUMA domains
     would alias unrelated spans onto the same blob. Keep the shared
     object on the LLC there; select_idle_capacity() gracefully skips
     the has_idle_cores preference when sd->shared is NULL.

While at it, also rename the per-CPU sd_llc_shared to sd_balance_shared,
as it is no longer strictly tied to the LLC.

Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
(backported from https://lore.kernel.org/all/20260428051720.3180182-1-arighi@nvidia.com)
[ arighi:
   - backport full logic to attach sd->shared in build_sched_domains()
   - do not rename sd_llc_shared to reduce the risk of conflicts ]
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c     |   6 ++-
 kernel/sched/topology.c | 101 +++++++++++++++++++++++++++++++++-------
 2 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab4114712be74..bd2c6ebc85e7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12617,7 +12617,8 @@ static void set_cpu_sd_state_busy(int cpu)
 		goto unlock;
 	sd->nohz_idle = 0;
 
-	atomic_inc(&sd->shared->nr_busy_cpus);
+	if (sd->shared)
+		atomic_inc(&sd->shared->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -12646,7 +12647,8 @@ static void set_cpu_sd_state_idle(int cpu)
 		goto unlock;
 	sd->nohz_idle = 1;
 
-	atomic_dec(&sd->shared->nr_busy_cpus);
+	if (sd->shared)
+		atomic_dec(&sd->shared->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82d..7bc2d13b3bf57 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -680,16 +680,38 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
+	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+	/*
+	 * The shared object is attached to sd_asym_cpucapacity only when the
+	 * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+	 * On overlapping (NUMA) asym domains we fall back to letting the
+	 * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+	 * here.
+	 */
+	if (sd && sd->shared)
+		sds = sd->shared;
+
+	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
 	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		sds = sd->shared;
+
+		/*
+		 * If sd_asym_cpucapacity didn't claim the shared object,
+		 * sd_llc must have one linked.
+		 */
+		if (!sds) {
+			WARN_ON_ONCE(!sd->shared);
+			sds = sd->shared;
+		}
 	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
+
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -708,9 +730,6 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
-	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
-	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -1640,7 +1659,7 @@ sd_init(struct sched_domain_topology_level *tl,
 {
 	struct sd_data *sdd = &tl->data;
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
-	int sd_id, sd_weight, sd_flags = 0;
+	int sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
 
 	sd_weight = cpumask_weight(tl->mask(tl, cpu));
@@ -1688,7 +1707,6 @@ sd_init(struct sched_domain_topology_level *tl,
 
 	sd_span = sched_domain_span(sd);
 	cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
-	sd_id = cpumask_first(sd_span);
 
 	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
 
@@ -1727,16 +1745,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->cache_nice_tries = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_LLC) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
-
 	sd->private = sdd;
 
 	return sd;
@@ -2548,6 +2556,16 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	return true;
 }
 
+static void init_sched_domain_shared(struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+	int sd_id = cpumask_first(sched_domain_span(sd));
+
+	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+	atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+	atomic_inc(&sd->shared->ref);
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2605,6 +2623,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain *sd_asym = NULL;
+		bool asym_claimed = false;
+
+		sd = *per_cpu_ptr(d.sd, i);
+		if (!sd)
+			continue;
+
+		/*
+		 * In case of ASYM_CPUCAPACITY, attach sd->shared to
+		 * sd_asym_cpucapacity for wakeup stat tracking.
+		 *
+		 * Caveats:
+		 *
+		 * 1) has_asym is system-wide, but a given CPU may still
+		 *    lack an SD_ASYM_CPUCAPACITY_FULL ancestor (e.g., an
+		 *    exclusive cpuset carving out a symmetric capacity island).
+		 *    Such CPUs must fall through to the LLC seeding path below.
+		 *
+		 * 2) Skip the asym attach if the asym ancestor is an
+		 *    overlapping domain (SD_NUMA). On those topologies let the
+		 *    LLC path own the shared object instead.
+		 *
+		 * XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain
+		 * always has more than one group else it is prone to
+		 * degeneration.
+		 */
+		sd_asym = sd;
+		while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+			sd_asym = sd_asym->parent;
+
+		if (sd_asym && !(sd_asym->flags & SD_NUMA)) {
+			init_sched_domain_shared(sd_asym);
+			asym_claimed = true;
+		}
+
+		/* First, find the topmost SD_SHARE_LLC domain */
+		sd = *per_cpu_ptr(d.sd, i);
+		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+			sd = sd->parent;
+
+		if (sd->flags & SD_SHARE_LLC) {
+			/*
+			 * Initialize the sd->shared for SD_SHARE_LLC unless
+			 * the asym path above already claimed it.
+			 */
+			if (!asym_claimed)
+				init_sched_domain_shared(sd);
+		}
+	}
+
 	/*
 	 * Calculate an allowed NUMA imbalance such that LLCs do not get
 	 * imbalanced.

From 42459d49ffef39bf26029faf7af816c737ddf1df Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 11 Mar 2026 18:43:19 +0100
Subject: [PATCH 2/4] NVIDIA: VR: SAUCE: sched/fair: Prefer fully-idle SMT
 cores in asym-capacity idle selection

BugLink: https://bugs.launchpad.net/bugs/2150671

On systems with asymmetric CPU capacity (e.g., ACPI/CPPC reporting
different per-core frequencies), the wakeup path uses
select_idle_capacity() and prioritizes idle CPUs with higher capacity
for better task placement. However, when those CPUs belong to SMT cores,
their effective capacity can be much lower than the nominal capacity
when the sibling thread is busy: SMT siblings compete for shared
resources, so a "high capacity" CPU that is idle but whose sibling is
busy does not deliver its full capacity. This effective capacity
reduction cannot be modeled by the static capacity value alone.

Introduce SMT awareness in the asym-capacity idle selection policy: when
SMT is active, always prefer fully-idle SMT cores over partially-idle
ones.

Prioritizing fully-idle SMT cores yields better task placement because
the effective capacity of partially-idle SMT cores is reduced; always
preferring fully-idle cores when available leads to more accurate
capacity usage on task wakeup.

On an SMT system with asymmetric CPU capacities, SMT-aware idle
selection has been shown to improve throughput by around 15-18% for
CPU-bound workloads when running a number of tasks equal to the number
of SMT cores.

Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Koba Ko <kobak@nvidia.com>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
(cherry picked from https://lore.kernel.org/all/20260428051720.3180182-1-arighi@nvidia.com)
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c | 70 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 65 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bd2c6ebc85e7b..e4393d393f70d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7762,6 +7762,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	return idle_cpu;
 }
 
+/*
+ * Idle-capacity scan ranks transformed util_fits_cpu() outcomes; lower values
+ * are more preferred (see select_idle_capacity()).
+ */
+enum asym_fits_state {
+	/* In descending order of preference */
+	ASYM_IDLE_CORE_UCLAMP_MISFIT = -4,
+	ASYM_IDLE_CORE_COMPLETE_MISFIT,
+	ASYM_IDLE_THREAD_FITS,
+	ASYM_IDLE_THREAD_UCLAMP_MISFIT,
+	ASYM_IDLE_COMPLETE_MISFIT,
+
+	/* Bias select_idle_capacity() adds to fits for a CPU on an idle core. */
+	ASYM_IDLE_CORE_BIAS = -3,
+};
+
 /*
  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
  * the task fits. If no CPU is big enough, but there are idle ones, try to
@@ -7770,8 +7786,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
+	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
 	unsigned long task_util, util_min, util_max, best_cap = 0;
-	int fits, best_fits = 0;
+	int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
 
@@ -7783,6 +7800,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
 	for_each_cpu_wrap(cpu, cpus, target) {
+		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
 		unsigned long cpu_cap = capacity_of(cpu);
 
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -7791,7 +7809,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
 
 		/* This CPU fits with all requirements */
-		if (fits > 0)
+		if (fits > 0 && preferred_core)
 			return cpu;
 		/*
 		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
@@ -7799,9 +7817,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 */
 		else if (fits < 0)
 			cpu_cap = get_actual_cpu_capacity(cpu);
+		/*
+		 * fits > 0 implies we are not on a preferred core
+		 * but the util fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
+		 * so the effective range becomes
+		 * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_COMPLETE_MISFIT] where:
+		 *    ASYM_IDLE_COMPLETE_MISFIT - does not fit
+		 *    ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
+		 *    ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
+		 */
+		else if (fits > 0)
+			fits = ASYM_IDLE_THREAD_FITS;
+
+		/*
+		 * If we are on a preferred core, translate the range of fits
+		 * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] to
+		 * [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT].
+		 * This ensures that an idle core is always given priority over
+		 * a (partially) busy core.
+		 *
+		 * A fully fitting idle core would have returned early and hence
+		 * fits > 0 for preferred_core need not be dealt with.
+		 */
+		if (preferred_core)
+			fits += ASYM_IDLE_CORE_BIAS;
 
 		/*
-		 * First, select CPU which fits better (-1 being better than 0).
+		 * First, select CPU which fits better (lower is more preferred).
 		 * Then, select the one with best capacity at same level.
 		 */
 		if ((fits < best_fits) ||
@@ -7812,6 +7854,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		}
 	}
 
+	/*
+	 * A value in the [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_BIAS]
+	 * range means the chosen CPU is in a fully idle SMT core. Values above
+	 * ASYM_IDLE_CORE_BIAS mean we never ranked such a CPU best.
+	 *
+	 * The asym-capacity wakeup path returns from select_idle_sibling()
+	 * after this function and never runs select_idle_cpu(), so the usual
+	 * select_idle_cpu() tail that clears idle cores must live here when the
+	 * idle-core preference did not win.
+	 */
+	if (prefers_idle_core && best_fits > ASYM_IDLE_CORE_BIAS)
+		set_idle_cores(target, false);
+
 	return best_cpu;
 }
 
@@ -7820,12 +7875,17 @@ static inline bool asym_fits_cpu(unsigned long util,
 				 unsigned long util_max,
 				 int cpu)
 {
-	if (sched_asym_cpucap_active())
+	if (sched_asym_cpucap_active()) {
 		/*
 		 * Return true only if the cpu fully fits the task requirements
 		 * which include the utilization and the performance hints.
+		 *
+		 * When SMT is active, also require that the core has no busy
+		 * siblings.
 		 */
-		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+		return (!sched_smt_active() || is_core_idle(cpu)) &&
+		       (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+	}
 
 	return true;
 }

From 55e068ef94cd86840a9b7ce8bce1fde38efaa481 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 25 Mar 2026 16:39:32 +0100
Subject: [PATCH 3/4] NVIDIA: VR: SAUCE: sched/fair: Reject misfit pulls onto
 busy SMT siblings on asym-capacity

BugLink: https://bugs.launchpad.net/bugs/2150671

When SD_ASYM_CPUCAPACITY load balancing considers pulling a misfit task,
capacity_of(dst_cpu) can overstate available compute if the SMT sibling is
busy: the core does not deliver its full nominal capacity.

If SMT is active and dst_cpu is not on a fully idle core, skip this
destination so we do not migrate a misfit expecting a capacity upgrade we
cannot actually provide.

Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Christian Loehle <christian.loehle@arm.com>
Cc: Koba Ko <kobak@nvidia.com>
Cc: K Prateek Nayak <kprateek.nayak@amd.com>
Reported-by: Felix Abecassis <fabecassis@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
(cherry picked from https://lore.kernel.org/all/20260428051720.3180182-1-arighi@nvidia.com)
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4393d393f70d..3513c47f94c8f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9390,6 +9390,7 @@ struct lb_env {
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
+	bool			dst_core_idle;
 
 	struct cpumask		*dst_grpmask;
 	int			new_dst_cpu;
@@ -10635,10 +10636,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 * We can use max_capacity here as reduction in capacity on some
 	 * CPUs in the group should either be possible to resolve
 	 * internally or be covered by avg_load imbalance (eventually).
+	 *
+	 * When SMT is active, only pull a misfit to dst_cpu if it is on a
+	 * fully idle core; otherwise the effective capacity of the core is
+	 * reduced and we may not actually provide more capacity than the
+	 * source.
 	 */
 	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
 	    (sgs->group_type == group_misfit_task) &&
-	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
+	    (!env->dst_core_idle ||
+	     !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
 	     sds->local_stat.group_type != group_has_spare))
 		return false;
 
@@ -11204,6 +11211,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	unsigned long sum_util = 0;
 	bool sg_overloaded = 0, sg_overutilized = 0;
 
+	env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu);
+
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;

From 9011a2603e7aa3d815dcbfda78fd8e469136cbce Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Tue, 21 Apr 2026 16:52:46 +0530
Subject: [PATCH 4/4] NVIDIA: VR: SAUCE: sched/fair: Add SIS_UTIL support to
 select_idle_capacity()

BugLink: https://bugs.launchpad.net/bugs/2150671

Add to select_idle_capacity() the same SIS_UTIL-controlled idle-scan
mechanism already used by select_idle_cpu(): when sched_feat(SIS_UTIL)
is enabled and the scanned domain (sd_asym_cpucapacity) has
sched_domain_shared data attached, derive the per-attempt scan limit
from sd->shared->nr_idle_scan.

That bounds the walk on large domains: when no idle-core preference is
in effect, the scan returns early once the limit is reached, provided we
already picked a sufficiently strong candidate
(best_fits == ASYM_IDLE_CORE_UCLAMP_MISFIT).

Co-developed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
(cherry picked from https://lore.kernel.org/all/20260428051720.3180182-1-arighi@nvidia.com)
Signed-off-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/fair.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3513c47f94c8f..82714027a6564 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7791,6 +7791,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
+	int nr = INT_MAX;
 
 	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
@@ -7799,10 +7800,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	util_min = uclamp_eff_value(p, UCLAMP_MIN);
 	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
+	if (sched_feat(SIS_UTIL) && sd->shared) {
+		/*
+		 * Same nr_idle_scan hint as select_idle_cpu(), nr only limits
+		 * the scan when not preferring an idle core.
+		 */
+		nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+		/* overloaded domain is unlikely to have idle cpu/core */
+		if (nr == 1)
+			return -1;
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target) {
 		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
 		unsigned long cpu_cap = capacity_of(cpu);
 
+		/*
+		 * Good-enough early exit (mirrors select_idle_cpu() logic).
+		 */
+		if (!prefers_idle_core &&
+		    --nr <= 0 && best_fits == ASYM_IDLE_CORE_UCLAMP_MISFIT)
+			return best_cpu;
+
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;