diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab4114712be74..82714027a6564 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7762,6 +7762,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 	return idle_cpu;
 }
 
+/*
+ * Idle-capacity scan ranks transformed util_fits_cpu() outcomes; lower values
+ * are more preferred (see select_idle_capacity()).
+ */
+enum asym_fits_state {
+	/* In descending order of preference */
+	ASYM_IDLE_CORE_UCLAMP_MISFIT = -4,
+	ASYM_IDLE_CORE_COMPLETE_MISFIT,
+	ASYM_IDLE_THREAD_FITS,
+	ASYM_IDLE_THREAD_UCLAMP_MISFIT,
+	ASYM_IDLE_COMPLETE_MISFIT,
+
+	/* asym_fits_cpu() bias for an idle core. */
+	ASYM_IDLE_CORE_BIAS = -3,
+};
+
 /*
  * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
  * the task fits. If no CPU is big enough, but there are idle ones, try to
@@ -7770,10 +7786,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
+	bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
 	unsigned long task_util, util_min, util_max, best_cap = 0;
-	int fits, best_fits = 0;
+	int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
+	int nr = INT_MAX;
 
 	cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
@@ -7782,16 +7800,35 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	util_min = uclamp_eff_value(p, UCLAMP_MIN);
 	util_max = uclamp_eff_value(p, UCLAMP_MAX);
 
+	if (sched_feat(SIS_UTIL) && sd->shared) {
+		/*
+		 * Same nr_idle_scan hint as select_idle_cpu(), nr only limits
+		 * the scan when not preferring an idle core.
+		 */
+		nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+		/* overloaded domain is unlikely to have idle cpu/core */
+		if (nr == 1)
+			return -1;
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target) {
+		bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
 		unsigned long cpu_cap = capacity_of(cpu);
 
+		/*
+		 * Good-enough early exit (mirrors select_idle_cpu() logic).
+		 */
+		if (!prefers_idle_core &&
+		    --nr <= 0 && best_fits == ASYM_IDLE_CORE_UCLAMP_MISFIT)
+			return best_cpu;
+
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;
 
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
 
 		/* This CPU fits with all requirements */
-		if (fits > 0)
+		if (fits > 0 && preferred_core)
 			return cpu;
 		/*
 		 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
@@ -7799,9 +7836,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		 */
 		else if (fits < 0)
 			cpu_cap = get_actual_cpu_capacity(cpu);
+		/*
+		 * fits > 0 implies we are not on a preferred core
+		 * but the util fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
+		 * so the effective range becomes
+		 * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_COMPLETE_MISFIT] where:
+		 *    ASYM_IDLE_COMPLETE_MISFIT - does not fit
+		 *    ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
+		 *    ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
+		 */
+		else if (fits > 0)
+			fits = ASYM_IDLE_THREAD_FITS;
 
 		/*
-		 * First, select CPU which fits better (-1 being better than 0).
+		 * If we are on a preferred core, translate the range of fits
+		 * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] to
+		 * [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT].
+		 * This ensures that an idle core is always given priority over
+		 * (partially) busy core.
+		 *
+		 * A fully fitting idle core would have returned early and hence
+		 * fits > 0 for preferred_core need not be dealt with.
+		 */
+		if (preferred_core)
+			fits += ASYM_IDLE_CORE_BIAS;
+
+		/*
+		 * First, select CPU which fits better (lower is more preferred).
 		 * Then, select the one with best capacity at same level.
 		 */
 		if ((fits < best_fits) ||
@@ -7812,6 +7873,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 		}
 	}
 
+	/*
+	 * A value in the [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_BIAS]
+	 * range means the chosen CPU is in a fully idle SMT core. Values above
+	 * ASYM_IDLE_CORE_BIAS mean we never ranked such a CPU best.
+	 *
+	 * The asym-capacity wakeup path returns from select_idle_sibling()
+	 * after this function and never runs select_idle_cpu(), so the usual
+	 * select_idle_cpu() tail that clears idle cores must live here when the
+	 * idle-core preference did not win.
+	 */
+	if (prefers_idle_core && best_fits > ASYM_IDLE_CORE_BIAS)
+		set_idle_cores(target, false);
+
 	return best_cpu;
 }
 
@@ -7820,12 +7894,17 @@ static inline bool asym_fits_cpu(unsigned long util,
 				 unsigned long util_max,
 				 int cpu)
 {
-	if (sched_asym_cpucap_active())
+	if (sched_asym_cpucap_active()) {
 		/*
 		 * Return true only if the cpu fully fits the task requirements
 		 * which include the utilization and the performance hints.
+		 *
+		 * When SMT is active, also require that the core has no busy
+		 * siblings.
 		 */
-		return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+		return (!sched_smt_active() || is_core_idle(cpu)) &&
+		       (util_fits_cpu(util, util_min, util_max, cpu) > 0);
+	}
 
 	return true;
 }
@@ -9330,6 +9409,7 @@ struct lb_env {
 
 	int			dst_cpu;
 	struct rq		*dst_rq;
+	bool			dst_core_idle;
 
 	struct cpumask		*dst_grpmask;
 	int			new_dst_cpu;
@@ -10575,10 +10655,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 * We can use max_capacity here as reduction in capacity on some
 	 * CPUs in the group should either be possible to resolve
 	 * internally or be covered by avg_load imbalance (eventually).
+	 *
+	 * When SMT is active, only pull a misfit to dst_cpu if it is on a
+	 * fully idle core; otherwise the effective capacity of the core is
+	 * reduced and we may not actually provide more capacity than the
+	 * source.
 	 */
 	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
 	    (sgs->group_type == group_misfit_task) &&
-	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
+	    (!env->dst_core_idle ||
+	     !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
 	     sds->local_stat.group_type != group_has_spare))
 		return false;
 
@@ -11144,6 +11230,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	unsigned long sum_util = 0;
 	bool sg_overloaded = 0, sg_overutilized = 0;
 
+	env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu);
+
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
@@ -12617,7 +12705,8 @@ static void set_cpu_sd_state_busy(int cpu)
 		goto unlock;
 	sd->nohz_idle = 0;
 
-	atomic_inc(&sd->shared->nr_busy_cpus);
+	if (sd->shared)
+		atomic_inc(&sd->shared->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
@@ -12646,7 +12735,8 @@ static void set_cpu_sd_state_idle(int cpu)
 		goto unlock;
 	sd->nohz_idle = 1;
 
-	atomic_dec(&sd->shared->nr_busy_cpus);
+	if (sd->shared)
+		atomic_dec(&sd->shared->nr_busy_cpus);
 unlock:
 	rcu_read_unlock();
 }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82d..7bc2d13b3bf57 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -680,16 +680,38 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;
 
+	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
+	/*
+	 * The shared object is attached to sd_asym_cpucapacity only when the
+	 * asym domain is non-overlapping (i.e., not built from SD_NUMA).
+	 * On overlapping (NUMA) asym domains we fall back to letting the
+	 * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
+	 * here.
+	 */
+	if (sd && sd->shared)
+		sds = sd->shared;
+
+	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+
 	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
-		sds = sd->shared;
+
+		/*
+		 * If sd_asym_cpucapacity didn't claim the shared object,
+		 * sd_llc must have one linked.
+		 */
+		if (!sds) {
+			WARN_ON_ONCE(!sd->shared);
+			sds = sd->shared;
+		}
 	}
 
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
 	per_cpu(sd_llc_id, cpu) = id;
+
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -708,9 +730,6 @@ static void update_top_cache_domain(int cpu)
 
 	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
 	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
-
-	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
-	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -1640,7 +1659,7 @@ sd_init(struct sched_domain_topology_level *tl,
 {
 	struct sd_data *sdd = &tl->data;
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
-	int sd_id, sd_weight, sd_flags = 0;
+	int sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
 
 	sd_weight = cpumask_weight(tl->mask(tl, cpu));
@@ -1688,7 +1707,6 @@ sd_init(struct sched_domain_topology_level *tl,
 
 	sd_span = sched_domain_span(sd);
 	cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
-	sd_id = cpumask_first(sd_span);
 
 	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
 
@@ -1727,16 +1745,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->cache_nice_tries = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_LLC) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
-
 	sd->private = sdd;
 
 	return sd;
@@ -2548,6 +2556,16 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	return true;
 }
 
+static void init_sched_domain_shared(struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+	int sd_id = cpumask_first(sched_domain_span(sd));
+
+	sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+	atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+	atomic_inc(&sd->shared->ref);
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2605,6 +2623,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain *sd_asym = NULL;
+		bool asym_claimed = false;
+
+		sd = *per_cpu_ptr(d.sd, i);
+		if (!sd)
+			continue;
+
+		/*
+		 * In case of ASYM_CPUCAPACITY, attach sd->shared to
+		 * sd_asym_cpucapacity for wakeup stat tracking.
+		 *
+		 * Caveats:
+		 *
+		 * 1) has_asym is system-wide, but a given CPU may still
+		 *    lack an SD_ASYM_CPUCAPACITY_FULL ancestor (e.g., an
+		 *    exclusive cpuset carving out a symmetric capacity island).
+		 *    Such CPUs must fall through to the LLC seeding path below.
+		 *
+		 * 2) Skip the asym attach if the asym ancestor is an
+		 *    overlapping domain (SD_NUMA). On those topologies let the
+		 *    LLC path own the shared object instead.
+		 *
+		 * XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain
+		 * always has more than one group else it is prone to
+		 * degeneration.
+		 */
+		sd_asym = sd;
+		while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
+			sd_asym = sd_asym->parent;
+
+		if (sd_asym && !(sd_asym->flags & SD_NUMA)) {
+			init_sched_domain_shared(sd_asym);
+			asym_claimed = true;
+		}
+
+		/* First, find the topmost SD_SHARE_LLC domain */
+		sd = *per_cpu_ptr(d.sd, i);
+		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+			sd = sd->parent;
+
+		if (sd->flags & SD_SHARE_LLC) {
+			/*
+			 * Initialize the sd->shared for SD_SHARE_LLC unless
+			 * the asym path above already claimed it.
+			 */
+			if (!asym_claimed)
+				init_sched_domain_shared(sd);
+		}
+	}
+
 	/*
 	 * Calculate an allowed NUMA imbalance such that LLCs do not get
 	 * imbalanced.