diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab4114712be74..82714027a6564 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7762,6 +7762,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return idle_cpu; } +/* + * Idle-capacity scan ranks transformed util_fits_cpu() outcomes; lower values + * are more preferred (see select_idle_capacity()). + */ +enum asym_fits_state { + /* In descending order of preference */ + ASYM_IDLE_CORE_UCLAMP_MISFIT = -4, + ASYM_IDLE_CORE_COMPLETE_MISFIT, + ASYM_IDLE_THREAD_FITS, + ASYM_IDLE_THREAD_UCLAMP_MISFIT, + ASYM_IDLE_COMPLETE_MISFIT, + + /* util_fits_cpu() bias for an idle core. */ + ASYM_IDLE_CORE_BIAS = -3, +}; + /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to @@ -7770,10 +7786,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { + bool prefers_idle_core = sched_smt_active() && test_idle_cores(target); unsigned long task_util, util_min, util_max, best_cap = 0; - int fits, best_fits = 0; + int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT; int cpu, best_cpu = -1; struct cpumask *cpus; + int nr = INT_MAX; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); @@ -7782,16 +7800,35 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); + if (sched_feat(SIS_UTIL) && sd->shared) { + /* + * Same nr_idle_scan hint as select_idle_cpu(), nr only limits + * the scan when not preferring an idle core. 
+ */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded domain is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; + } + for_each_cpu_wrap(cpu, cpus, target) { + bool preferred_core = !prefers_idle_core || is_core_idle(cpu); unsigned long cpu_cap = capacity_of(cpu); + /* + * Good-enough early exit (mirrors select_idle_cpu() logic). + */ + if (!prefers_idle_core && + --nr <= 0 && best_fits == ASYM_IDLE_CORE_UCLAMP_MISFIT) + return best_cpu; + if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); /* This CPU fits with all requirements */ - if (fits > 0) + if (fits > 0 && preferred_core) return cpu; /* * Only the min performance hint (i.e. uclamp_min) doesn't fit. @@ -7799,9 +7836,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) */ else if (fits < 0) cpu_cap = get_actual_cpu_capacity(cpu); + /* + * fits > 0 implies we are not on a preferred core + * but the util fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS + * so the effective range becomes + * [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_COMPLETE_MISFIT] where: + * ASYM_IDLE_COMPLETE_MISFIT - does not fit + * ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN + * ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core + */ + else if (fits > 0) + fits = ASYM_IDLE_THREAD_FITS; /* - * First, select CPU which fits better (-1 being better than 0). + * If we are on a preferred core, translate the range of fits + * of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] to + * [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT]. + * This ensures that an idle core is always given priority over + * a (partially) busy core. + * + * A fully fitting idle core would have returned early and hence + * fits > 0 for preferred_core need not be dealt with. 
+ */ + if (preferred_core) + fits += ASYM_IDLE_CORE_BIAS; + + /* + * First, select CPU which fits better (lower is more preferred). * Then, select the one with best capacity at same level. */ if ((fits < best_fits) || @@ -7812,6 +7873,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) } } + /* + * A value in the [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_BIAS] + * range means the chosen CPU is in a fully idle SMT core. Values above + * ASYM_IDLE_CORE_BIAS mean we never ranked such a CPU best. + * + * The asym-capacity wakeup path returns from select_idle_sibling() + * after this function and never runs select_idle_cpu(), so the usual + * select_idle_cpu() tail that clears idle cores must live here when the + * idle-core preference did not win. + */ + if (prefers_idle_core && best_fits > ASYM_IDLE_CORE_BIAS) + set_idle_cores(target, false); + return best_cpu; } @@ -7820,12 +7894,17 @@ static inline bool asym_fits_cpu(unsigned long util, unsigned long util_max, int cpu) { - if (sched_asym_cpucap_active()) + if (sched_asym_cpucap_active()) { /* * Return true only if the cpu fully fits the task requirements * which include the utilization and the performance hints. + * + * When SMT is active, also require that the core has no busy + * siblings. */ - return (util_fits_cpu(util, util_min, util_max, cpu) > 0); + return (!sched_smt_active() || is_core_idle(cpu)) && + (util_fits_cpu(util, util_min, util_max, cpu) > 0); + } return true; } @@ -9330,6 +9409,7 @@ struct lb_env { int dst_cpu; struct rq *dst_rq; + bool dst_core_idle; struct cpumask *dst_grpmask; int new_dst_cpu; @@ -10575,10 +10655,16 @@ static bool update_sd_pick_busiest(struct lb_env *env, * We can use max_capacity here as reduction in capacity on some * CPUs in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). 
+ * + * When SMT is active, only pull a misfit to dst_cpu if it is on a + * fully idle core; otherwise the effective capacity of the core is + * reduced and we may not actually provide more capacity than the + * source. */ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type == group_misfit_task) && - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || + (!env->dst_core_idle || + !capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; @@ -11144,6 +11230,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd unsigned long sum_util = 0; bool sg_overloaded = 0, sg_overutilized = 0; + env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu); + do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -12617,7 +12705,8 @@ static void set_cpu_sd_state_busy(int cpu) goto unlock; sd->nohz_idle = 0; - atomic_inc(&sd->shared->nr_busy_cpus); + if (sd->shared) + atomic_inc(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } @@ -12646,7 +12735,8 @@ static void set_cpu_sd_state_idle(int cpu) goto unlock; sd->nohz_idle = 1; - atomic_dec(&sd->shared->nr_busy_cpus); + if (sd->shared) + atomic_dec(&sd->shared->nr_busy_cpus); unlock: rcu_read_unlock(); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82d..7bc2d13b3bf57 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -680,16 +680,38 @@ static void update_top_cache_domain(int cpu) int id = cpu; int size = 1; + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); + /* + * The shared object is attached to sd_asym_cpucapacity only when the + * asym domain is non-overlapping (i.e., not built from SD_NUMA). + * On overlapping (NUMA) asym domains we fall back to letting the + * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL + * here. 
+ */ + if (sd && sd->shared) + sds = sd->shared; + + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); + sd = highest_flag_domain(cpu, SD_SHARE_LLC); if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - sds = sd->shared; + + /* + * If sd_asym_cpucapacity didn't claim the shared object, + * sd_llc must have one linked. + */ + if (!sds) { + WARN_ON_ONCE(!sd->shared); + sds = sd->shared; + } } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_CLUSTER); @@ -708,9 +730,6 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); - - sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL); - rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* @@ -1640,7 +1659,7 @@ sd_init(struct sched_domain_topology_level *tl, { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); - int sd_id, sd_weight, sd_flags = 0; + int sd_weight, sd_flags = 0; struct cpumask *sd_span; sd_weight = cpumask_weight(tl->mask(tl, cpu)); @@ -1688,7 +1707,6 @@ sd_init(struct sched_domain_topology_level *tl, sd_span = sched_domain_span(sd); cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); - sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1727,16 +1745,6 @@ sd_init(struct sched_domain_topology_level *tl, sd->cache_nice_tries = 1; } - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. 
- */ - if (sd->flags & SD_SHARE_LLC) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - sd->private = sdd; return sd; @@ -2548,6 +2556,16 @@ static bool topology_span_sane(const struct cpumask *cpu_map) return true; } +static void init_sched_domain_shared(struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(sdd->sds, sd_id); + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -2605,6 +2623,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + for_each_cpu(i, cpu_map) { + struct sched_domain *sd_asym = NULL; + bool asym_claimed = false; + + sd = *per_cpu_ptr(d.sd, i); + if (!sd) + continue; + + /* + * In case of ASYM_CPUCAPACITY, attach sd->shared to + * sd_asym_cpucapacity for wakeup stat tracking. + * + * Caveats: + * + * 1) has_asym is system-wide, but a given CPU may still + * lack an SD_ASYM_CPUCAPACITY_FULL ancestor (e.g., an + * exclusive cpuset carving out a symmetric capacity island). + * Such CPUs must fall through to the LLC seeding path below. + * + * 2) Skip the asym attach if the asym ancestor is an + * overlapping domain (SD_NUMA). On those topologies let the + * LLC path own the shared object instead. + * + * XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain + * always has more than one group else it is prone to + * degeneration. 
+ */ + sd_asym = sd; + while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL)) + sd_asym = sd_asym->parent; + + if (sd_asym && !(sd_asym->flags & SD_NUMA)) { + init_sched_domain_shared(sd_asym); + asym_claimed = true; + } + + /* First, find the topmost SD_SHARE_LLC domain */ + sd = *per_cpu_ptr(d.sd, i); + while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; + + if (sd->flags & SD_SHARE_LLC) { + /* + * Initialize the sd->shared for SD_SHARE_LLC unless + * the asym path above already claimed it. + */ + if (!asym_claimed) + init_sched_domain_shared(sd); + } + } + /* * Calculate an allowed NUMA imbalance such that LLCs do not get * imbalanced.