Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 98 additions & 8 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -7762,6 +7762,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
return idle_cpu;
}

/*
 * Ranking used by select_idle_capacity() when scanning for an idle CPU.
 *
 * Each candidate's util_fits_cpu() result is mapped onto one of the states
 * below and the candidate with the numerically lowest state wins, i.e. a
 * smaller value means a more preferred CPU.
 */
enum asym_fits_state {
	/* Most preferred first; every step down the list is one rank worse. */
	ASYM_IDLE_CORE_UCLAMP_MISFIT	= -4,
	ASYM_IDLE_CORE_COMPLETE_MISFIT	= -3,
	ASYM_IDLE_THREAD_FITS		= -2,
	ASYM_IDLE_THREAD_UCLAMP_MISFIT	= -1,
	ASYM_IDLE_COMPLETE_MISFIT	=  0,

	/*
	 * Offset added to a candidate's state when it sits on an idle core.
	 * It shifts the [ASYM_IDLE_THREAD_UCLAMP_MISFIT,
	 * ASYM_IDLE_COMPLETE_MISFIT] range onto
	 * [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT].
	 */
	ASYM_IDLE_CORE_BIAS = ASYM_IDLE_CORE_UCLAMP_MISFIT -
			      ASYM_IDLE_THREAD_UCLAMP_MISFIT,
};

/*
* Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
* the task fits. If no CPU is big enough, but there are idle ones, try to
Expand All @@ -7770,10 +7786,12 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
bool prefers_idle_core = sched_smt_active() && test_idle_cores(target);
unsigned long task_util, util_min, util_max, best_cap = 0;
int fits, best_fits = 0;
int fits, best_fits = ASYM_IDLE_COMPLETE_MISFIT;
int cpu, best_cpu = -1;
struct cpumask *cpus;
int nr = INT_MAX;

cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
Expand All @@ -7782,26 +7800,69 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);

if (sched_feat(SIS_UTIL) && sd->shared) {
/*
* Same nr_idle_scan hint as select_idle_cpu(), nr only limits
* the scan when not preferring an idle core.
*/
nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
/* overloaded domain is unlikely to have idle cpu/core */
if (nr == 1)
return -1;
}

for_each_cpu_wrap(cpu, cpus, target) {
bool preferred_core = !prefers_idle_core || is_core_idle(cpu);
unsigned long cpu_cap = capacity_of(cpu);

/*
* Good-enough early exit (mirrors select_idle_cpu() logic).
*/
if (!prefers_idle_core &&
--nr <= 0 && best_fits == ASYM_IDLE_CORE_UCLAMP_MISFIT)
return best_cpu;

if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;

fits = util_fits_cpu(task_util, util_min, util_max, cpu);

/* This CPU fits with all requirements */
if (fits > 0)
if (fits > 0 && preferred_core)
return cpu;
/*
* Only the min performance hint (i.e. uclamp_min) doesn't fit.
* Look for the CPU with best capacity.
*/
else if (fits < 0)
cpu_cap = get_actual_cpu_capacity(cpu);
/*
* fits > 0 implies we are not on a preferred core
* but the util fits CPU capacity. Set fits to ASYM_IDLE_THREAD_FITS
* so the effective range becomes
* [ASYM_IDLE_THREAD_FITS, ASYM_IDLE_COMPLETE_MISFIT] where:
* ASYM_IDLE_COMPLETE_MISFIT - does not fit
* ASYM_IDLE_THREAD_UCLAMP_MISFIT - fits with the exception of UCLAMP_MIN
* ASYM_IDLE_THREAD_FITS - fits with the exception of preferred_core
*/
else if (fits > 0)
fits = ASYM_IDLE_THREAD_FITS;

/*
* First, select CPU which fits better (-1 being better than 0).
* If we are on a preferred core, translate the range of fits
* of [ASYM_IDLE_THREAD_UCLAMP_MISFIT, ASYM_IDLE_COMPLETE_MISFIT] to
* [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_COMPLETE_MISFIT].
* This ensures that an idle core is always given priority over
* (partially) busy core.
*
* A fully fitting idle core would have returned early and hence
* fits > 0 for preferred_core need not be dealt with.
*/
if (preferred_core)
fits += ASYM_IDLE_CORE_BIAS;

/*
* First, select CPU which fits better (lower is more preferred).
* Then, select the one with best capacity at same level.
*/
if ((fits < best_fits) ||
Expand All @@ -7812,6 +7873,19 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
}
}

/*
* A value in the [ASYM_IDLE_CORE_UCLAMP_MISFIT, ASYM_IDLE_CORE_BIAS]
* range means the chosen CPU is in a fully idle SMT core. Values above
* ASYM_IDLE_CORE_BIAS mean we never ranked such a CPU best.
*
* The asym-capacity wakeup path returns from select_idle_sibling()
* after this function and never runs select_idle_cpu(), so the usual
* select_idle_cpu() tail that clears idle cores must live here when the
* idle-core preference did not win.
*/
if (prefers_idle_core && best_fits > ASYM_IDLE_CORE_BIAS)
set_idle_cores(target, false);

return best_cpu;
}

Expand All @@ -7820,12 +7894,17 @@ static inline bool asym_fits_cpu(unsigned long util,
unsigned long util_max,
int cpu)
{
if (sched_asym_cpucap_active())
if (sched_asym_cpucap_active()) {
/*
* Return true only if the cpu fully fits the task requirements
* which include the utilization and the performance hints.
*
* When SMT is active, also require that the core has no busy
* siblings.
*/
return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
return (!sched_smt_active() || is_core_idle(cpu)) &&
(util_fits_cpu(util, util_min, util_max, cpu) > 0);
}

return true;
}
Expand Down Expand Up @@ -9330,6 +9409,7 @@ struct lb_env {

int dst_cpu;
struct rq *dst_rq;
bool dst_core_idle;

struct cpumask *dst_grpmask;
int new_dst_cpu;
Expand Down Expand Up @@ -10575,10 +10655,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* We can use max_capacity here as reduction in capacity on some
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*
* When SMT is active, only pull a misfit to dst_cpu if it is on a
* fully idle core; otherwise the effective capacity of the core is
* reduced and we may not actually provide more capacity than the
* source.
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
(!env->dst_core_idle ||
!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;

Expand Down Expand Up @@ -11144,6 +11230,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
unsigned long sum_util = 0;
bool sg_overloaded = 0, sg_overutilized = 0;

env->dst_core_idle = !sched_smt_active() || is_core_idle(env->dst_cpu);

do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
Expand Down Expand Up @@ -12617,7 +12705,8 @@ static void set_cpu_sd_state_busy(int cpu)
goto unlock;
sd->nohz_idle = 0;

atomic_inc(&sd->shared->nr_busy_cpus);
if (sd->shared)
atomic_inc(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
Expand Down Expand Up @@ -12646,7 +12735,8 @@ static void set_cpu_sd_state_idle(int cpu)
goto unlock;
sd->nohz_idle = 1;

atomic_dec(&sd->shared->nr_busy_cpus);
if (sd->shared)
atomic_dec(&sd->shared->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
Expand Down
101 changes: 85 additions & 16 deletions kernel/sched/topology.c
Original file line number Diff line number Diff line change
Expand Up @@ -680,16 +680,38 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;

sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
/*
* The shared object is attached to sd_asym_cpucapacity only when the
* asym domain is non-overlapping (i.e., not built from SD_NUMA).
* On overlapping (NUMA) asym domains we fall back to letting the
* SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
* here.
*/
if (sd && sd->shared)
sds = sd->shared;

rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);

sd = highest_flag_domain(cpu, SD_SHARE_LLC);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
sds = sd->shared;

/*
* If sd_asym_cpucapacity didn't claim the shared object,
* sd_llc must have one linked.
*/
if (!sds) {
WARN_ON_ONCE(!sd->shared);
sds = sd->shared;
}
}

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;

rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

sd = lowest_flag_domain(cpu, SD_CLUSTER);
Expand All @@ -708,9 +730,6 @@ static void update_top_cache_domain(int cpu)

sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
}

/*
Expand Down Expand Up @@ -1640,7 +1659,7 @@ sd_init(struct sched_domain_topology_level *tl,
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
int sd_id, sd_weight, sd_flags = 0;
int sd_weight, sd_flags = 0;
struct cpumask *sd_span;

sd_weight = cpumask_weight(tl->mask(tl, cpu));
Expand Down Expand Up @@ -1688,7 +1707,6 @@ sd_init(struct sched_domain_topology_level *tl,

sd_span = sched_domain_span(sd);
cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
sd_id = cpumask_first(sd_span);

sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);

Expand Down Expand Up @@ -1727,16 +1745,6 @@ sd_init(struct sched_domain_topology_level *tl,
sd->cache_nice_tries = 1;
}

/*
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
if (sd->flags & SD_SHARE_LLC) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
}

sd->private = sdd;

return sd;
Expand Down Expand Up @@ -2548,6 +2556,16 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
return true;
}

/*
 * Connect @sd to a shared-state instance.
 *
 * The instance is taken from the per-CPU sds slot (reached through
 * sd->private) of the first CPU in @sd's span. Its busy-CPU counter is set
 * to the domain's span weight and its reference count is incremented.
 */
static void init_sched_domain_shared(struct sched_domain *sd)
{
	struct sd_data *sdd = sd->private;
	int first_cpu;

	/* The shared slot is selected by the first CPU of the span. */
	first_cpu = cpumask_first(sched_domain_span(sd));
	sd->shared = *per_cpu_ptr(sdd->sds, first_cpu);

	/* Seed the busy count with the span weight, then take a reference. */
	atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
	atomic_inc(&sd->shared->ref);
}

/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
Expand Down Expand Up @@ -2605,6 +2623,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}

for_each_cpu(i, cpu_map) {
struct sched_domain *sd_asym = NULL;
bool asym_claimed = false;

sd = *per_cpu_ptr(d.sd, i);
if (!sd)
continue;

/*
* In case of ASYM_CPUCAPACITY, attach sd->shared to
* sd_asym_cpucapacity for wakeup stat tracking.
*
* Caveats:
*
* 1) has_asym is system-wide, but a given CPU may still
* lack an SD_ASYM_CPUCAPACITY_FULL ancestor (e.g., an
* exclusive cpuset carving out a symmetric capacity island).
* Such CPUs must fall through to the LLC seeding path below.
*
* 2) Skip the asym attach if the asym ancestor is an
* overlapping domain (SD_NUMA). On those topologies let the
* LLC path own the shared object instead.
*
* XXX: This assumes SD_ASYM_CPUCAPACITY_FULL domain
* always has more than one group else it is prone to
* degeneration.
*/
sd_asym = sd;
while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL))
sd_asym = sd_asym->parent;

if (sd_asym && !(sd_asym->flags & SD_NUMA)) {
init_sched_domain_shared(sd_asym);
asym_claimed = true;
}

/* First, find the topmost SD_SHARE_LLC domain */
sd = *per_cpu_ptr(d.sd, i);
while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
sd = sd->parent;

if (sd->flags & SD_SHARE_LLC) {
/*
* Initialize the sd->shared for SD_SHARE_LLC unless
* the asym path above already claimed it.
*/
if (!asym_claimed)
init_sched_domain_shared(sd);
}
}

/*
* Calculate an allowed NUMA imbalance such that LLCs do not get
* imbalanced.
Expand Down
Loading