cgroup/cpuset: Fix remote root partition creation problem
Since commit 181c8e091a
("cgroup/cpuset: Introduce remote partition"), a remote partition can be created underneath a non-partition root cpuset as long as its exclusive_cpus are set to distribute exclusive CPUs down to its children. The generate_sched_domains() function, however, doesn't take this new behavior into account and hence will fail to create the sched domain needed for a remote root (non-isolated) partition. There are two issues related to remote partition support. First of all, generate_sched_domains() has a fast path that is activated if root_load_balance is true and top_cpuset.nr_subparts is zero. The latter condition isn't quite correct for remote partitions as nr_subparts just shows the number of local child partitions underneath it. There can be no local child partition under top_cpuset even if there are remote partitions further down the hierarchy. Fix that by checking for subpartitions_cpus which contains exclusive CPUs allocated to both local and remote partitions. Secondly, the valid partition check for subtree skipping in the csa[] generation loop isn't enough as a remote partition does not need to have a partition root parent. Fix this problem by breaking the csa[] array generation loop of generate_sched_domains() into v1 and v2 specific parts and checking a cpuset's exclusive_cpus before skipping its subtree in the v2 case. Also simplify generate_sched_domains() for cgroup v2 as only non-isolating partition roots should be included in building the cpuset array and none of the v1 scheduling attributes other than a different way to create an isolated partition are supported. Fixes: 181c8e091a
("cgroup/cpuset: Introduce remote partition") Signed-off-by: Waiman Long <longman@redhat.com> Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
parent
6fe960147e
commit
ccac8e8de9
@ -169,7 +169,7 @@ struct cpuset {
|
|||||||
/* for custom sched domain */
|
/* for custom sched domain */
|
||||||
int relax_domain_level;
|
int relax_domain_level;
|
||||||
|
|
||||||
/* number of valid sub-partitions */
|
/* number of valid local child partitions */
|
||||||
int nr_subparts;
|
int nr_subparts;
|
||||||
|
|
||||||
/* partition root state */
|
/* partition root state */
|
||||||
@ -957,13 +957,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
|
|||||||
int nslot; /* next empty doms[] struct cpumask slot */
|
int nslot; /* next empty doms[] struct cpumask slot */
|
||||||
struct cgroup_subsys_state *pos_css;
|
struct cgroup_subsys_state *pos_css;
|
||||||
bool root_load_balance = is_sched_load_balance(&top_cpuset);
|
bool root_load_balance = is_sched_load_balance(&top_cpuset);
|
||||||
|
bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
|
||||||
|
|
||||||
doms = NULL;
|
doms = NULL;
|
||||||
dattr = NULL;
|
dattr = NULL;
|
||||||
csa = NULL;
|
csa = NULL;
|
||||||
|
|
||||||
/* Special case for the 99% of systems with one, full, sched domain */
|
/* Special case for the 99% of systems with one, full, sched domain */
|
||||||
if (root_load_balance && !top_cpuset.nr_subparts) {
|
if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
|
||||||
single_root_domain:
|
single_root_domain:
|
||||||
ndoms = 1;
|
ndoms = 1;
|
||||||
doms = alloc_sched_domains(ndoms);
|
doms = alloc_sched_domains(ndoms);
|
||||||
@ -992,16 +993,18 @@ single_root_domain:
|
|||||||
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
|
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
|
||||||
if (cp == &top_cpuset)
|
if (cp == &top_cpuset)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
if (cgrpv2)
|
||||||
|
goto v2;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* v1:
|
||||||
* Continue traversing beyond @cp iff @cp has some CPUs and
|
* Continue traversing beyond @cp iff @cp has some CPUs and
|
||||||
* isn't load balancing. The former is obvious. The
|
* isn't load balancing. The former is obvious. The
|
||||||
* latter: All child cpusets contain a subset of the
|
* latter: All child cpusets contain a subset of the
|
||||||
* parent's cpus, so just skip them, and then we call
|
* parent's cpus, so just skip them, and then we call
|
||||||
* update_domain_attr_tree() to calc relax_domain_level of
|
* update_domain_attr_tree() to calc relax_domain_level of
|
||||||
* the corresponding sched domain.
|
* the corresponding sched domain.
|
||||||
*
|
|
||||||
* If root is load-balancing, we can skip @cp if it
|
|
||||||
* is a subset of the root's effective_cpus.
|
|
||||||
*/
|
*/
|
||||||
if (!cpumask_empty(cp->cpus_allowed) &&
|
if (!cpumask_empty(cp->cpus_allowed) &&
|
||||||
!(is_sched_load_balance(cp) &&
|
!(is_sched_load_balance(cp) &&
|
||||||
@ -1009,16 +1012,28 @@ single_root_domain:
|
|||||||
housekeeping_cpumask(HK_TYPE_DOMAIN))))
|
housekeeping_cpumask(HK_TYPE_DOMAIN))))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (root_load_balance &&
|
|
||||||
cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (is_sched_load_balance(cp) &&
|
if (is_sched_load_balance(cp) &&
|
||||||
!cpumask_empty(cp->effective_cpus))
|
!cpumask_empty(cp->effective_cpus))
|
||||||
csa[csn++] = cp;
|
csa[csn++] = cp;
|
||||||
|
|
||||||
/* skip @cp's subtree if not a partition root */
|
/* skip @cp's subtree */
|
||||||
if (!is_partition_valid(cp))
|
pos_css = css_rightmost_descendant(pos_css);
|
||||||
|
continue;
|
||||||
|
|
||||||
|
v2:
|
||||||
|
/*
|
||||||
|
* Only valid partition roots that are not isolated and with
|
||||||
|
* non-empty effective_cpus will be saved into csa[].
|
||||||
|
*/
|
||||||
|
if ((cp->partition_root_state == PRS_ROOT) &&
|
||||||
|
!cpumask_empty(cp->effective_cpus))
|
||||||
|
csa[csn++] = cp;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip @cp's subtree if not a partition root and has no
|
||||||
|
* exclusive CPUs to be granted to child cpusets.
|
||||||
|
*/
|
||||||
|
if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
|
||||||
pos_css = css_rightmost_descendant(pos_css);
|
pos_css = css_rightmost_descendant(pos_css);
|
||||||
}
|
}
|
||||||
rcu_read_unlock();
|
rcu_read_unlock();
|
||||||
@ -1072,6 +1087,20 @@ restart:
|
|||||||
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
|
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
|
||||||
GFP_KERNEL);
|
GFP_KERNEL);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Cgroup v2 doesn't support domain attributes, just set all of them
|
||||||
|
* to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
|
||||||
|
* subset of HK_TYPE_DOMAIN housekeeping CPUs.
|
||||||
|
*/
|
||||||
|
if (cgrpv2) {
|
||||||
|
for (i = 0; i < ndoms; i++) {
|
||||||
|
cpumask_copy(doms[i], csa[i]->effective_cpus);
|
||||||
|
if (dattr)
|
||||||
|
dattr[i] = SD_ATTR_INIT;
|
||||||
|
}
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
|
||||||
for (nslot = 0, i = 0; i < csn; i++) {
|
for (nslot = 0, i = 0; i < csn; i++) {
|
||||||
struct cpuset *a = csa[i];
|
struct cpuset *a = csa[i];
|
||||||
struct cpumask *dp;
|
struct cpumask *dp;
|
||||||
@ -1231,7 +1260,7 @@ static void rebuild_sched_domains_locked(void)
|
|||||||
* root should be only a subset of the active CPUs. Since a CPU in any
|
* root should be only a subset of the active CPUs. Since a CPU in any
|
||||||
* partition root could be offlined, all must be checked.
|
* partition root could be offlined, all must be checked.
|
||||||
*/
|
*/
|
||||||
if (top_cpuset.nr_subparts) {
|
if (!cpumask_empty(subpartitions_cpus)) {
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
||||||
if (!is_partition_valid(cs)) {
|
if (!is_partition_valid(cs)) {
|
||||||
@ -4575,7 +4604,7 @@ static void cpuset_handle_hotplug(void)
|
|||||||
* In the rare case that hotplug removes all the cpus in
|
* In the rare case that hotplug removes all the cpus in
|
||||||
* subpartitions_cpus, we assumed that cpus are updated.
|
* subpartitions_cpus, we assumed that cpus are updated.
|
||||||
*/
|
*/
|
||||||
if (!cpus_updated && top_cpuset.nr_subparts)
|
if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
|
||||||
cpus_updated = true;
|
cpus_updated = true;
|
||||||
|
|
||||||
/* For v1, synchronize cpus_allowed to cpu_active_mask */
|
/* For v1, synchronize cpus_allowed to cpu_active_mask */
|
||||||
|
Loading…
Reference in New Issue
Block a user