From ab03125268679e058e1e7b6612f6d12610761769 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 15 Jul 2024 11:00:34 -0400
Subject: [PATCH] cgroup: Show # of subsystem CSSes in cgroup.stat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cgroup subsystem state (CSS) is an abstraction in the cgroup layer to
help manage different structures in various cgroup subsystems by being
an embedded element inside a larger structure like cpuset or mem_cgroup.

The /proc/cgroups file shows the number of cgroups for each of the
subsystems.  With cgroup v1, the number of CSSes is the same as the
number of cgroups.  That is not the case anymore with cgroup v2.  The
/proc/cgroups file cannot show the actual number of CSSes for the
subsystems that are bound to cgroup v2.  So if a v2 cgroup subsystem is
leaking cgroups (usually the memory cgroup), we can't tell by looking
at /proc/cgroups which cgroup subsystems may be responsible.

As cgroup v2 has deprecated the use of /proc/cgroups, the hierarchical
cgroup.stat file is now being extended to show the number of live and
dying CSSes associated with all the non-inhibited cgroup subsystems
that have been bound to cgroup v2.  The number includes CSSes in the
current cgroup as well as in all the descendants underneath it.  This
will help us pinpoint which subsystems are responsible for the
increasing number of dying (nr_dying_descendants) cgroups.

The CSS dying counts are stored in the cgroup structure itself instead
of inside the CSS, as suggested by Johannes.  This allows us to
accurately track the dying counts of cgroup subsystems that have
recently been disabled in a cgroup.  It is now possible that a zero
live subsystem count is paired with a non-zero dying subsystem count.
The cgroup-v2.rst file is updated to discuss this new behavior.

With this patch applied, a sample output from the root cgroup.stat
file is shown below.

	nr_descendants 56
	nr_subsys_cpuset 1
	nr_subsys_cpu 43
	nr_subsys_io 43
	nr_subsys_memory 56
	nr_subsys_perf_event 57
	nr_subsys_hugetlb 1
	nr_subsys_pids 56
	nr_subsys_rdma 1
	nr_subsys_misc 1
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Another sample output, from system.slice/cgroup.stat:

	nr_descendants 34
	nr_subsys_cpuset 0
	nr_subsys_cpu 32
	nr_subsys_io 32
	nr_subsys_memory 34
	nr_subsys_perf_event 35
	nr_subsys_hugetlb 0
	nr_subsys_pids 34
	nr_subsys_rdma 0
	nr_subsys_misc 0
	nr_dying_descendants 30
	nr_dying_subsys_cpuset 0
	nr_dying_subsys_cpu 0
	nr_dying_subsys_io 0
	nr_dying_subsys_memory 30
	nr_dying_subsys_perf_event 0
	nr_dying_subsys_hugetlb 0
	nr_dying_subsys_pids 0
	nr_dying_subsys_rdma 0
	nr_dying_subsys_misc 0

Note that the 'debug' controller wasn't used to provide this
information because that controller is not recommended in production
kernels, and many of them won't enable CONFIG_CGROUP_DEBUG by default.
Similar information could be retrieved with debuggers like drgn, but
those are also not always available (e.g. under lockdown), and the
additional cost of runtime tracking here is deemed marginal.

tj: Added Michal's paragraphs on why this is not added to the debug
    controller to the commit message.
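
For illustration only (not part of this patch): a minimal userspace
sketch in C that parses the flat-keyed cgroup.stat format shown above
and flags any subsystem with a non-zero dying-CSS count.  The default
path assumes cgroup2 is mounted at /sys/fs/cgroup; the program itself
is a hypothetical helper, not an interface defined by the kernel.

/* Hypothetical example: report subsystems with dying CSSes under a cgroup. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/sys/fs/cgroup/cgroup.stat";
	char key[64];
	long val;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* Each line of cgroup.stat is "<key> <value>". */
	while (fscanf(f, "%63s %ld", key, &val) == 2) {
		if (!strncmp(key, "nr_dying_subsys_", 16) && val)
			printf("%s: %ld dying CSSes at and below %s\n",
			       key + 16, val, path);
	}
	fclose(f);
	return 0;
}

Run against the root sample above, this would report only the memory
controller, with 30 dying CSSes.
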
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Johannes Weiner
Acked-by: Roman Gushchin
Reviewed-by: Kamalesh Babulal
Cc: Michal Koutný
Link: http://lkml.kernel.org/r/20240715150034.2583772-1-longman@redhat.com
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 12 +++++-
 include/linux/cgroup-defs.h             | 14 +++++++
 kernel/cgroup/cgroup.c                  | 55 ++++++++++++++++++++++++-
 3 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 86311c2907cd..70cefccd07ce 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -981,6 +981,14 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
+	  nr_subsys_<cgroup_subsys>
+		Total number of live cgroup subsystems (e.g. memory
+		cgroup) at and beneath the current cgroup.
+
+	  nr_dying_subsys_<cgroup_subsys>
+		Total number of dying cgroup subsystems (e.g. memory
+		cgroup) at and beneath the current cgroup.
+
   cgroup.freeze
 	A read-write single value file which exists on non-root cgroups.
 	Allowed values are "0" and "1". The default is "0".
@@ -2939,8 +2947,8 @@ Deprecated v1 Core Features
 
 - "cgroup.clone_children" is removed.
 
-- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" file
-  at the root instead.
+- /proc/cgroups is meaningless for v2. Use "cgroup.controllers" or
+  "cgroup.stat" files at the root instead.
 
 
 Issues with v1 and Rationales for v2
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ae04035b6cbe..eb0f6f349496 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -210,6 +210,14 @@ struct cgroup_subsys_state {
 	 * fields of the containing structure.
 	 */
 	struct cgroup_subsys_state *parent;
+
+	/*
+	 * Keep track of total numbers of visible descendant CSSes.
+	 * The total number of dying CSSes is tracked in
+	 * css->cgroup->nr_dying_subsys[ssid].
+	 * Protected by cgroup_mutex.
+	 */
+	int nr_descendants;
 };
 
 /*
@@ -470,6 +478,12 @@ struct cgroup {
 	/* Private pointers for each registered subsystem */
 	struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
 
+	/*
+	 * Keep track of total number of dying CSSes at and below this cgroup.
+	 * Protected by cgroup_mutex.
+	 */
+	int nr_dying_subsys[CGROUP_SUBSYS_COUNT];
+
 	struct cgroup_root *root;
 
 	/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c8e4b62b436a..601600afdd20 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3669,12 +3669,40 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
 static int cgroup_stat_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct cgroup_subsys_state *css;
+	int dying_cnt[CGROUP_SUBSYS_COUNT];
+	int ssid;
 
 	seq_printf(seq, "nr_descendants %d\n",
 		   cgroup->nr_descendants);
+
+	/*
+	 * Show the number of live and dying csses associated with each of
+	 * the non-inhibited cgroup subsystems that are bound to cgroup v2.
+	 *
+	 * Without proper lock protection, racing is possible. So the
+	 * numbers may not be consistent when that happens.
+	 */
+	rcu_read_lock();
+	for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+		dying_cnt[ssid] = -1;
+		if ((BIT(ssid) & cgrp_dfl_inhibit_ss_mask) ||
+		    (cgroup_subsys[ssid]->root != &cgrp_dfl_root))
+			continue;
+		css = rcu_dereference_raw(cgroup->subsys[ssid]);
+		dying_cnt[ssid] = cgroup->nr_dying_subsys[ssid];
+		seq_printf(seq, "nr_subsys_%s %d\n", cgroup_subsys[ssid]->name,
+			   css ? (css->nr_descendants + 1) : 0);
+	}
+
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
-
+	for (ssid = 0; ssid < CGROUP_SUBSYS_COUNT; ssid++) {
+		if (dying_cnt[ssid] >= 0)
+			seq_printf(seq, "nr_dying_subsys_%s %d\n",
+				   cgroup_subsys[ssid]->name, dying_cnt[ssid]);
+	}
+	rcu_read_unlock();
 	return 0;
 }
 
@@ -5424,6 +5452,8 @@ static void css_release_work_fn(struct work_struct *work)
 	list_del_rcu(&css->sibling);
 
 	if (ss) {
+		struct cgroup *parent_cgrp;
+
 		/* css release path */
 		if (!list_empty(&css->rstat_css_node)) {
 			cgroup_rstat_flush(cgrp);
@@ -5433,6 +5463,14 @@ static void css_release_work_fn(struct work_struct *work)
 		cgroup_idr_replace(&ss->css_idr, NULL, css->id);
 		if (ss->css_released)
 			ss->css_released(css);
+
+		cgrp->nr_dying_subsys[ss->id]--;
+		WARN_ON_ONCE(css->nr_descendants || cgrp->nr_dying_subsys[ss->id]);
+		parent_cgrp = cgroup_parent(cgrp);
+		while (parent_cgrp) {
+			parent_cgrp->nr_dying_subsys[ss->id]--;
+			parent_cgrp = cgroup_parent(parent_cgrp);
+		}
 	} else {
 		struct cgroup *tcgrp;
 
@@ -5517,8 +5555,11 @@ static int online_css(struct cgroup_subsys_state *css)
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
 
 		atomic_inc(&css->online_cnt);
-		if (css->parent)
+		if (css->parent) {
 			atomic_inc(&css->parent->online_cnt);
+			while ((css = css->parent))
+				css->nr_descendants++;
+		}
 	}
 	return ret;
 }
@@ -5540,6 +5581,16 @@ static void offline_css(struct cgroup_subsys_state *css)
 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
 
 	wake_up_all(&css->cgroup->offline_waitq);
+
+	css->cgroup->nr_dying_subsys[ss->id]++;
+	/*
+	 * Parent css and cgroup cannot be freed until after the freeing
+	 * of child css, see css_free_rwork_fn().
+	 */
+	while ((css = css->parent)) {
+		css->nr_descendants--;
+		css->cgroup->nr_dying_subsys[ss->id]++;
+	}
 }
 
 /**
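
To complement the diff, here is a minimal, self-contained userspace
model (hypothetical toy_* names, not kernel code) of how the counters
above move for one subsystem as a child css goes online, offline
(dying) and is finally released.  It only sketches the bookkeeping in
online_css(), offline_css() and css_release_work_fn(); the real code
also handles RCU, locking and per-subsystem arrays.

/* Illustrative model only: a two-level hierarchy and one subsystem. */
#include <assert.h>
#include <stddef.h>

struct toy_cgroup {
	struct toy_cgroup *parent;
	int css_online;		/* 1 while this cgroup's own css is live */
	int nr_descendants;	/* models css->nr_descendants */
	int nr_dying;		/* models cgrp->nr_dying_subsys[ssid] */
};

/* online_css(): every ancestor gains one live descendant css. */
static void toy_online(struct toy_cgroup *cg)
{
	cg->css_online = 1;
	for (struct toy_cgroup *p = cg->parent; p; p = p->parent)
		p->nr_descendants++;
}

/* offline_css(): the css becomes dying; ancestors lose a live
 * descendant and gain a dying one. */
static void toy_offline(struct toy_cgroup *cg)
{
	cg->css_online = 0;
	cg->nr_dying++;
	for (struct toy_cgroup *p = cg->parent; p; p = p->parent) {
		p->nr_descendants--;
		p->nr_dying++;
	}
}

/* css_release_work_fn(): the dying css is finally gone everywhere. */
static void toy_release(struct toy_cgroup *cg)
{
	cg->nr_dying--;
	for (struct toy_cgroup *p = cg->parent; p; p = p->parent)
		p->nr_dying--;
}

int main(void)
{
	struct toy_cgroup root = { 0 };
	struct toy_cgroup child = { .parent = &root };

	toy_online(&root);
	toy_online(&child);
	/* root's nr_subsys_* would read 2 (itself plus one descendant). */
	assert(root.nr_descendants + root.css_online == 2 && root.nr_dying == 0);

	toy_offline(&child);
	/* child shows 0 live but 1 dying css, analogous to the "zero
	 * subsystem number coupled with a non-zero dying subsystem
	 * number" case described in the commit message. */
	assert(child.css_online == 0 && child.nr_dying == 1);
	assert(root.nr_descendants + root.css_online == 1 && root.nr_dying == 1);

	toy_release(&child);
	assert(root.nr_dying == 0 && child.nr_dying == 0);
	return 0;
}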