diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst index 6acebd9e72c8..0f9f9a7b1f6c 100644 --- a/Documentation/admin-guide/cgroup-v1/pids.rst +++ b/Documentation/admin-guide/cgroup-v1/pids.rst @@ -36,7 +36,8 @@ superset of parent/child/pids.current. The pids.events file contains event counters: - - max: Number of times fork failed because limit was hit. + - max: Number of times fork failed in the cgroup because limit was hit in + self or ancestors. Example ------- diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8fbb0519d556..dfeb51c994e6 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -239,6 +239,10 @@ cgroup v2 currently supports the following mount options. will not be tracked by the memory controller (even if cgroup v2 is remounted later on). + pids_localevents + Represent fork failures inside cgroup's pids.events:max (v1 behavior), + not its limit being hit (v2 behavior). + Organizing Processes and Threads -------------------------------- @@ -2205,12 +2209,13 @@ PID Interface Files descendants has ever reached. pids.events - A read-only flat-keyed file which exists on non-root cgroups. The - following entries are defined. Unless specified otherwise, a value - change in this file generates a file modified event. + A read-only flat-keyed file which exists on non-root cgroups. Unless + specified otherwise, a value change in this file generates a file + modified event. The following entries are defined. max - Number of times fork failed because limit was hit. + The number of times the cgroup's number of processes hit the + limit (see also pids_localevents). Organisational operations are not blocked by cgroup policies, so it is possible to have pids.current > pids.max. This can be done by either diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ea48c861cd36..b36690ca0d3f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -119,7 +119,12 @@ enum { /* * Enable hugetlb accounting for the memory controller. */ - CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), + + /* + * Enable legacy local pids.events. + */ + CGRP_ROOT_PIDS_LOCAL_EVENTS = (1 << 20), }; /* cftype->flags */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e32b6972c478..9c9943ea5f89 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1922,6 +1922,7 @@ enum cgroup2_param { Opt_memory_localevents, Opt_memory_recursiveprot, Opt_memory_hugetlb_accounting, + Opt_pids_localevents, nr__cgroup2_params }; @@ -1931,6 +1932,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), + fsparam_flag("pids_localevents", Opt_pids_localevents), {} }; @@ -1960,6 +1962,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_hugetlb_accounting: ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; return 0; + case Opt_pids_localevents: + ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1989,6 +1994,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + + if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS; } } @@ -2004,6 +2014,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_recursiveprot"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) seq_puts(seq, ",memory_hugetlb_accounting"); + if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + seq_puts(seq, ",pids_localevents"); return 0; } @@ -7062,7 +7074,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n" - "memory_hugetlb_accounting\n"); + "memory_hugetlb_accounting\n" + "pids_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 0e5ec7d59b4d..a557f5c8300b 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -38,6 +38,14 @@ #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) #define PIDS_MAX_STR "max" +enum pidcg_event { + /* Fork failed in subtree because this pids_cgroup limit was hit. */ + PIDCG_MAX, + /* Fork failed in this pids_cgroup because ancestor limit was hit. */ + PIDCG_FORKFAIL, + NR_PIDCG_EVENTS, +}; + struct pids_cgroup { struct cgroup_subsys_state css; @@ -52,8 +60,7 @@ struct pids_cgroup { /* Handle for "pids.events" */ struct cgroup_file events_file; - /* Number of times fork failed because limit was hit. */ - atomic64_t events_limit; + atomic64_t events[NR_PIDCG_EVENTS]; }; static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) @@ -148,12 +155,13 @@ static void pids_charge(struct pids_cgroup *pids, int num) * pids_try_charge - hierarchically try to charge the pid count * @pids: the pid cgroup state * @num: the number of pids to charge + * @fail: storage of pid cgroup causing the fail * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge * succeeded, otherwise -EAGAIN. */ -static int pids_try_charge(struct pids_cgroup *pids, int num) +static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail) { struct pids_cgroup *p, *q; @@ -166,9 +174,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) * p->limit is %PIDS_MAX then we know that this test will never * fail. */ - if (new > limit) + if (new > limit) { + *fail = p; goto revert; - + } /* * Not technically accurate if we go over limit somewhere up * the hierarchy, but that's tolerable for the watermark. @@ -236,7 +245,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) static int pids_can_fork(struct task_struct *task, struct css_set *cset) { struct cgroup_subsys_state *css; - struct pids_cgroup *pids; + struct pids_cgroup *pids, *pids_over_limit; int err; if (cset) @@ -244,15 +253,23 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset) else css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - err = pids_try_charge(pids, 1); + err = pids_try_charge(pids, 1, &pids_over_limit); if (err) { - /* Only log the first time events_limit is incremented. */ - if (atomic64_inc_return(&pids->events_limit) == 1) { + /* compatibility on v1 where events were notified in leaves. */ + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys)) + pids_over_limit = pids; + + /* Only log the first time limit is hit. */ + if (atomic64_inc_return(&pids->events[PIDCG_FORKFAIL]) == 1) { pr_info("cgroup: fork rejected by pids controller in "); - pr_cont_cgroup_path(css->cgroup); + pr_cont_cgroup_path(pids->css.cgroup); pr_cont("\n"); } + atomic64_inc(&pids_over_limit->events[PIDCG_MAX]); + cgroup_file_notify(&pids->events_file); + if (pids_over_limit != pids) + cgroup_file_notify(&pids_over_limit->events_file); } return err; } @@ -340,8 +357,13 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css, static int pids_events_show(struct seq_file *sf, void *v) { struct pids_cgroup *pids = css_pids(seq_css(sf)); + enum pidcg_event pe = PIDCG_MAX; - seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit)); + if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) || + cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) + pe = PIDCG_FORKFAIL; + + seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events[pe])); return 0; }