From bdeb868c0ddf04c4777bf651834495baaf4f991b Mon Sep 17 00:00:00 2001
From: Yu Liao
Date: Mon, 23 Sep 2024 21:54:30 +0800
Subject: [PATCH 1/5] sched: Add dummy version of sched_group_set_idle()

Fix the following error when building with CONFIG_GROUP_SCHED_WEIGHT &&
!CONFIG_FAIR_GROUP_SCHED:

kernel/sched/core.c:9634:15: error: implicit declaration of function
'sched_group_set_idle'; did you mean 'scx_group_set_idle'?
[-Wimplicit-function-declaration]
 9634 |                 ret = sched_group_set_idle(css_tg(css), idle);
      |                       ^~~~~~~~~~~~~~~~~~~~
      |                       scx_group_set_idle

Fixes: e179e80c5d4f ("sched: Introduce CONFIG_GROUP_SCHED_WEIGHT")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220859.UiCAoFOW-lkp@intel.com/
Signed-off-by: Yu Liao
Signed-off-by: Tejun Heo
---
 kernel/sched/sched.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8063db62b027..91d14061fdca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -582,6 +582,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
 #endif /* CONFIG_SMP */
 #else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #else /* CONFIG_CGROUP_SCHED */

From 7ebd84d627e40cb9fb12b338588e81b6cca371e3 Mon Sep 17 00:00:00 2001
From: Yu Liao
Date: Mon, 23 Sep 2024 21:54:31 +0800
Subject: [PATCH 2/5] sched: Put task_group::idle under CONFIG_GROUP_SCHED_WEIGHT

When building with CONFIG_GROUP_SCHED_WEIGHT && !CONFIG_FAIR_GROUP_SCHED,
the idle member is not defined:

kernel/sched/ext.c:3701:16: error: 'struct task_group' has no member named 'idle'
 3701 |         if (!tg->idle)
      |                  ^~

Fix this by putting 'idle' under the new CONFIG_GROUP_SCHED_WEIGHT.

tj: Move the idle field upward to avoid breaking up the
    CONFIG_FAIR_GROUP_SCHED block.

Fixes: e179e80c5d4f ("sched: Introduce CONFIG_GROUP_SCHED_WEIGHT")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220859.UiCAoFOW-lkp@intel.com/
Signed-off-by: Yu Liao
Signed-off-by: Tejun Heo
---
 kernel/sched/sched.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91d14061fdca..b1c3588a8f00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -432,16 +432,17 @@ struct cfs_bandwidth {
 struct task_group {
 	struct cgroup_subsys_state css;
 
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+	/* A positive value indicates that this is a SCHED_IDLE group. */
+	int			idle;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each CPU */
 	struct sched_entity	**se;
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq		**cfs_rq;
 	unsigned long		shares;
-
-	/* A positive value indicates that this is a SCHED_IDLE group. */
-	int			idle;
-
 #ifdef CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put

From edf1c586e92675c4e0eb27758fcdb55a56838de1 Mon Sep 17 00:00:00 2001
From: Pat Somaru
Date: Fri, 20 Sep 2024 15:41:59 -0400
Subject: [PATCH 3/5] sched, sched_ext: Disable SM_IDLE/rq empty path when scx_enabled()

Disable the rq-empty fast path when scx is enabled: rq->nr_running alone
is not authoritative there, because SCX must consult the BPF scheduler
(via the dispatch path in balance) to determine whether the rq is empty.
This fixes stalls when scx is enabled.
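For reference, the guarded fast path then takes the following shape (a
condensed restatement of the hunk below, not additional code; only the
scx_enabled() check is new):

	if (sched_mode == SM_IDLE) {
		/*
		 * An empty rq->nr_running is not authoritative under SCX:
		 * the BPF scheduler may still have a task to dispatch.
		 */
		if (!rq->nr_running && !scx_enabled()) {
			next = prev;
			goto picked;
		}
	}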
Signed-off-by: Pat Somaru
Fixes: 3dcac251b066 ("sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()")
Signed-off-by: Tejun Heo
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b6cc1cf499d6..43e453ab7e20 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6591,7 +6591,8 @@ static void __sched notrace __schedule(int sched_mode)
 	 */
 	prev_state = READ_ONCE(prev->__state);
 	if (sched_mode == SM_IDLE) {
-		if (!rq->nr_running) {
+		/* SCX must consult the BPF scheduler to tell if rq is empty */
+		if (!rq->nr_running && !scx_enabled()) {
 			next = prev;
 			goto picked;
 		}

From 62d3726d4cd66f3e48dfe0f0401e0d74e58c2170 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 23 Sep 2024 06:45:22 -1000
Subject: [PATCH 4/5] sched_ext: Fix build when !CONFIG_STACKTRACE

a2f4b16e736d ("sched_ext: Build fix on !CONFIG_STACKTRACE[_SUPPORT]")
tried to fix the build when !CONFIG_STACKTRACE but didn't do so fully.
Also put stack_trace_print() and stack_trace_save() inside
CONFIG_STACKTRACE guards to fix the remaining !CONFIG_STACKTRACE build
failures.

Signed-off-by: Tejun Heo
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220642.fDW2OmWc-lkp@intel.com/
---
 kernel/sched/ext.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9ee5a9a261cc..7c320dcd72d5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4469,8 +4469,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		if (ei->msg[0] != '\0')
 			pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
-
+#ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
 			scx_ops.name, ei->reason);
@@ -4847,10 +4848,10 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
 		return;
 
 	ei->exit_code = exit_code;
-
+#ifdef CONFIG_STACKTRACE
 	if (kind >= SCX_EXIT_ERROR)
 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
-
+#endif
 	va_start(args, fmt);
 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
 	va_end(args);

From 431844b65f4c1b988ccd886f2ed29c138f7bb262 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Sat, 21 Sep 2024 21:39:21 +0200
Subject: [PATCH 5/5] sched_ext: Provide a sysfs enable_seq counter

As discussed during the distro-centric session within the sched_ext
Microconference at LPC 2024, introduce a sequence counter that is
incremented every time a BPF scheduler is loaded.

This feature can help distributions diagnose potential performance
regressions by identifying systems where users are running (or have run)
custom BPF schedulers.

Example:

 arighi@virtme-ng~> cat /sys/kernel/sched_ext/enable_seq
 0
 arighi@virtme-ng~> sudo scx_simple
 local=1 global=0
 ^CEXIT: unregistered from user space
 arighi@virtme-ng~> cat /sys/kernel/sched_ext/enable_seq
 1

In this way, user-space tools (such as Ubuntu's apport and similar) can
gather and include this information in bug reports.
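As an illustration of how such a tool might consume the counter (a
minimal userspace sketch, not part of this patch; only the sysfs path
comes from the change below, everything else is hypothetical):

	/* Print how many times a BPF scheduler has been loaded since boot. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/sched_ext/enable_seq", "r");
		long seq;

		if (!f)
			return 1; /* sched_ext absent or sysfs unavailable */
		if (fscanf(f, "%ld", &seq) == 1)
			printf("BPF schedulers loaded since boot: %ld\n", seq);
		fclose(f);
		return 0;
	}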
Cc: Giovanni Gherdovich
Cc: Kleber Sacilotto de Souza
Cc: Marcelo Henrique Cerri
Cc: Phil Auld
Signed-off-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 Documentation/scheduler/sched-ext.rst | 10 ++++++++++
 kernel/sched/ext.c                    | 17 +++++++++++++++++
 tools/sched_ext/scx_show_state.py     |  1 +
 3 files changed, 28 insertions(+)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index a707d2181a77..6c0d70e2e27d 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -83,6 +83,15 @@ The current status of the BPF scheduler can be determined as follows:
     # cat /sys/kernel/sched_ext/root/ops
     simple
 
+You can check if any BPF scheduler has ever been loaded since boot by
+examining this monotonically incrementing counter (a value of zero
+indicates that no BPF scheduler has been loaded):
+
+.. code-block:: none
+
+    # cat /sys/kernel/sched_ext/enable_seq
+    1
+
 ``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
 detailed information:
 
@@ -96,6 +105,7 @@ detailed information:
   enable_state  : enabled (2)
   bypass_depth  : 0
   nr_rejected   : 0
+  enable_seq    : 1
 
 If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
 be determined as follows:

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7c320dcd72d5..c09e3dc38c34 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -874,6 +874,13 @@ static struct scx_exit_info *scx_exit_info;
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
+/*
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
 /*
  * The maximum amount of time in jiffies that a task may be runnable without
  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
@@ -4154,11 +4161,19 @@ static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
 }
 SCX_ATTR(hotplug_seq);
 
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+					struct kobj_attribute *ka, char *buf)
+{
+	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
+}
+SCX_ATTR(enable_seq);
+
 static struct attribute *scx_global_attrs[] = {
 	&scx_attr_state.attr,
 	&scx_attr_switch_all.attr,
 	&scx_attr_nr_rejected.attr,
 	&scx_attr_hotplug_seq.attr,
+	&scx_attr_enable_seq.attr,
 	NULL,
 };
 
@@ -5177,6 +5192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	kobject_uevent(scx_root_kobj, KOBJ_ADD);
 	mutex_unlock(&scx_ops_enable_mutex);
 
+	atomic_long_inc(&scx_enable_seq);
+
 	return 0;
 
 err_del:

diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index d457d2a74e1e..8bc626ede1c4 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -37,3 +37,4 @@ print(f'switched_all : {read_static_key("__scx_switched_all")}')
 print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
 print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
 print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
+print(f'enable_seq : {read_atomic("scx_enable_seq")}')