From bdeb868c0ddf04c4777bf651834495baaf4f991b Mon Sep 17 00:00:00 2001
From: Yu Liao
Date: Mon, 23 Sep 2024 21:54:30 +0800
Subject: [PATCH 1/5] sched: Add dummy version of sched_group_set_idle()

Fix the following error when building with CONFIG_GROUP_SCHED_WEIGHT &&
!CONFIG_FAIR_GROUP_SCHED:

kernel/sched/core.c:9634:15: error: implicit declaration of function
'sched_group_set_idle'; did you mean 'scx_group_set_idle'?
[-Wimplicit-function-declaration]
 9634 |                 ret = sched_group_set_idle(css_tg(css), idle);
      |                       ^~~~~~~~~~~~~~~~~~~~
      |                       scx_group_set_idle

Fixes: e179e80c5d4f ("sched: Introduce CONFIG_GROUP_SCHED_WEIGHT")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220859.UiCAoFOW-lkp@intel.com/
Signed-off-by: Yu Liao
Signed-off-by: Tejun Heo
---
 kernel/sched/sched.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8063db62b027..91d14061fdca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -582,6 +582,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
 #endif /* CONFIG_SMP */
 #else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #else /* CONFIG_CGROUP_SCHED */

From 7ebd84d627e40cb9fb12b338588e81b6cca371e3 Mon Sep 17 00:00:00 2001
From: Yu Liao
Date: Mon, 23 Sep 2024 21:54:31 +0800
Subject: [PATCH 2/5] sched: Put task_group::idle under CONFIG_GROUP_SCHED_WEIGHT

When building with CONFIG_GROUP_SCHED_WEIGHT && !CONFIG_FAIR_GROUP_SCHED,
the idle member is not defined:

kernel/sched/ext.c:3701:16: error: 'struct task_group' has no member named 'idle'
 3701 |         if (!tg->idle)
      |                  ^~

Fix this by putting 'idle' under the new CONFIG_GROUP_SCHED_WEIGHT.

tj: Move the idle field upward to avoid breaking up the
    CONFIG_FAIR_GROUP_SCHED block.

Fixes: e179e80c5d4f ("sched: Introduce CONFIG_GROUP_SCHED_WEIGHT")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220859.UiCAoFOW-lkp@intel.com/
Signed-off-by: Yu Liao
Signed-off-by: Tejun Heo
---
 kernel/sched/sched.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 91d14061fdca..b1c3588a8f00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -432,16 +432,17 @@ struct cfs_bandwidth {
 struct task_group {
 	struct cgroup_subsys_state css;
 
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+	/* A positive value indicates that this is a SCHED_IDLE group. */
+	int			idle;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each CPU */
 	struct sched_entity	**se;
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq		**cfs_rq;
 	unsigned long		shares;
-
-	/* A positive value indicates that this is a SCHED_IDLE group. */
-	int			idle;
-
 #ifdef CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put

From edf1c586e92675c4e0eb27758fcdb55a56838de1 Mon Sep 17 00:00:00 2001
From: Pat Somaru
Date: Fri, 20 Sep 2024 15:41:59 -0400
Subject: [PATCH 3/5] sched, sched_ext: Disable SM_IDLE/rq empty path when scx_enabled()

Disable the rq-empty fast path when scx is enabled: rq->nr_running alone
is not authoritative there, because SCX must consult the BPF scheduler
(via the dispatch path in balance) to determine whether the rq is empty.
This fixes stalls when scx is enabled.
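For reference, the guarded fast path then takes the following shape (a
condensed restatement of the hunk below, not additional code; only the
scx_enabled() check is new):

	if (sched_mode == SM_IDLE) {
		/*
		 * An empty rq->nr_running is not authoritative under SCX:
		 * the BPF scheduler may still have a task to dispatch.
		 */
		if (!rq->nr_running && !scx_enabled()) {
			next = prev;
			goto picked;
		}
	}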
Signed-off-by: Pat Somaru
Fixes: 3dcac251b066 ("sched/core: Introduce SM_IDLE and an idle re-entry fast-path in __schedule()")
Signed-off-by: Tejun Heo
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b6cc1cf499d6..43e453ab7e20 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6591,7 +6591,8 @@ static void __sched notrace __schedule(int sched_mode)
 	 */
 	prev_state = READ_ONCE(prev->__state);
 	if (sched_mode == SM_IDLE) {
-		if (!rq->nr_running) {
+		/* SCX must consult the BPF scheduler to tell if rq is empty */
+		if (!rq->nr_running && !scx_enabled()) {
 			next = prev;
 			goto picked;
 		}

From 62d3726d4cd66f3e48dfe0f0401e0d74e58c2170 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 23 Sep 2024 06:45:22 -1000
Subject: [PATCH 4/5] sched_ext: Fix build when !CONFIG_STACKTRACE

a2f4b16e736d ("sched_ext: Build fix on !CONFIG_STACKTRACE[_SUPPORT]")
tried to fix the build when !CONFIG_STACKTRACE but didn't do so fully.
Also put stack_trace_print() and stack_trace_save() inside
CONFIG_STACKTRACE guards to fix the remaining !CONFIG_STACKTRACE build
failures.

Signed-off-by: Tejun Heo
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409220642.fDW2OmWc-lkp@intel.com/
---
 kernel/sched/ext.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9ee5a9a261cc..7c320dcd72d5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4469,8 +4469,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		if (ei->msg[0] != '\0')
 			pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
-
+#ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
 			scx_ops.name, ei->reason);
@@ -4847,10 +4848,10 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
 		return;
 
 	ei->exit_code = exit_code;
-
+#ifdef CONFIG_STACKTRACE
 	if (kind >= SCX_EXIT_ERROR)
 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
-
+#endif
 	va_start(args, fmt);
 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
 	va_end(args);

From 431844b65f4c1b988ccd886f2ed29c138f7bb262 Mon Sep 17 00:00:00 2001
From: Andrea Righi
Date: Sat, 21 Sep 2024 21:39:21 +0200
Subject: [PATCH 5/5] sched_ext: Provide a sysfs enable_seq counter

As discussed during the distro-centric session within the sched_ext
Microconference at LPC 2024, introduce a sequence counter that is
incremented every time a BPF scheduler is loaded.

This feature can help distributions diagnose potential performance
regressions by identifying systems where users are running (or have run)
custom BPF schedulers.

Example:

 arighi@virtme-ng~> cat /sys/kernel/sched_ext/enable_seq
 0
 arighi@virtme-ng~> sudo scx_simple
 local=1 global=0
 ^CEXIT: unregistered from user space
 arighi@virtme-ng~> cat /sys/kernel/sched_ext/enable_seq
 1

In this way, user-space tools (such as Ubuntu's apport and similar) can
gather and include this information in bug reports.
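As an illustration of how such a tool might consume the counter (a
minimal userspace sketch, not part of this patch; only the sysfs path
comes from the change below, everything else is hypothetical):

	/* Print how many times a BPF scheduler has been loaded since boot. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/sched_ext/enable_seq", "r");
		long seq;

		if (!f)
			return 1; /* sched_ext absent or sysfs unavailable */
		if (fscanf(f, "%ld", &seq) == 1)
			printf("BPF schedulers loaded since boot: %ld\n", seq);
		fclose(f);
		return 0;
	}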
Cc: Giovanni Gherdovich
Cc: Kleber Sacilotto de Souza
Cc: Marcelo Henrique Cerri
Cc: Phil Auld
Signed-off-by: Andrea Righi
Signed-off-by: Tejun Heo
---
 Documentation/scheduler/sched-ext.rst | 10 ++++++++++
 kernel/sched/ext.c                    | 17 +++++++++++++++++
 tools/sched_ext/scx_show_state.py     |  1 +
 3 files changed, 28 insertions(+)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index a707d2181a77..6c0d70e2e27d 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -83,6 +83,15 @@ The current status of the BPF scheduler can be determined as follows:
     # cat /sys/kernel/sched_ext/root/ops
     simple
 
+You can check if any BPF scheduler has ever been loaded since boot by
+examining this monotonically incrementing counter (a value of zero
+indicates that no BPF scheduler has been loaded):
+
+.. code-block:: none
+
+    # cat /sys/kernel/sched_ext/enable_seq
+    1
+
 ``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
 detailed information:
 
@@ -96,6 +105,7 @@ detailed information:
   enable_state  : enabled (2)
   bypass_depth  : 0
   nr_rejected   : 0
+  enable_seq    : 1
 
 If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
 be determined as follows:

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7c320dcd72d5..c09e3dc38c34 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -874,6 +874,13 @@ static struct scx_exit_info *scx_exit_info;
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
+/*
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
 /*
  * The maximum amount of time in jiffies that a task may be runnable without
  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
@@ -4154,11 +4161,19 @@ static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
 }
 SCX_ATTR(hotplug_seq);
 
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+					struct kobj_attribute *ka, char *buf)
+{
+	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
+}
+SCX_ATTR(enable_seq);
+
 static struct attribute *scx_global_attrs[] = {
 	&scx_attr_state.attr,
 	&scx_attr_switch_all.attr,
 	&scx_attr_nr_rejected.attr,
 	&scx_attr_hotplug_seq.attr,
+	&scx_attr_enable_seq.attr,
 	NULL,
 };
 
@@ -5177,6 +5192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	kobject_uevent(scx_root_kobj, KOBJ_ADD);
 	mutex_unlock(&scx_ops_enable_mutex);
 
+	atomic_long_inc(&scx_enable_seq);
+
 	return 0;
 
 err_del:

diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index d457d2a74e1e..8bc626ede1c4 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -37,3 +37,4 @@ print(f'switched_all : {read_static_key("__scx_switched_all")}')
 print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
 print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
 print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
+print(f'enable_seq : {read_atomic("scx_enable_seq")}')