diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index a707d2181a77..6c0d70e2e27d 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -83,6 +83,15 @@ The current status of the BPF scheduler can be determined as follows:
     # cat /sys/kernel/sched_ext/root/ops
     simple
 
+You can check if any BPF scheduler has ever been loaded since boot by examining
+this monotonically incrementing counter (a value of zero indicates that no BPF
+scheduler has been loaded):
+
+.. code-block:: none
+
+    # cat /sys/kernel/sched_ext/enable_seq
+    1
+
 ``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
 detailed information:
 
@@ -96,6 +105,7 @@ detailed information:
     enable_state  : enabled (2)
     bypass_depth  : 0
     nr_rejected   : 0
+    enable_seq    : 1
 
 If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can
 be determined as follows:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b6cc1cf499d6..43e453ab7e20 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6591,7 +6591,8 @@ static void __sched notrace __schedule(int sched_mode)
 	 */
 	prev_state = READ_ONCE(prev->__state);
 	if (sched_mode == SM_IDLE) {
-		if (!rq->nr_running) {
+		/* SCX must consult the BPF scheduler to tell if rq is empty */
+		if (!rq->nr_running && !scx_enabled()) {
 			next = prev;
 			goto picked;
 		}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9ee5a9a261cc..c09e3dc38c34 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -874,6 +874,13 @@ static struct scx_exit_info *scx_exit_info;
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
+/*
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
+ * scheduler has ever been used in the system.
+ */
+static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
+
 /*
  * The maximum amount of time in jiffies that a task may be runnable without
  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
@@ -4154,11 +4161,19 @@ static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj,
 }
 SCX_ATTR(hotplug_seq);
 
+static ssize_t scx_attr_enable_seq_show(struct kobject *kobj,
+					struct kobj_attribute *ka, char *buf)
+{
+	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_enable_seq));
+}
+SCX_ATTR(enable_seq);
+
 static struct attribute *scx_global_attrs[] = {
 	&scx_attr_state.attr,
 	&scx_attr_switch_all.attr,
 	&scx_attr_nr_rejected.attr,
 	&scx_attr_hotplug_seq.attr,
+	&scx_attr_enable_seq.attr,
 	NULL,
 };
 
@@ -4469,8 +4484,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		if (ei->msg[0] != '\0')
 			pr_err("sched_ext: %s: %s\n", scx_ops.name, ei->msg);
-
+#ifdef CONFIG_STACKTRACE
 		stack_trace_print(ei->bt, ei->bt_len, 2);
+#endif
 	} else {
 		pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
 			scx_ops.name, ei->reason);
@@ -4847,10 +4863,10 @@ static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
 		return;
 
 	ei->exit_code = exit_code;
-
+#ifdef CONFIG_STACKTRACE
 	if (kind >= SCX_EXIT_ERROR)
 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
-
+#endif
 	va_start(args, fmt);
 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
 	va_end(args);
@@ -5176,6 +5192,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	kobject_uevent(scx_root_kobj, KOBJ_ADD);
 	mutex_unlock(&scx_ops_enable_mutex);
 
+	atomic_long_inc(&scx_enable_seq);
+
 	return 0;
 
 err_del:
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8063db62b027..b1c3588a8f00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -432,16 +432,17 @@ struct cfs_bandwidth {
 struct task_group {
 	struct cgroup_subsys_state css;
 
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+	/* A positive value indicates that this is a SCHED_IDLE group. */
+	int			idle;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each CPU */
 	struct sched_entity	**se;
 	/* runqueue "owned" by this group on each CPU */
 	struct cfs_rq		**cfs_rq;
 	unsigned long		shares;
-
-	/* A positive value indicates that this is a SCHED_IDLE group. */
-	int			idle;
-
 #ifdef CONFIG_SMP
 	/*
 	 * load_avg can be heavily contended at clock tick time, so put
@@ -582,6 +583,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
 #endif /* CONFIG_SMP */
 #else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; }
+static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #else /* CONFIG_CGROUP_SCHED */
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index d457d2a74e1e..8bc626ede1c4 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -37,3 +37,4 @@ print(f'switched_all : {read_static_key("__scx_switched_all")}')
 print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
 print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
 print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
+print(f'enable_seq : {read_atomic("scx_enable_seq")}')
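
A minimal usage sketch (not part of the patch) of how userspace might consume the
new enable_seq attribute. It assumes a kernel with this change applied; the sysfs
path and the zero-means-never-loaded semantics come from the documentation hunk
above, everything else here is illustrative:

    #!/usr/bin/env python3
    # Sketch: report whether any sched_ext BPF scheduler has been loaded
    # since boot by reading the enable_seq counter added by this patch.
    from pathlib import Path

    ENABLE_SEQ = Path("/sys/kernel/sched_ext/enable_seq")

    def scx_ever_enabled() -> bool:
        try:
            # enable_seq is a monotonically increasing counter; zero means
            # no BPF scheduler has been loaded since boot.
            return int(ENABLE_SEQ.read_text()) > 0
        except FileNotFoundError:
            # Kernel without sched_ext support or without this patch.
            return False

    if __name__ == "__main__":
        print("sched_ext scheduler loaded since boot:", scx_ever_enabled())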