From 160216568cddc9d6e7f36133ba41d25459d90de4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 27 Sep 2024 10:02:40 -1000 Subject: [PATCH] sched_ext: Decouple locks in scx_ops_disable_workfn() The disable path uses three big locks - scx_fork_rwsem, scx_cgroup_rwsem and cpus_read_lock. Currently, the locks are grabbed together which is prone to locking order problems. With the preceding scx_cgroup_enabled change, we can decouple them: - As cgroup disabling no longer requires modifying a static_key which requires cpus_read_lock(), no need to grab cpus_read_lock() before grabbing scx_cgroup_rwsem. - cgroup can now be independently disabled before tasks are moved back to the fair class. Relocate scx_cgroup_exit() invocation before scx_fork_rwsem is grabbed, drop now unnecessary cpus_read_lock() and move static_key operations out of scx_fork_rwsem. This decouples all three locks in the disable path. Signed-off-by: Tejun Heo Reported-and-tested-by: Aboorva Devarajan Link: http://lkml.kernel.org/r/8cd0ec0c4c7c1bc0119e61fbef0bee9d5e24022d.camel@linux.ibm.com --- kernel/sched/ext.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f9675ced75e8..7d0ce7e6ea1f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4456,21 +4456,23 @@ static void scx_ops_disable_workfn(struct kthread_work *work) WRITE_ONCE(scx_switching_all, false); /* - * Avoid racing against fork and cgroup changes. See scx_ops_enable() - * for explanation on the locking order. + * Shut down cgroup support before tasks so that the cgroup attach path + * doesn't race against scx_ops_exit_task(). + */ + scx_cgroup_lock(); + scx_cgroup_exit(); + scx_cgroup_unlock(); + + /* + * The BPF scheduler is going away. All tasks including %TASK_DEAD ones + * must be switched out and exited synchronously. */ percpu_down_write(&scx_fork_rwsem); - cpus_read_lock(); - scx_cgroup_lock(); scx_ops_init_task_enabled = false; spin_lock_irq(&scx_tasks_lock); scx_task_iter_init(&sti); - /* - * The BPF scheduler is going away. All tasks including %TASK_DEAD ones - * must be switched out and exited synchronously. - */ while ((p = scx_task_iter_next_locked(&sti))) { const struct sched_class *old_class = p->sched_class; struct sched_enq_and_set_ctx ctx; @@ -4488,23 +4490,18 @@ static void scx_ops_disable_workfn(struct kthread_work *work) } scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + percpu_up_write(&scx_fork_rwsem); /* no task is on scx, turn off all the switches and flush in-progress calls */ - static_branch_disable_cpuslocked(&__scx_ops_enabled); + static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) - static_branch_disable_cpuslocked(&scx_has_op[i]); - static_branch_disable_cpuslocked(&scx_ops_enq_last); - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); + static_branch_disable(&scx_has_op[i]); + static_branch_disable(&scx_ops_enq_last); + static_branch_disable(&scx_ops_enq_exiting); + static_branch_disable(&scx_ops_cpu_preempt); + static_branch_disable(&scx_builtin_idle_enabled); synchronize_rcu(); - scx_cgroup_exit(); - - scx_cgroup_unlock(); - cpus_read_unlock(); - percpu_up_write(&scx_fork_rwsem); - if (ei->kind >= SCX_EXIT_ERROR) { pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", scx_ops.name, ei->reason);