06e51be3d5
Currently, a dsq is always a FIFO. A task which is dispatched earlier gets consumed or executed earlier. While this is sufficient when dsq's are used for simple staging areas for tasks which are ready to execute, it'd make dsq's a lot more useful if they can implement custom ordering. This patch adds a vtime-ordered priority queue to dsq's. When the BPF scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it can specify the vtime tha the task should be inserted at and the task is inserted into the priority queue in the dsq which is ordered according to time_before64() comparison of the vtime values. A DSQ can either be a FIFO or priority queue and automatically switches between the two depending on whether scx_bpf_dispatch() or scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ already has the other type queued is not allowed and triggers an ops error. Built-in DSQs must always be FIFOs. This makes it very easy for the BPF schedulers to implement proper vtime based scheduling within each dsq very easy and efficient at a negligible cost in terms of code complexity and overhead. scx_simple and scx_example_flatcg are updated to default to weighted vtime scheduling (the latter within each cgroup). FIFO scheduling can be selected with -f option. v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes led to unexpected starvations, DSQs now error out if both modes are used at the same time and the built-in DSQs are no longer allowed to be priority queues. - Explicit type struct scx_dsq_node added to contain fields needed to be linked on DSQs. This will be used to implement stateful iterator. - Tasks are now always linked on dsq->list whether the DSQ is in FIFO or PRIQ mode. This confines PRIQ related complexities to the enqueue and dequeue paths. Other paths only need to look at dsq->list. This will also ease implementing BPF iterator. - Print p->scx.dsq_flags in debug dump. v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own p->scx.dsq_flags. The flag is protected with the dsq lock unlike other flags in p->scx.flags. This led to flag corruption in some cases. - Add comments explaining the interaction between using consumption of p->scx.slice to determine vtime progress and yielding. v2: - p->scx.dsq_vtime was not initialized on load or across cgroup migrations leading to some tasks being stalled for extended period of time depending on how saturated the machine is. Fixed. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: David Vernet <dvernet@meta.com>
108 lines
2.3 KiB
C
108 lines
2.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
|
|
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
|
|
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
|
|
*/
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <signal.h>
|
|
#include <libgen.h>
|
|
#include <bpf/bpf.h>
|
|
#include <scx/common.h>
|
|
#include "scx_simple.bpf.skel.h"
|
|
|
|
const char help_fmt[] =
|
|
"A simple sched_ext scheduler.\n"
|
|
"\n"
|
|
"See the top-level comment in .bpf.c for more details.\n"
|
|
"\n"
|
|
"Usage: %s [-f] [-v]\n"
|
|
"\n"
|
|
" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
|
|
" -v Print libbpf debug messages\n"
|
|
" -h Display this help and exit\n";
|
|
|
|
static bool verbose;
|
|
static volatile int exit_req;
|
|
|
|
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
|
{
|
|
if (level == LIBBPF_DEBUG && !verbose)
|
|
return 0;
|
|
return vfprintf(stderr, format, args);
|
|
}
|
|
|
|
static void sigint_handler(int simple)
|
|
{
|
|
exit_req = 1;
|
|
}
|
|
|
|
static void read_stats(struct scx_simple *skel, __u64 *stats)
|
|
{
|
|
int nr_cpus = libbpf_num_possible_cpus();
|
|
__u64 cnts[2][nr_cpus];
|
|
__u32 idx;
|
|
|
|
memset(stats, 0, sizeof(stats[0]) * 2);
|
|
|
|
for (idx = 0; idx < 2; idx++) {
|
|
int ret, cpu;
|
|
|
|
ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
|
|
&idx, cnts[idx]);
|
|
if (ret < 0)
|
|
continue;
|
|
for (cpu = 0; cpu < nr_cpus; cpu++)
|
|
stats[idx] += cnts[idx][cpu];
|
|
}
|
|
}
|
|
|
|
int main(int argc, char **argv)
|
|
{
|
|
struct scx_simple *skel;
|
|
struct bpf_link *link;
|
|
__u32 opt;
|
|
__u64 ecode;
|
|
|
|
libbpf_set_print(libbpf_print_fn);
|
|
signal(SIGINT, sigint_handler);
|
|
signal(SIGTERM, sigint_handler);
|
|
restart:
|
|
skel = SCX_OPS_OPEN(simple_ops, scx_simple);
|
|
|
|
while ((opt = getopt(argc, argv, "fvh")) != -1) {
|
|
switch (opt) {
|
|
case 'f':
|
|
skel->rodata->fifo_sched = true;
|
|
break;
|
|
case 'v':
|
|
verbose = true;
|
|
break;
|
|
default:
|
|
fprintf(stderr, help_fmt, basename(argv[0]));
|
|
return opt != 'h';
|
|
}
|
|
}
|
|
|
|
SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
|
|
link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);
|
|
|
|
while (!exit_req && !UEI_EXITED(skel, uei)) {
|
|
__u64 stats[2];
|
|
|
|
read_stats(skel, stats);
|
|
printf("local=%llu global=%llu\n", stats[0], stats[1]);
|
|
fflush(stdout);
|
|
sleep(1);
|
|
}
|
|
|
|
bpf_link__destroy(link);
|
|
ecode = UEI_REPORT(skel, uei);
|
|
scx_simple__destroy(skel);
|
|
|
|
if (UEI_ECODE_RESTART(ecode))
|
|
goto restart;
|
|
return 0;
|
|
}
|