
Commit 2605625

Merge pull request #1605 from sched-ext/bpfland-numa-load-balancing
scx_bpfland: NUMA load balancer
2 parents 36e7d5a + b1adee5 commit 2605625

File tree

3 files changed: +234 -31 lines changed

scheds/rust/scx_bpfland/src/bpf/intf.h

Lines changed: 3 additions & 0 deletions
@@ -19,6 +19,9 @@ enum consts {
 	NSEC_PER_USEC = 1000ULL,
 	NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
 	NSEC_PER_SEC = (1000ULL * NSEC_PER_MSEC),
+
+	/* Kernel definitions */
+	CLOCK_BOOTTIME = 7,
 };
 
 #ifndef __VMLINUX_H__
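CLOCK_BOOTTIME = 7 mirrors the kernel's UAPI clockid value (include/uapi/linux/time.h); defining it here lets the BPF code pass it to bpf_timer_init() without relying on vmlinux.h exposing the clockid constants. Below is a minimal sketch of that timer pattern, assuming the scheduler's usual vmlinux.h and bpf_helpers includes; the my_timer/arm_my_timer names are hypothetical and only illustrate how the constant is consumed (the actual usage is in the main.bpf.c hunks that follow).

/* Illustrative sketch only: a one-entry array map holding a bpf_timer,
 * armed on CLOCK_BOOTTIME. Same pattern as the NUMA timer added to
 * main.bpf.c below; names here are hypothetical. */
struct my_timer {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, struct my_timer);
} my_timer_map SEC(".maps");

static int my_timerfn(void *map, int *key, struct bpf_timer *timer)
{
	/* ...periodic work..., then re-arm for the next second */
	bpf_timer_start(timer, NSEC_PER_SEC, 0);
	return 0;
}

static int arm_my_timer(void)
{
	u32 key = 0;
	struct bpf_timer *timer = bpf_map_lookup_elem(&my_timer_map, &key);

	if (!timer)
		return -ESRCH;

	bpf_timer_init(timer, &my_timer_map, CLOCK_BOOTTIME);
	bpf_timer_set_callback(timer, my_timerfn);
	return bpf_timer_start(timer, NSEC_PER_SEC, 0);
}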

scheds/rust/scx_bpfland/src/bpf/main.bpf.c

Lines changed: 226 additions & 31 deletions
@@ -106,18 +106,84 @@ private(BPFLAND) struct bpf_cpumask __kptr *primary_cpumask;
  */
 const volatile bool smt_enabled = true;
 
+/*
+ * Disable NUMA rebalancing.
+ */
+const volatile bool numa_disabled = false;
+
 /*
  * Current global vruntime.
  */
 static u64 vtime_now;
 
+/*
+ * Timer used to update NUMA statistics.
+ */
+struct numa_timer {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct numa_timer);
+} numa_timer SEC(".maps");
+
+/*
+ * Per-node context.
+ */
+struct node_ctx {
+	u64 tot_perf_lvl;
+	u64 nr_cpus;
+	u64 perf_lvl;
+	bool need_rebalance;
+};
+
+/* CONFIG_NODES_SHIFT should be always <= 10 */
+#define MAX_NUMA_NODES 1024
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct node_ctx);
+	__uint(max_entries, MAX_NUMA_NODES);
+	__uint(map_flags, 0);
+} node_ctx_stor SEC(".maps");
+
+/*
+ * Return a node context.
+ */
+struct node_ctx *try_lookup_node_ctx(int node)
+{
+	return bpf_map_lookup_elem(&node_ctx_stor, &node);
+}
+
+/*
+ * Return true if @node needs a rebalance, false otherwise.
+ */
+static bool node_rebalance(int node)
+{
+	const struct node_ctx *nctx;
+
+	if (numa_disabled)
+		return false;
+
+	nctx = try_lookup_node_ctx(node);
+	if (!nctx)
+		return false;
+
+	return nctx->need_rebalance;
+}
+
 /*
  * Per-CPU context.
  */
 struct cpu_ctx {
 	u64 tot_runtime;
 	u64 prev_runtime;
 	u64 last_running;
+	u64 perf_lvl;
 	struct bpf_cpumask __kptr *smt_cpumask;
 	struct bpf_cpumask __kptr *l2_cpumask;
 	struct bpf_cpumask __kptr *l3_cpumask;
@@ -519,12 +585,18 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	}
 
 	/*
-	 * Search for any full-idle CPU in the same node and
-	 * primary domain.
+	 * Search for any full-idle CPU in the primary domain.
+	 *
+	 * If the current node needs a rebalance, look for any
+	 * full-idle CPU also on different nodes.
 	 */
 	if (p_mask) {
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node,
-					SCX_PICK_IDLE_CORE | __COMPAT_SCX_PICK_IDLE_IN_NODE);
+		u64 flags = SCX_PICK_IDLE_CORE;
+
+		if (!node_rebalance(node))
+			flags |= __COMPAT_SCX_PICK_IDLE_IN_NODE;
+
+		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node, flags);
 		if (cpu >= 0) {
 			*is_idle = true;
 			goto out_put_cpumask;
@@ -546,7 +618,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L2 cache.
 	 */
-	if (l2_mask) {
+	if (l2_mask && !node_rebalance(node)) {
 		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(l2_mask, node,
 					__COMPAT_SCX_PICK_IDLE_IN_NODE);
 		if (cpu >= 0) {
@@ -559,7 +631,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L3 cache.
 	 */
-	if (l3_mask) {
+	if (l3_mask && !node_rebalance(node)) {
 		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(l3_mask, node,
 					__COMPAT_SCX_PICK_IDLE_IN_NODE);
 		if (cpu >= 0) {
@@ -944,19 +1016,15 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 }
 
 /*
- * Scale target CPU frequency based on the performance level selected
- * from user-space and the CPU utilization.
+ * Update CPU load and scale target performance level accordingly.
  */
-static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
+static void update_cpu_load(struct task_struct *p, struct task_ctx *tctx)
 {
 	u64 now = scx_bpf_now();
 	s32 cpu = scx_bpf_task_cpu(p);
 	u64 perf_lvl, delta_runtime, delta_t;
 	struct cpu_ctx *cctx;
 
-	if (cpufreq_perf_lvl >= 0)
-		return;
-
 	/*
 	 * For non-interactive tasks determine their cpufreq scaling factor as
 	 * a function of their CPU utilization.
@@ -970,17 +1038,26 @@ static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
 	 * utilization, normalized in the range [0 .. SCX_CPUPERF_ONE].
 	 */
 	delta_t = now - cctx->last_running;
-	delta_runtime = cctx->tot_runtime - cctx->prev_runtime;
-	perf_lvl = delta_runtime * SCX_CPUPERF_ONE / delta_t;
+	if (!delta_t)
+		return;
 
-	perf_lvl = MIN(perf_lvl, SCX_CPUPERF_ONE);
+	/*
+	 * Refresh target performance level, if utilization is above 75%
+	 * bump up the performance level to the max.
+	 */
+	delta_runtime = cctx->tot_runtime - cctx->prev_runtime;
+	perf_lvl = MIN(delta_runtime * SCX_CPUPERF_ONE / delta_t, SCX_CPUPERF_ONE);
+	if (perf_lvl >= SCX_CPUPERF_ONE - SCX_CPUPERF_ONE / 4)
+		perf_lvl = SCX_CPUPERF_ONE;
+	cctx->perf_lvl = perf_lvl;
 
 	/*
-	 * Apply the dynamic cpuperf scaling factor.
+	 * Refresh the dynamic cpuperf scaling factor if needed.
 	 */
-	scx_bpf_cpuperf_set(cpu, perf_lvl);
+	if (cpufreq_perf_lvl < 0)
+		scx_bpf_cpuperf_set(cpu, cctx->perf_lvl);
 
-	cctx->last_running = scx_bpf_now();
+	cctx->last_running = now;
 	cctx->prev_runtime = cctx->tot_runtime;
 }
 
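The reworked update_cpu_load() above derives a CPU's utilization as the ratio of busy time to elapsed wall-clock time since the last sample, scaled to SCX_CPUPERF_ONE, and rounds anything at or above 75% up to the maximum performance level. A small user-space sketch of that arithmetic, assuming SCX_CPUPERF_ONE == 1024 (the sched_ext scale); the sample figures are illustrative only:

/* Illustrative sketch of the perf_lvl computation in update_cpu_load();
 * assumes SCX_CPUPERF_ONE == 1024, sample values are made up. */
#include <stdint.h>
#include <stdio.h>

#define SCX_CPUPERF_ONE 1024ULL
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static uint64_t perf_lvl(uint64_t delta_runtime, uint64_t delta_t)
{
	uint64_t lvl;

	/* No elapsed time since the last sample: nothing to compute. */
	if (!delta_t)
		return 0;

	/* Busy time over elapsed time, scaled to [0 .. SCX_CPUPERF_ONE]. */
	lvl = MIN(delta_runtime * SCX_CPUPERF_ONE / delta_t, SCX_CPUPERF_ONE);

	/* Utilization at or above 75% is bumped to the maximum level. */
	if (lvl >= SCX_CPUPERF_ONE - SCX_CPUPERF_ONE / 4)
		lvl = SCX_CPUPERF_ONE;
	return lvl;
}

int main(void)
{
	/* 800ms busy in a 1s window: ~80% -> bumped to 1024 */
	printf("%llu\n", (unsigned long long)perf_lvl(800000000ULL, 1000000000ULL));
	/* 300ms busy in a 1s window: ~30% -> 307 */
	printf("%llu\n", (unsigned long long)perf_lvl(300000000ULL, 1000000000ULL));
	return 0;
}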

@@ -998,7 +1075,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
 	/*
 	 * Adjust target CPU frequency before the task starts to run.
 	 */
-	update_cpuperf_target(p, tctx);
+	update_cpu_load(p, tctx);
 
 	/*
 	 * Update the global vruntime as a new task is starting to use a
@@ -1014,17 +1091,11 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
  */
 void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 {
-	u64 now = scx_bpf_now(), slice;
+	u64 now = scx_bpf_now(), slice, delta_runtime;
 	s32 cpu = scx_bpf_task_cpu(p);
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
 
-	if (cpufreq_perf_lvl < 0) {
-		cctx = try_lookup_cpu_ctx(cpu);
-		if (cctx)
-			cctx->tot_runtime += now - cctx->last_running;
-	}
-
 	__sync_fetch_and_sub(&nr_running, 1);
 
 	tctx = try_lookup_task_ctx(p);
@@ -1049,6 +1120,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	 * Update task's vruntime.
 	 */
 	tctx->deadline += scale_by_task_weight_inverse(p, slice);
+
+	/*
+	 * Update CPU runtime.
+	 */
+	cctx = try_lookup_cpu_ctx(cpu);
+	if (!cctx)
+		return;
+	delta_runtime = now - cctx->last_running;
+	cctx->tot_runtime += delta_runtime;
 }
 
 void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags)
@@ -1256,31 +1336,130 @@ int enable_primary_cpu(struct cpu_arg *input)
 static void init_cpuperf_target(void)
 {
 	const struct cpumask *online_cpumask;
+	struct node_ctx *nctx;
 	u64 perf_lvl;
+	int node;
 	s32 cpu;
 
-	if (cpufreq_perf_lvl < 0)
-		return;
-
 	online_cpumask = scx_bpf_get_online_cpumask();
 	bpf_for (cpu, 0, nr_cpu_ids) {
 		if (!bpf_cpumask_test_cpu(cpu, online_cpumask))
 			continue;
-		perf_lvl = MIN(cpufreq_perf_lvl, SCX_CPUPERF_ONE);
+
+		/* Set the initial cpufreq performance level */
+		if (cpufreq_perf_lvl < 0)
+			perf_lvl = SCX_CPUPERF_ONE;
+		else
+			perf_lvl = MIN(cpufreq_perf_lvl, SCX_CPUPERF_ONE);
 		scx_bpf_cpuperf_set(cpu, perf_lvl);
+
+		/* Evaluate the amount of online CPUs for each node */
+		node = __COMPAT_scx_bpf_cpu_node(cpu);
+		nctx = try_lookup_node_ctx(node);
+		if (nctx)
+			nctx->nr_cpus++;
 	}
 	scx_bpf_put_cpumask(online_cpumask);
 }
 
+/*
+ * Refresh NUMA statistics.
+ */
+static int numa_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	const struct cpumask *online_cpumask;
+	struct node_ctx *nctx;
+	int node, err;
+	bool has_idle_nodes = false;
+	s32 cpu;
+
+	/*
+	 * Update node statistics.
+	 */
+	online_cpumask = scx_bpf_get_online_cpumask();
+	bpf_for (cpu, 0, nr_cpu_ids) {
+		struct cpu_ctx *cctx;
+
+		if (!bpf_cpumask_test_cpu(cpu, online_cpumask))
+			continue;
+
+		cctx = try_lookup_cpu_ctx(cpu);
+		if (!cctx)
+			continue;
+
+		node = __COMPAT_scx_bpf_cpu_node(cpu);
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx)
+			continue;
+
+		nctx->tot_perf_lvl += cctx->perf_lvl;
+	}
+	scx_bpf_put_cpumask(online_cpumask);
+
+	/*
+	 * Update node utilization.
+	 */
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx || !nctx->nr_cpus)
+			continue;
+
+		/*
+		 * Evaluate node utilization as the average perf_lvl among
+		 * its CPUs.
+		 */
+		nctx->perf_lvl = nctx->tot_perf_lvl / nctx->nr_cpus;
+
+		/*
+		 * System has at least one idle node if its current
+		 * utilization is 25% or below.
+		 */
+		if (nctx->perf_lvl <= SCX_CPUPERF_ONE / 4)
+			has_idle_nodes = true;
+
+		/*
+		 * Reset partial performance level.
+		 */
+		nctx->tot_perf_lvl = 0;
+	}
+
+	/*
+	 * Determine nodes that need a rebalance.
+	 */
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx)
+			continue;
+
+		/*
+		 * If the current node utilization is 50% or more and there
+		 * is at least an idle node in the system, trigger a
+		 * rebalance.
+		 */
+		nctx->need_rebalance = has_idle_nodes && nctx->perf_lvl >= SCX_CPUPERF_ONE / 2;
+
+		dbg_msg("node %d util %llu rebalance %d",
+			node, nctx->perf_lvl, nctx->need_rebalance);
+	}
+
+	err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
+	if (err)
+		scx_bpf_error("Failed to start NUMA timer");
+
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 {
+	struct bpf_timer *timer;
 	int err, node;
+	u32 key = 0;
 
 	/* Initialize amount of online and possible CPUs */
 	nr_online_cpus = get_nr_online_cpus();
 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
 
-	/* Initialize cpufreq profile */
+	/* Initialize CPUs and NUMA properties */
 	init_cpuperf_target();
 
 	/*
@@ -1299,6 +1478,22 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 	if (err)
 		return err;
 
+	/* Do not update NUMA statistics if there's only one node */
+	if (numa_disabled || __COMPAT_scx_bpf_nr_node_ids() <= 1)
+		return 0;
+
+	timer = bpf_map_lookup_elem(&numa_timer, &key);
+	if (!timer) {
+		scx_bpf_error("Failed to lookup central timer");
+		return -ESRCH;
+	}
+
+	bpf_timer_init(timer, &numa_timer, CLOCK_BOOTTIME);
+	bpf_timer_set_callback(timer, numa_timerfn);
+	err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
+	if (err)
+		scx_bpf_error("Failed to start NUMA timer");
+
 	return 0;
 }
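Taken together, the numa_timerfn() callback added above averages the per-CPU perf_lvl samples into a per-node utilization once per second, records that an idle node exists when some node sits at 25% or below, and flags any node running at 50% or above for rebalancing; pick_idle_cpu() then drops the in-node restriction for flagged nodes. A minimal user-space sketch of that decision, assuming SCX_CPUPERF_ONE == 1024 (the sched_ext scale); the per-CPU figures are made up for illustration:

/* Minimal sketch of the decision made by numa_timerfn(): average per-CPU
 * perf_lvl into a per-node utilization, then flag a node for rebalance when
 * it runs at 50% or more while at least one other node sits at 25% or below.
 * Not the scheduler's actual BPF code; data and names are illustrative. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCX_CPUPERF_ONE	1024ULL
#define NR_NODES	2
#define CPUS_PER_NODE	4

int main(void)
{
	/* Sampled per-CPU performance levels, grouped by node. */
	uint64_t cpu_perf[NR_NODES][CPUS_PER_NODE] = {
		{ 1024, 1024, 819, 614 },	/* node 0: busy */
		{ 102, 205, 0, 51 },		/* node 1: mostly idle */
	};
	uint64_t node_util[NR_NODES] = { 0 };
	bool has_idle_nodes = false;
	int node, cpu;

	/* Node utilization is the average perf_lvl of its CPUs. */
	for (node = 0; node < NR_NODES; node++) {
		for (cpu = 0; cpu < CPUS_PER_NODE; cpu++)
			node_util[node] += cpu_perf[node][cpu];
		node_util[node] /= CPUS_PER_NODE;

		/* A node counts as idle at 25% utilization or below. */
		if (node_util[node] <= SCX_CPUPERF_ONE / 4)
			has_idle_nodes = true;
	}

	/* A node needs a rebalance only if it is at 50% or more and some
	 * idle node exists to absorb part of its load. */
	for (node = 0; node < NR_NODES; node++) {
		bool rebalance = has_idle_nodes &&
				 node_util[node] >= SCX_CPUPERF_ONE / 2;

		printf("node %d util %llu rebalance %d\n",
		       node, (unsigned long long)node_util[node], rebalance);
	}
	return 0;
}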
