@@ -106,18 +106,84 @@ private(BPFLAND) struct bpf_cpumask __kptr *primary_cpumask;
  */
 const volatile bool smt_enabled = true;
 
+/*
+ * Disable NUMA rebalancing.
+ */
+const volatile bool numa_disabled = false;
+
 /*
  * Current global vruntime.
  */
 static u64 vtime_now;
 
+/*
+ * Timer used to update NUMA statistics.
+ */
+struct numa_timer {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct numa_timer);
+} numa_timer SEC(".maps");
+
+/*
+ * Per-node context.
+ */
+struct node_ctx {
+	u64 tot_perf_lvl;
+	u64 nr_cpus;
+	u64 perf_lvl;
+	bool need_rebalance;
+};
+
+/* CONFIG_NODES_SHIFT should always be <= 10 */
+#define MAX_NUMA_NODES 1024
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__type(key, int);
+	__type(value, struct node_ctx);
+	__uint(max_entries, MAX_NUMA_NODES);
+	__uint(map_flags, 0);
+} node_ctx_stor SEC(".maps");
+
+/*
+ * Return a node context.
+ */
+struct node_ctx *try_lookup_node_ctx(int node)
+{
+	return bpf_map_lookup_elem(&node_ctx_stor, &node);
+}
+
+/*
+ * Return true if @node needs a rebalance, false otherwise.
+ */
+static bool node_rebalance(int node)
+{
+	const struct node_ctx *nctx;
+
+	if (numa_disabled)
+		return false;
+
+	nctx = try_lookup_node_ctx(node);
+	if (!nctx)
+		return false;
+
+	return nctx->need_rebalance;
+}
+
 /*
  * Per-CPU context.
  */
 struct cpu_ctx {
 	u64 tot_runtime;
 	u64 prev_runtime;
 	u64 last_running;
+	u64 perf_lvl;
 	struct bpf_cpumask __kptr *smt_cpumask;
 	struct bpf_cpumask __kptr *l2_cpumask;
 	struct bpf_cpumask __kptr *l3_cpumask;
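
Since numa_disabled is const volatile read-only data, it can only be flipped between skeleton open and load. A minimal C loader sketch, assuming a bpftool-generated skeleton whose header and type are named bpfland.bpf.skel.h / struct bpfland_bpf (illustrative names only; the actual scx_bpfland loader is not part of this patch):

#include "bpfland.bpf.skel.h"	/* assumed skeleton header name */

static int load_scheduler(bool disable_numa)
{
	/* Assumed skeleton type generated by `bpftool gen skeleton`. */
	struct bpfland_bpf *skel = bpfland_bpf__open();

	if (!skel)
		return -1;

	/* rodata must be set before load; after load it is read-only. */
	skel->rodata->numa_disabled = disable_numa;

	if (bpfland_bpf__load(skel)) {
		bpfland_bpf__destroy(skel);
		return -1;
	}
	/* ...attach the struct_ops scheduler here... */
	return 0;
}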
@@ -519,12 +585,18 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	}
 
 	/*
-	 * Search for any full-idle CPU in the same node and
-	 * primary domain.
+	 * Search for any full-idle CPU in the primary domain.
+	 *
+	 * If the current node needs a rebalance, also consider
+	 * full-idle CPUs on other nodes.
 	 */
 	if (p_mask) {
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node,
-					SCX_PICK_IDLE_CORE | __COMPAT_SCX_PICK_IDLE_IN_NODE);
+		u64 flags = SCX_PICK_IDLE_CORE;
+
+		if (!node_rebalance(node))
+			flags |= __COMPAT_SCX_PICK_IDLE_IN_NODE;
+
+		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node, flags);
 		if (cpu >= 0) {
 			*is_idle = true;
 			goto out_put_cpumask;
@@ -546,7 +618,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L2 cache.
 	 */
-	if (l2_mask) {
+	if (l2_mask && !node_rebalance(node)) {
 		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(l2_mask, node,
 					__COMPAT_SCX_PICK_IDLE_IN_NODE);
 		if (cpu >= 0) {
@@ -559,7 +631,7 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
 	 * Search for any idle CPU in the primary domain that shares the same
 	 * L3 cache.
 	 */
-	if (l3_mask) {
+	if (l3_mask && !node_rebalance(node)) {
 		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(l3_mask, node,
 					__COMPAT_SCX_PICK_IDLE_IN_NODE);
 		if (cpu >= 0) {
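
For illustration only (not part of the patch): the flag logic of the p_mask branch above could be captured by a hypothetical helper that builds the idle-selection flags, dropping __COMPAT_SCX_PICK_IDLE_IN_NODE whenever the node is flagged for rebalancing so the search may spill onto other nodes:

/*
 * Hypothetical helper: compute idle-CPU selection flags for @node.
 * When the node does not need a rebalance, confine the search to the
 * node itself; otherwise allow picking idle CPUs on other nodes too.
 */
static u64 idle_pick_flags(int node, u64 base_flags)
{
	u64 flags = base_flags;

	if (!node_rebalance(node))
		flags |= __COMPAT_SCX_PICK_IDLE_IN_NODE;

	return flags;
}

The L2/L3 branches, by contrast, are simply skipped while a rebalance is pending, since they only make sense within a cache domain on the local node.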
@@ -944,19 +1016,15 @@ void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 }
 
 /*
- * Scale target CPU frequency based on the performance level selected
- * from user-space and the CPU utilization.
+ * Update CPU load and scale target performance level accordingly.
  */
-static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
+static void update_cpu_load(struct task_struct *p, struct task_ctx *tctx)
 {
 	u64 now = scx_bpf_now();
 	s32 cpu = scx_bpf_task_cpu(p);
 	u64 perf_lvl, delta_runtime, delta_t;
 	struct cpu_ctx *cctx;
 
-	if (cpufreq_perf_lvl >= 0)
-		return;
-
 	/*
 	 * For non-interactive tasks determine their cpufreq scaling factor as
 	 * a function of their CPU utilization.
@@ -970,17 +1038,26 @@ static void update_cpuperf_target(struct task_struct *p, struct task_ctx *tctx)
 	 * utilization, normalized in the range [0 .. SCX_CPUPERF_ONE].
 	 */
 	delta_t = now - cctx->last_running;
-	delta_runtime = cctx->tot_runtime - cctx->prev_runtime;
-	perf_lvl = delta_runtime * SCX_CPUPERF_ONE / delta_t;
+	if (!delta_t)
+		return;
 
-	perf_lvl = MIN(perf_lvl, SCX_CPUPERF_ONE);
+	/*
+	 * Refresh the target performance level; if utilization is above
+	 * 75%, bump the performance level up to the max.
+	 */
+	delta_runtime = cctx->tot_runtime - cctx->prev_runtime;
+	perf_lvl = MIN(delta_runtime * SCX_CPUPERF_ONE / delta_t, SCX_CPUPERF_ONE);
+	if (perf_lvl >= SCX_CPUPERF_ONE - SCX_CPUPERF_ONE / 4)
+		perf_lvl = SCX_CPUPERF_ONE;
+	cctx->perf_lvl = perf_lvl;
 
 	/*
-	 * Apply the dynamic cpuperf scaling factor.
+	 * Refresh the dynamic cpuperf scaling factor if needed.
 	 */
-	scx_bpf_cpuperf_set(cpu, perf_lvl);
+	if (cpufreq_perf_lvl < 0)
+		scx_bpf_cpuperf_set(cpu, cctx->perf_lvl);
 
-	cctx->last_running = scx_bpf_now();
+	cctx->last_running = now;
 	cctx->prev_runtime = cctx->tot_runtime;
 }
 
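
A worked example, assuming SCX_CPUPERF_ONE is 1024 (its conventional full-scale value): a CPU that accumulated 8 ms of task runtime over a 10 ms window gets perf_lvl = 8 * 1024 / 10 = 819; since 819 >= 1024 - 256 = 768 (the 75% threshold), it is bumped to 1024, so full performance is requested the next time a task starts on that CPU, provided no fixed performance level was requested from user space (cpufreq_perf_lvl < 0).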
@@ -998,7 +1075,7 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
 	/*
 	 * Adjust target CPU frequency before the task starts to run.
 	 */
-	update_cpuperf_target(p, tctx);
+	update_cpu_load(p, tctx);
 
 	/*
 	 * Update the global vruntime as a new task is starting to use a
@@ -1014,17 +1091,11 @@ void BPF_STRUCT_OPS(bpfland_running, struct task_struct *p)
  */
 void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 {
-	u64 now = scx_bpf_now(), slice;
+	u64 now = scx_bpf_now(), slice, delta_runtime;
 	s32 cpu = scx_bpf_task_cpu(p);
 	struct cpu_ctx *cctx;
 	struct task_ctx *tctx;
 
-	if (cpufreq_perf_lvl < 0) {
-		cctx = try_lookup_cpu_ctx(cpu);
-		if (cctx)
-			cctx->tot_runtime += now - cctx->last_running;
-	}
-
 	__sync_fetch_and_sub(&nr_running, 1);
 
 	tctx = try_lookup_task_ctx(p);
@@ -1049,6 +1120,15 @@ void BPF_STRUCT_OPS(bpfland_stopping, struct task_struct *p, bool runnable)
 	 * Update task's vruntime.
 	 */
 	tctx->deadline += scale_by_task_weight_inverse(p, slice);
+
+	/*
+	 * Update CPU runtime.
+	 */
+	cctx = try_lookup_cpu_ctx(cpu);
+	if (!cctx)
+		return;
+	delta_runtime = now - cctx->last_running;
+	cctx->tot_runtime += delta_runtime;
 }
 
 void BPF_STRUCT_OPS(bpfland_runnable, struct task_struct *p, u64 enq_flags)
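
Note that the per-CPU runtime is now accounted unconditionally rather than only when cpufreq_perf_lvl is negative: update_cpu_load() always derives cctx->perf_lvl from tot_runtime, and that value feeds the NUMA statistics below even when a fixed cpufreq performance level is in use.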
@@ -1256,31 +1336,130 @@ int enable_primary_cpu(struct cpu_arg *input)
 static void init_cpuperf_target(void)
 {
 	const struct cpumask *online_cpumask;
+	struct node_ctx *nctx;
 	u64 perf_lvl;
+	int node;
 	s32 cpu;
 
-	if (cpufreq_perf_lvl < 0)
-		return;
-
 	online_cpumask = scx_bpf_get_online_cpumask();
 	bpf_for(cpu, 0, nr_cpu_ids) {
 		if (!bpf_cpumask_test_cpu(cpu, online_cpumask))
 			continue;
-		perf_lvl = MIN(cpufreq_perf_lvl, SCX_CPUPERF_ONE);
+
+		/* Set the initial cpufreq performance level */
+		if (cpufreq_perf_lvl < 0)
+			perf_lvl = SCX_CPUPERF_ONE;
+		else
+			perf_lvl = MIN(cpufreq_perf_lvl, SCX_CPUPERF_ONE);
 		scx_bpf_cpuperf_set(cpu, perf_lvl);
+
+		/* Evaluate the number of online CPUs on each node */
+		node = __COMPAT_scx_bpf_cpu_node(cpu);
+		nctx = try_lookup_node_ctx(node);
+		if (nctx)
+			nctx->nr_cpus++;
 	}
 	scx_bpf_put_cpumask(online_cpumask);
 }
 
+/*
+ * Refresh NUMA statistics.
+ */
+static int numa_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	const struct cpumask *online_cpumask;
+	struct node_ctx *nctx;
+	int node, err;
+	bool has_idle_nodes = false;
+	s32 cpu;
+
+	/*
+	 * Update node statistics.
+	 */
+	online_cpumask = scx_bpf_get_online_cpumask();
+	bpf_for(cpu, 0, nr_cpu_ids) {
+		struct cpu_ctx *cctx;
+
+		if (!bpf_cpumask_test_cpu(cpu, online_cpumask))
+			continue;
+
+		cctx = try_lookup_cpu_ctx(cpu);
+		if (!cctx)
+			continue;
+
+		node = __COMPAT_scx_bpf_cpu_node(cpu);
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx)
+			continue;
+
+		nctx->tot_perf_lvl += cctx->perf_lvl;
+	}
+	scx_bpf_put_cpumask(online_cpumask);
+
+	/*
+	 * Update node utilization.
+	 */
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx || !nctx->nr_cpus)
+			continue;
+
+		/*
+		 * Evaluate node utilization as the average perf_lvl among
+		 * its CPUs.
+		 */
+		nctx->perf_lvl = nctx->tot_perf_lvl / nctx->nr_cpus;
+
+		/*
+		 * The system has at least one idle node if this node's
+		 * current utilization is 25% or below.
+		 */
+		if (nctx->perf_lvl <= SCX_CPUPERF_ONE / 4)
+			has_idle_nodes = true;
+
+		/*
+		 * Reset the partial performance level.
+		 */
+		nctx->tot_perf_lvl = 0;
+	}
+
+	/*
+	 * Determine nodes that need a rebalance.
+	 */
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		nctx = try_lookup_node_ctx(node);
+		if (!nctx)
+			continue;
+
+		/*
+		 * If the current node utilization is 50% or more and there
+		 * is at least one idle node in the system, trigger a
+		 * rebalance.
+		 */
+		nctx->need_rebalance = has_idle_nodes && nctx->perf_lvl >= SCX_CPUPERF_ONE / 2;
+
+		dbg_msg("node %d util %llu rebalance %d",
+			node, nctx->perf_lvl, nctx->need_rebalance);
+	}
+
+	err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
+	if (err)
+		scx_bpf_error("Failed to start NUMA timer");
+
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 {
+	struct bpf_timer *timer;
 	int err, node;
+	u32 key = 0;
 
 	/* Initialize amount of online and possible CPUs */
 	nr_online_cpus = get_nr_online_cpus();
 	nr_cpu_ids = scx_bpf_nr_cpu_ids();
 
-	/* Initialize cpufreq profile */
+	/* Initialize CPUs and NUMA properties */
 	init_cpuperf_target();
 
 	/*
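
To make the thresholds in numa_timerfn() concrete, assuming SCX_CPUPERF_ONE is 1024 and a two-node system: if node 0 averages a perf_lvl of 819 (~80% utilization) and node 1 averages 205 (~20%), node 1 counts as idle (205 <= 256), so has_idle_nodes becomes true; node 0 is then flagged for rebalance (819 >= 512) while node 1 is not, and until the next refresh tasks waking on node 0 may pick full-idle CPUs on node 1 through pick_idle_cpu().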
@@ -1299,6 +1478,22 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 	if (err)
 		return err;
 
+	/* Do not update NUMA statistics if there's only one node */
+	if (numa_disabled || __COMPAT_scx_bpf_nr_node_ids() <= 1)
+		return 0;
+
+	timer = bpf_map_lookup_elem(&numa_timer, &key);
+	if (!timer) {
+		scx_bpf_error("Failed to lookup NUMA timer");
+		return -ESRCH;
+	}
+
+	bpf_timer_init(timer, &numa_timer, CLOCK_BOOTTIME);
+	bpf_timer_set_callback(timer, numa_timerfn);
+	err = bpf_timer_start(timer, NSEC_PER_SEC, 0);
+	if (err)
+		scx_bpf_error("Failed to start NUMA timer");
+
 	return 0;
 }
 
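
Since the timer is armed only when NUMA rebalancing is enabled and more than one node is present, and array-map values start zeroed, need_rebalance stays false on single-node systems and node_rebalance() keeps the idle-CPU search confined to the local node exactly as before. Once armed, numa_timerfn() re-arms itself with bpf_timer_start(timer, NSEC_PER_SEC, 0), so the per-node statistics refresh roughly once per second.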