Skip to content

Commit 942526e

Browse files
feat: RDNA4 support (#487)
* feat: initial rdna4 support * fix: disable writing clocks settings for now * feat: work around misreported clock offsets, only set max offset * feat: support new (kernel 6.15+) sclk offset format * fix: fan curve zero page zero rpm switch * feat: show zero rpm stop temperature on fan curve page * feat: disable zero rpm on static control * chore: update test snapshot * feat: explicitly fail when setting a pmfw fan curve outside of allowed ranges * feat: fall back to gpu_metrics for fan info when hwmon is not available * chore: add gpu_metrics test data * chore: fmt * chore: update default fan curve * chore: update 9070xt-new test data * fix: make missing zero rpm option a non-fatal error * chore: switch amdgpu-sysfs crate version
1 parent c61957a commit 942526e

File tree

154 files changed

+1629
-90
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

154 files changed

+1629
-90
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ members = [
1010
]
1111

1212
[workspace.dependencies]
13-
amdgpu-sysfs = { version = "0.17.3", features = ["serde"] }
13+
amdgpu-sysfs = { version = "0.18.0", features = ["serde"] }
1414
serde = { version = "1.0", features = ["derive"] }
1515
serde_with = { version = "3.5.0", default-features = false, features = [
1616
"macros",

docs/CONFIG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ gpus:
166166
# Voltage offset value in mV for RDNA and newer AMD GPUs.
167167
voltage_offset: 0
168168

169-
# GPU and VRAM clockspeed offset values, per-pstate. Only applicable on Nvidia.
169+
# GPU and VRAM clockspeed offset values, per-pstate. Applicable on Nvidia and on AMD RDNA4.
170170
gpu_clock_offsets:
171171
0: -100
172172
mem_clock_offsets:

lact-daemon/src/server/gpu_controller.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ pub trait GpuController {
4040

4141
fn get_stats(&self, gpu_config: Option<&config::Gpu>) -> DeviceStats;
4242

43-
fn get_clocks_info(&self) -> anyhow::Result<ClocksInfo>;
43+
fn get_clocks_info(&self, gpu_config: Option<&config::Gpu>) -> anyhow::Result<ClocksInfo>;
4444

4545
fn get_power_states(&self, gpu_config: Option<&config::Gpu>) -> PowerStates;
4646

lact-daemon/src/server/gpu_controller/amd.rs

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,15 @@ use amdgpu_sysfs::{
1414
CommitHandle, GpuHandle, PerformanceLevel, PowerLevelKind, PowerLevels,
1515
},
1616
hw_mon::{FanControlMethod, HwMon},
17+
sysfs::SysFS,
1718
};
1819
use anyhow::{anyhow, Context};
1920
use futures::{future::LocalBoxFuture, FutureExt};
2021
use lact_schema::{
2122
ClocksInfo, ClockspeedStats, DeviceInfo, DeviceStats, DrmInfo, FanStats, IntelDrmInfo,
2223
LinkInfo, PmfwInfo, PowerState, PowerStates, PowerStats, VoltageStats, VramStats,
2324
};
24-
use libdrm_amdgpu_sys::AMDGPU::{ThrottleStatus, ThrottlerBit};
25+
use libdrm_amdgpu_sys::AMDGPU::{GpuMetrics, ThrottleStatus, ThrottlerBit};
2526
use libdrm_amdgpu_sys::{LibDrmAmdgpu, AMDGPU::SENSOR_INFO::SENSOR_TYPE};
2627
use std::{
2728
cell::RefCell,
@@ -102,6 +103,12 @@ impl AmdGpuController {
102103

103104
// Use PMFW curve functionality for static speed when it is available
104105
if let Ok(current_curve) = self.handle.get_fan_curve() {
106+
if let Ok(true) = self.handle.get_fan_zero_rpm_enable() {
107+
if let Err(err) = self.handle.set_fan_zero_rpm_enable(false) {
108+
error!("could not disable zero RPM mode for static fan control: {err}");
109+
}
110+
}
111+
105112
let allowed_ranges = current_curve.allowed_ranges.clone().ok_or_else(|| {
106113
anyhow!("The GPU does not allow setting custom fan values (is overdrive enabled?)")
107114
})?;
@@ -606,6 +613,9 @@ impl GpuController for AmdGpuController {
606613
}
607614

608615
fn get_stats(&self, gpu_config: Option<&config::Gpu>) -> DeviceStats {
616+
let metrics = GpuMetrics::get_from_sysfs_path(self.handle.get_path()).ok();
617+
let metrics = metrics.as_ref();
618+
609619
let fan_settings = gpu_config.and_then(|config| config.fan_control_settings.as_ref());
610620
DeviceStats {
611621
fan: FanStats {
@@ -615,10 +625,18 @@ impl GpuController for AmdGpuController {
615625
curve: fan_settings.map(|settings| settings.curve.0.clone()),
616626
spindown_delay_ms: fan_settings.and_then(|settings| settings.spindown_delay_ms),
617627
change_threshold: fan_settings.and_then(|settings| settings.change_threshold),
618-
speed_current: self.hw_mon_and_then(HwMon::get_fan_current),
628+
speed_current: self.hw_mon_and_then(HwMon::get_fan_current).or_else(|| {
629+
metrics
630+
.and_then(MetricsInfo::get_current_fan_speed)
631+
.map(u32::from)
632+
}),
619633
speed_max: self.hw_mon_and_then(HwMon::get_fan_max),
620634
speed_min: self.hw_mon_and_then(HwMon::get_fan_min),
621-
pwm_current: self.hw_mon_and_then(HwMon::get_fan_pwm),
635+
pwm_current: self.hw_mon_and_then(HwMon::get_fan_pwm).or_else(|| {
636+
metrics
637+
.and_then(MetricsInfo::get_fan_pwm)
638+
.and_then(|pwm| u8::try_from(pwm).ok())
639+
}),
622640
pmfw_info: PmfwInfo {
623641
acoustic_limit: self.handle.get_fan_acoustic_limit().ok(),
624642
acoustic_target: self.handle.get_fan_acoustic_target().ok(),
@@ -667,11 +685,32 @@ impl GpuController for AmdGpuController {
667685
}
668686
}
669687

670-
fn get_clocks_info(&self) -> anyhow::Result<ClocksInfo> {
671-
let clocks_table = self
688+
fn get_clocks_info(&self, gpu_config: Option<&config::Gpu>) -> anyhow::Result<ClocksInfo> {
689+
let mut clocks_table = self
672690
.handle
673691
.get_clocks_table()
674692
.context("Clocks table not available")?;
693+
694+
if let ClocksTableGen::Vega20(table) = &mut clocks_table {
695+
// Workaround for RDNA4 not reporting current SCLK offset in the original format:
696+
// https://github.com/ilya-zlobintsev/LACT/issues/485#issuecomment-2712502906
697+
if table.rdna4_sclk_offset_workaround {
698+
// The values present in the old clocks table format for the current sclk offset are rubbish,
699+
// we should report the configured value instead
700+
let offset = gpu_config
701+
.and_then(|config| {
702+
config
703+
.clocks_configuration
704+
.gpu_clock_offsets
705+
.get(&0)
706+
.copied()
707+
})
708+
.unwrap_or(0);
709+
710+
table.sclk_offset = Some(offset);
711+
}
712+
}
713+
675714
Ok(clocks_table.into())
676715
}
677716

@@ -836,7 +875,7 @@ impl GpuController for AmdGpuController {
836875
commit_handles.push(handle);
837876
}
838877
Err(err) => {
839-
error!("custom clock settings are present but will be ignored, but could not get clocks table: {err}");
878+
error!("custom clock settings are present but will be ignored, could not get clocks table: {err}");
840879
}
841880
}
842881
}
@@ -969,30 +1008,36 @@ impl GpuController for AmdGpuController {
9691008

9701009
// Unlike the other PMFW options, zero rpm should be functional with a custom curve
9711010
if let Some(zero_rpm) = config.pmfw_options.zero_rpm {
972-
let current_zero_rpm = self
973-
.handle
974-
.get_fan_zero_rpm_enable()
975-
.context("Could not get zero RPM mode")?;
976-
if current_zero_rpm != zero_rpm {
977-
let commit_handle = self
978-
.handle
979-
.set_fan_zero_rpm_enable(zero_rpm)
980-
.context("Could not set zero RPM mode")?;
981-
commit_handles.push(commit_handle);
1011+
match self.handle.get_fan_zero_rpm_enable() {
1012+
Ok(current_zero_rpm) => {
1013+
if current_zero_rpm != zero_rpm {
1014+
let commit_handle = self
1015+
.handle
1016+
.set_fan_zero_rpm_enable(zero_rpm)
1017+
.context("Could not set zero RPM mode")?;
1018+
commit_handles.push(commit_handle);
1019+
}
1020+
}
1021+
Err(err) => {
1022+
error!("zero RPM is present in the config, but not available on the GPU: {err}");
1023+
}
9821024
}
9831025
}
9841026

9851027
if let Some(zero_rpm_threshold) = config.pmfw_options.zero_rpm_threshold {
986-
let current_threshold = self
987-
.handle
988-
.get_fan_zero_rpm_stop_temperature()
989-
.context("Could not get zero RPM temperature")?;
990-
if current_threshold.current != zero_rpm_threshold {
991-
let commit_handle = self
992-
.handle
993-
.set_fan_zero_rpm_stop_temperature(zero_rpm_threshold)
994-
.context("Could not set zero RPM temperature")?;
995-
commit_handles.push(commit_handle);
1028+
match self.handle.get_fan_zero_rpm_stop_temperature() {
1029+
Ok(current_threshold) => {
1030+
if current_threshold.current != zero_rpm_threshold {
1031+
let commit_handle = self
1032+
.handle
1033+
.set_fan_zero_rpm_stop_temperature(zero_rpm_threshold)
1034+
.context("Could not set zero RPM temperature")?;
1035+
commit_handles.push(commit_handle);
1036+
}
1037+
}
1038+
Err(err) => {
1039+
error!("zero RPM threshold is present in the config, but not available on the GPU: {err}");
1040+
}
9961041
}
9971042
}
9981043

@@ -1073,6 +1118,10 @@ impl ClocksConfiguration {
10731118
Some(offset) => table.set_voltage_offset(offset)?,
10741119
None => table.voltage_offset = None,
10751120
}
1121+
1122+
if let Some(offset) = self.gpu_clock_offsets.get(&0) {
1123+
table.sclk_offset = Some(*offset);
1124+
}
10761125
}
10771126

10781127
if let Some(min_clockspeed) = self.min_core_clock {

lact-daemon/src/server/gpu_controller/fan_control.rs

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
use std::cmp;
2-
31
use amdgpu_sysfs::{gpu_handle::fan_control::FanCurve as PmfwCurve, hw_mon::Temperature};
4-
use anyhow::{anyhow, Context};
2+
use anyhow::{anyhow, bail, Context};
53
use lact_schema::{default_fan_curve, FanCurveMap};
64
use serde::{Deserialize, Serialize};
75
use tracing::warn;
@@ -52,18 +50,28 @@ impl FanCurve {
5250
let allowed_ranges = current_pmfw_curve
5351
.allowed_ranges
5452
.context("The GPU does not allow fan curve modifications")?;
55-
let min_pwm = *allowed_ranges.speed_range.start();
56-
let max_pwm = f32::from(*allowed_ranges.speed_range.end());
53+
let min_percent = *allowed_ranges.speed_range.start();
54+
let max_percent = *allowed_ranges.speed_range.end();
55+
let min_temp = *allowed_ranges.temperature_range.start();
56+
let max_temp = *allowed_ranges.temperature_range.end();
5757

5858
let points = self
5959
.0
6060
.into_iter()
6161
.map(|(temp, ratio)| {
62-
let custom_pwm = (max_pwm * ratio) as u8;
63-
let pwm = cmp::max(min_pwm, custom_pwm);
64-
(temp, pwm)
62+
let custom_percent = (ratio * 100.0) as u8;
63+
64+
if !(min_temp..=max_temp).contains(&temp) {
65+
bail!("Temperature {temp}℃ is outside of the allowed range {min_temp}℃ to {max_temp}℃");
66+
}
67+
68+
if !(min_percent..=max_percent).contains(&custom_percent) {
69+
bail!("Speed {custom_percent}% is outside of the allowed range {min_percent}% to {max_percent}%");
70+
}
71+
72+
Ok((temp, custom_percent))
6573
})
66-
.collect();
74+
.collect::<anyhow::Result<_>>()?;
6775

6876
Ok(PmfwCurve {
6977
points,
@@ -93,6 +101,7 @@ impl Default for FanCurve {
93101
mod tests {
94102
use super::{FanCurve, PmfwCurve};
95103
use amdgpu_sysfs::{gpu_handle::fan_control::FanCurveRanges, hw_mon::Temperature};
104+
use anyhow::anyhow;
96105

97106
fn simple_pwm(temp: f32) -> u8 {
98107
let curve = FanCurve([(0, 0.0), (100, 1.0)].into());
@@ -178,7 +187,7 @@ mod tests {
178187
};
179188
curve.pwm_at_temp(temp)
180189
};
181-
assert_eq!(pwm_at_temp(40.0), 51);
190+
assert_eq!(pwm_at_temp(40.0), 76);
182191
assert_eq!(pwm_at_temp(60.0), 127);
183192
assert_eq!(pwm_at_temp(65.0), 159);
184193
assert_eq!(pwm_at_temp(70.0), 191);
@@ -193,12 +202,42 @@ mod tests {
193202
let current_pmfw_curve = PmfwCurve {
194203
points: Box::new([(0, 0); 5]),
195204
allowed_ranges: Some(FanCurveRanges {
196-
temperature_range: 15..=90,
197-
speed_range: 20..=100,
205+
temperature_range: 25..=100,
206+
speed_range: 30..=100,
198207
}),
199208
};
200209
let pmfw_curve = curve.into_pmfw_curve(current_pmfw_curve).unwrap();
201-
let expected_points = [(40, 20), (50, 35), (60, 50), (70, 75), (80, 100)];
210+
let expected_points = [(40, 30), (50, 35), (60, 50), (70, 75), (80, 100)];
202211
assert_eq!(&expected_points, pmfw_curve.points.as_ref());
203212
}
213+
214+
#[test]
215+
fn curve_outside_of_limits_to_pmfw() {
216+
let curve_invalid_temp =
217+
FanCurve([(20, 0.4), (50, 0.35), (60, 0.5), (70, 0.75), (80, 1.0)].into());
218+
let curve_invalid_speed =
219+
FanCurve([(40, 0.1), (50, 0.35), (60, 0.5), (70, 0.75), (80, 1.0)].into());
220+
221+
let current_pmfw_curve = PmfwCurve {
222+
points: Box::new([(0, 0); 5]),
223+
allowed_ranges: Some(FanCurveRanges {
224+
temperature_range: 25..=100,
225+
speed_range: 30..=100,
226+
}),
227+
};
228+
assert_eq!(
229+
anyhow!("Temperature 20℃ is outside of the allowed range 25℃ to 100℃").to_string(),
230+
curve_invalid_temp
231+
.into_pmfw_curve(current_pmfw_curve.clone())
232+
.unwrap_err()
233+
.to_string()
234+
);
235+
assert_eq!(
236+
anyhow!("Speed 10% is outside of the allowed range 30% to 100%").to_string(),
237+
curve_invalid_speed
238+
.into_pmfw_curve(current_pmfw_curve)
239+
.unwrap_err()
240+
.to_string()
241+
);
242+
}
204243
}

lact-daemon/src/server/gpu_controller/intel.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ impl GpuController for IntelGpuController {
655655
}
656656
}
657657

658-
fn get_clocks_info(&self) -> anyhow::Result<ClocksInfo> {
658+
fn get_clocks_info(&self, _gpu_config: Option<&config::Gpu>) -> anyhow::Result<ClocksInfo> {
659659
let clocks_table = IntelClocksTable {
660660
gt_freq: self
661661
.read_freq(FrequencyType::Min)

lact-daemon/src/server/gpu_controller/nvidia.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ impl GpuController for NvidiaGpuController {
483483
}
484484

485485
#[allow(clippy::cast_possible_wrap)]
486-
fn get_clocks_info(&self) -> anyhow::Result<ClocksInfo> {
486+
fn get_clocks_info(&self, _gpu_config: Option<&config::Gpu>) -> anyhow::Result<ClocksInfo> {
487487
let device = self.device();
488488

489489
let mut gpu_offsets = IndexMap::new();

lact-daemon/src/server/handler.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ const SNAPSHOT_DEVICE_FILES: &[&str] = &[
7272
"current_link_speed",
7373
"current_link_width",
7474
"power_dpm_force_performance_level",
75+
"mem_info_vram_vendor",
76+
"gpu_metrics",
7577
];
7678
/// Prefixes for entries that will be recursively included in the debug snapshot
7779
const SNAPSHOT_DEVICE_RECURSIVE_PATHS_PREFIXES: &[&str] = &["tile"];
@@ -393,7 +395,9 @@ impl<'a> Handler {
393395
}
394396

395397
pub async fn get_clocks_info(&'a self, id: &str) -> anyhow::Result<ClocksInfo> {
396-
self.controller_by_id(id).await?.get_clocks_info()
398+
let config = self.config.read().await;
399+
let gpu_config = config.gpus()?.get(id);
400+
self.controller_by_id(id).await?.get_clocks_info(gpu_config)
397401
}
398402

399403
pub async fn set_fan_control(&'a self, opts: FanOptions<'_>) -> anyhow::Result<u64> {
@@ -739,7 +743,7 @@ impl<'a> Handler {
739743
"pci_info": controller.controller_info().pci_info.clone(),
740744
"info": controller.get_info().await,
741745
"stats": controller.get_stats(gpu_config),
742-
"clocks_info": controller.get_clocks_info().ok(),
746+
"clocks_info": controller.get_clocks_info(gpu_config).ok(),
743747
"power_profile_modes": controller.get_power_profile_modes().ok(),
744748
"power_states": controller.get_power_states(gpu_config),
745749
});

lact-daemon/src/tests/data/amd/rx7900xt/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@ fan_control_settings:
55
temperature_key: edge
66
interval_ms: 500
77
curve:
8-
40: 0.14352942
8+
40: 0.15
99
50: 0.17411764
1010
60: 0.22352941
1111
70: 0.29
12-
86: 0.5
12+
86: 0.6
1313
spindown_delay_ms: 0
1414
change_threshold: 0
1515
pmfw_options:

0 commit comments

Comments
 (0)