Skip to content

Commit 84007a9

Browse files
committed
kvm-ioctls: add dirty log ring support for all architectures
Implement the dirty log ring interface with `enable_dirty_log_ring` and `dirty_log_ring_iter` methods. Enable `VmFd::enable_cap` and the ioctl imports on all architectures. Add memory fences in the iterator for proper synchronization on architectures with weak memory consistency.

Signed-off-by: David Kleymann <[email protected]>
1 parent bd3260e commit 84007a9

File tree

6 files changed

+555
-10
lines changed

6 files changed

+555
-10
lines changed

kvm-bindings/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,7 @@ pub use self::arm64::*;
3939
mod riscv64;
4040
#[cfg(target_arch = "riscv64")]
4141
pub use self::riscv64::*;
42+
43+
// Linux defines these via the _BITUL macro, and bindgen fails to generate them.
/// The guest frame number entry has been dirtied by the guest.
pub const KVM_DIRTY_GFN_F_DIRTY: u32 = 1 << 0;
/// The entry has been harvested by userspace and awaits a ring reset.
pub const KVM_DIRTY_GFN_F_RESET: u32 = 1 << 1;

kvm-ioctls/CHANGELOG.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,26 @@
22

33
## Upcoming Release
44

5+
### Fixed
6+
7+
- Fixed `VmFd::enable_cap` to be available on all architectures
8+
9+
### Added
10+
11+
- Added `KvmDirtyLogRing` structure to mmap the dirty log ring.
12+
- Added `KVM_DIRTY_GFN_F_DIRTY` and `KVM_DIRTY_GFN_F_RESET` bitflags.
13+
- Added `KvmDirtyLogRing` iterator type for accessing dirty log entries.
14+
- Added `dirty_log_ring` field to `VcpuFd` to access per-vCPU dirty rings.
15+
- Inserted Acquire/Release fences in the `KvmDirtyLogRing` iterator's `next` method for architectures with weak memory consistency.
16+
- Added `DirtyLogRingInfo` struct and `dirty_log_ring_info` field to `VmFd` to
17+
track dirty ring configuration.
18+
- Added `enable_dirty_log_ring` function on `VmFd` to check corresponding
19+
capabilities and enable KVM's dirty log ring.
20+
- Added `VcpuFd::dirty_log_ring_iter()` to iterate over dirty guest frame numbers.
21+
- Added `VmFd::reset_dirty_rings()` to reset all dirty rings for the VM.
22+
23+
- Plumbed `KVM_CAP_DIRTY_LOG_RING` through as the `DirtyLogRing` capability.
24+
525
## v0.24.0
626

727
### Added

kvm-ioctls/src/ioctls/mod.rs

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
use std::mem::size_of;
99
use std::os::unix::io::AsRawFd;
1010
use std::ptr::{NonNull, null_mut};
11+
use std::sync::atomic::{Ordering, fence};
1112

1213
use kvm_bindings::{
13-
KVM_COALESCED_MMIO_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_run,
14+
KVM_COALESCED_MMIO_PAGE_OFFSET, KVM_DIRTY_GFN_F_DIRTY, KVM_DIRTY_GFN_F_RESET,
15+
KVM_DIRTY_LOG_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_dirty_gfn, kvm_run,
1416
};
1517
use vmm_sys_util::errno;
1618

@@ -29,6 +31,121 @@ pub mod vm;
2931
/// is otherwise a direct mapping to Result.
3032
pub type Result<T> = std::result::Result<T, errno::Error>;
3133

34+
/// A wrapper around the KVM dirty log ring page.
35+
#[derive(Debug)]
36+
pub(crate) struct KvmDirtyLogRing {
37+
/// Next potentially dirty guest frame number slot index
38+
next_dirty: u64,
39+
/// Memory-mapped array of dirty guest frame number entries
40+
gfns: NonNull<kvm_dirty_gfn>,
41+
/// Ring size mask (size-1) for efficient modulo operations
42+
mask: u64,
43+
/// `true` if we need to use Acquire/Release memory ordering
44+
use_acq_rel: bool,
45+
}
46+
47+
impl KvmDirtyLogRing {
48+
/// Maps the KVM dirty log ring from the vCPU file descriptor.
49+
///
50+
/// # Arguments
51+
/// * `fd` - vCPU file descriptor to mmap from.
52+
/// * `size` - Size of memory region in bytes.
53+
pub(crate) fn mmap_from_fd<F: AsRawFd>(
54+
fd: &F,
55+
bytes: usize,
56+
use_acq_rel: bool,
57+
) -> Result<Self> {
58+
// SAFETY: We trust the sysconf libc function and we're calling it
59+
// with a correct parameter.
60+
let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } {
61+
-1 => return Err(errno::Error::last()),
62+
ps => ps as usize,
63+
};
64+
65+
let offset = page_size * KVM_DIRTY_LOG_PAGE_OFFSET as usize;
66+
67+
if bytes % std::mem::size_of::<kvm_dirty_gfn>() != 0 {
68+
// Size of dirty ring in bytes must be multiples of slot size
69+
return Err(errno::Error::new(libc::EINVAL));
70+
}
71+
let slots = bytes / std::mem::size_of::<kvm_dirty_gfn>();
72+
if !slots.is_power_of_two() {
73+
// Number of slots must be power of two
74+
return Err(errno::Error::new(libc::EINVAL));
75+
}
76+
77+
// SAFETY: KVM guarantees that there is a page at offset
78+
// KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE if the appropriate
79+
// capability is available. If it is not, the call will simply
80+
// fail.
81+
let gfns = unsafe {
82+
NonNull::<kvm_dirty_gfn>::new(libc::mmap(
83+
null_mut(),
84+
bytes,
85+
libc::PROT_READ | libc::PROT_WRITE,
86+
libc::MAP_SHARED,
87+
fd.as_raw_fd(),
88+
offset as i64,
89+
) as *mut kvm_dirty_gfn)
90+
.filter(|addr| addr.as_ptr() != libc::MAP_FAILED as *mut kvm_dirty_gfn)
91+
.ok_or_else(errno::Error::last)?
92+
};
93+
Ok(Self {
94+
next_dirty: 0,
95+
gfns,
96+
mask: (slots - 1) as u64,
97+
use_acq_rel,
98+
})
99+
}
100+
}
101+
102+
impl Drop for KvmDirtyLogRing {
103+
fn drop(&mut self) {
104+
// SAFETY: This is safe because we mmap the page ourselves, and nobody
105+
// else is holding a reference to it.
106+
unsafe {
107+
libc::munmap(
108+
self.gfns.as_ptr().cast(),
109+
(self.mask + 1) as usize * std::mem::size_of::<kvm_dirty_gfn>(),
110+
);
111+
}
112+
}
113+
}
114+
115+
impl Iterator for KvmDirtyLogRing {
116+
type Item = (u32, u64);
117+
fn next(&mut self) -> Option<Self::Item> {
118+
let i = self.next_dirty & self.mask;
119+
// SAFETY: i is not larger than mask, thus is a valid offset into self.gfns,
120+
// therefore this operation produces a valid pointer to a kvm_dirty_gfn
121+
let gfn_ptr = unsafe { self.gfns.add(i as usize).as_ptr() };
122+
123+
if self.use_acq_rel {
124+
fence(Ordering::Acquire);
125+
}
126+
127+
// SAFETY: Can read a valid pointer to a kvm_dirty_gfn
128+
let gfn = unsafe { gfn_ptr.read_volatile() };
129+
130+
if gfn.flags & KVM_DIRTY_GFN_F_DIRTY == 0 {
131+
// next_dirty stays the same, it will become the next dirty element
132+
None
133+
} else {
134+
self.next_dirty += 1;
135+
let mut updated_gfn = gfn;
136+
updated_gfn.flags ^= KVM_DIRTY_GFN_F_RESET;
137+
// SAFETY: Can write to a valid pointer to a kvm_dirty_gfn
138+
unsafe {
139+
gfn_ptr.write_volatile(updated_gfn);
140+
};
141+
if self.use_acq_rel {
142+
fence(Ordering::Release);
143+
}
144+
Some((gfn.slot, gfn.offset))
145+
}
146+
}
147+
}
148+
32149
/// A wrapper around the coalesced MMIO ring page.
33150
#[derive(Debug)]
34151
pub(crate) struct KvmCoalescedIoRing {

kvm-ioctls/src/ioctls/vcpu.rs

Lines changed: 178 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use libc::EINVAL;
1616
use std::fs::File;
1717
use std::os::unix::io::{AsRawFd, RawFd};
1818

19-
use crate::ioctls::{KvmCoalescedIoRing, KvmRunWrapper, Result};
19+
use crate::ioctls::{KvmCoalescedIoRing, KvmDirtyLogRing, KvmRunWrapper, Result};
2020
use crate::kvm_ioctls::*;
2121
use vmm_sys_util::errno;
2222
use vmm_sys_util::ioctl::{ioctl, ioctl_with_mut_ref, ioctl_with_ref};
@@ -197,6 +197,9 @@ pub struct VcpuFd {
197197
kvm_run_ptr: KvmRunWrapper,
198198
/// A pointer to the coalesced MMIO page
199199
coalesced_mmio_ring: Option<KvmCoalescedIoRing>,
200+
/// A pointer to the dirty log ring
201+
#[allow(unused)]
202+
dirty_log_ring: Option<KvmDirtyLogRing>,
200203
}
201204

202205
/// KVM Sync Registers used to tell KVM which registers to sync
@@ -2104,6 +2107,36 @@ impl VcpuFd {
21042107
}
21052108
}
21062109

2110+
/// Gets the dirty log ring iterator if one is mapped.
2111+
///
2112+
/// Returns an iterator over dirty guest frame numbers as (slot, offset) tuples.
2113+
/// Returns `None` if no dirty log ring has been mapped.
2114+
///
2115+
/// # Returns
2116+
///
2117+
/// An optional iterator over the dirty log ring entries.
2118+
///
2119+
/// # Example
2120+
///
2121+
/// ```no_run
2122+
/// # use kvm_ioctls::Kvm;
2123+
/// # use kvm_ioctls::Cap;
2124+
/// let kvm = Kvm::new().unwrap();
2125+
/// let mut vm = kvm.create_vm().unwrap();
2126+
/// vm.enable_dirty_log_ring(None).unwrap();
2127+
/// let mut vcpu = vm.create_vcpu(0).unwrap();
2128+
/// if kvm.check_extension(Cap::DirtyLogRing) {
2129+
/// if let Some(mut iter) = vcpu.dirty_log_ring_iter() {
2130+
/// for (slot, offset) in iter {
2131+
/// println!("Dirty page in slot {} at offset {}", slot, offset);
2132+
/// }
2133+
/// }
2134+
/// }
2135+
/// ```
2136+
pub fn dirty_log_ring_iter(&mut self) -> Option<impl Iterator<Item = (u32, u64)>> {
2137+
self.dirty_log_ring.as_mut()
2138+
}
2139+
21072140
/// Maps the coalesced MMIO ring page. This allows reading entries from
21082141
/// the ring via [`coalesced_mmio_read()`](VcpuFd::coalesced_mmio_read).
21092142
///
@@ -2159,11 +2192,16 @@ impl VcpuFd {
21592192
/// This should not be exported as a public function because the preferred way is to use
21602193
/// `create_vcpu` from `VmFd`. The function cannot be part of the `VcpuFd` implementation because
21612194
/// then it would be exported with the public `VcpuFd` interface.
2162-
pub fn new_vcpu(vcpu: File, kvm_run_ptr: KvmRunWrapper) -> VcpuFd {
2195+
pub fn new_vcpu(
2196+
vcpu: File,
2197+
kvm_run_ptr: KvmRunWrapper,
2198+
dirty_log_ring: Option<KvmDirtyLogRing>,
2199+
) -> VcpuFd {
21632200
VcpuFd {
21642201
vcpu,
21652202
kvm_run_ptr,
21662203
coalesced_mmio_ring: None,
2204+
dirty_log_ring,
21672205
}
21682206
}
21692207

@@ -2835,6 +2873,144 @@ mod tests {
28352873
}
28362874
}
28372875

2876+
#[cfg(target_arch = "x86_64")]
#[test]
fn test_run_code_dirty_log_ring() {
    use std::io::Write;

    let kvm = Kvm::new().unwrap();
    let mut vm = kvm.create_vm().unwrap();

    // Enable the dirty log ring; `need_bitmap` tells us whether the
    // kernel still requires the legacy bitmap for final collection.
    let need_bitmap = vm.enable_dirty_log_ring(None).unwrap();

    // This example is based on https://lwn.net/Articles/658511/
    #[rustfmt::skip]
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, %dx */
        0xec, /* in %dx, %al */
        0xc6, 0x06, 0x00, 0x80, 0x00, /* movl $0, (0x8000); This generates a MMIO Write.*/
        0x8a, 0x16, 0x00, 0x80, /* movl (0x8000), %dl; This generates a MMIO Read.*/
        0xc6, 0x06, 0x00, 0x20, 0x00, /* movl $0, (0x2000); Dirty one page in guest mem. */
        0xf4, /* hlt */
    ];
    let expected_rips: [u64; 3] = [0x1003, 0x1005, 0x1007];

    let mem_size = 0x4000;
    let load_addr = mmap_anonymous(mem_size).as_ptr();
    let guest_addr: u64 = 0x1000;
    let slot: u32 = 0;
    let mem_region = kvm_userspace_memory_region {
        slot,
        guest_phys_addr: guest_addr,
        memory_size: mem_size as u64,
        userspace_addr: load_addr as u64,
        flags: KVM_MEM_LOG_DIRTY_PAGES,
    };
    unsafe {
        vm.set_user_memory_region(mem_region).unwrap();
    }

    unsafe {
        // Get a mutable slice of `mem_size` from `load_addr`.
        // This is safe because we mapped it before.
        let mut slice = std::slice::from_raw_parts_mut(load_addr, mem_size);
        slice.write_all(&code).unwrap();
    }

    let mut vcpu_fd = vm.create_vcpu(0).unwrap();

    let mut vcpu_sregs = vcpu_fd.get_sregs().unwrap();
    assert_ne!(vcpu_sregs.cs.base, 0);
    assert_ne!(vcpu_sregs.cs.selector, 0);
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu_fd.set_sregs(&vcpu_sregs).unwrap();

    let mut vcpu_regs = vcpu_fd.get_regs().unwrap();
    // Set the Instruction Pointer to the guest address where we loaded the code.
    vcpu_regs.rip = guest_addr;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu_fd.set_regs(&vcpu_regs).unwrap();

    let mut debug_struct = kvm_guest_debug {
        control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP,
        pad: 0,
        arch: kvm_guest_debug_arch {
            debugreg: [0, 0, 0, 0, 0, 0, 0, 0],
        },
    };
    vcpu_fd.set_guest_debug(&debug_struct).unwrap();

    let mut instr_idx = 0;
    loop {
        match vcpu_fd.run().expect("run failed") {
            VcpuExit::IoIn(addr, data) => {
                assert_eq!(addr, 0x3f8);
                assert_eq!(data.len(), 1);
            }
            VcpuExit::IoOut(addr, data) => {
                assert_eq!(addr, 0x3f8);
                assert_eq!(data.len(), 1);
                assert_eq!(data[0], b'5');
            }
            VcpuExit::MmioRead(addr, data) => {
                assert_eq!(addr, 0x8000);
                assert_eq!(data.len(), 1);
            }
            VcpuExit::MmioWrite(addr, data) => {
                assert_eq!(addr, 0x8000);
                assert_eq!(data.len(), 1);
                assert_eq!(data[0], 0);
            }
            VcpuExit::Debug(debug) => {
                if instr_idx == expected_rips.len() - 1 {
                    // Disabling debugging/single-stepping
                    debug_struct.control = 0;
                    vcpu_fd.set_guest_debug(&debug_struct).unwrap();
                } else if instr_idx >= expected_rips.len() {
                    unreachable!();
                }
                let vcpu_regs = vcpu_fd.get_regs().unwrap();
                assert_eq!(vcpu_regs.rip, expected_rips[instr_idx]);
                assert_eq!(debug.exception, 1);
                assert_eq!(debug.pc, expected_rips[instr_idx]);
                // Check first 15 bits of DR6
                let mask = (1 << 16) - 1;
                assert_eq!(debug.dr6 & mask, 0b100111111110000);
                // Bit 10 in DR7 is always 1
                assert_eq!(debug.dr7, 1 << 10);
                instr_idx += 1;
            }
            VcpuExit::Hlt => {
                // The code snippet dirties 2 pages:
                // * one when the code itself is loaded in memory;
                // * and one more from the `movl` that writes to address 0x2000
                //   (the write to 0x8000 is MMIO and lands outside the slot).

                let dirty_pages: u32 =
                    u32::try_from(vcpu_fd.dirty_log_ring_iter().unwrap().count()).unwrap()
                        + if need_bitmap {
                            let dirty_pages_bitmap = vm.get_dirty_log(slot, mem_size).unwrap();
                            dirty_pages_bitmap
                                .into_iter()
                                .map(|page| page.count_ones())
                                .sum()
                        } else {
                            0
                        };
                assert_eq!(dirty_pages, 2);
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}
3013+
28383014
#[test]
28393015
#[cfg(target_arch = "aarch64")]
28403016
fn test_get_preferred_target() {

0 commit comments

Comments
 (0)