Skip to content

Commit aef7cd8

Browse files
committed
uffd: add support for UFFD_EVENT_REMOVE events
Enabling Firecracker's free-page-reporting feature requires us to handle remove events (UFFD_EVENT_REMOVE) in our userfaultfd handler. These events are triggered whenever Firecracker calls madvise(MADV_DONTNEED) (or similar) on a range of guest memory addresses. The main change to our logic is that page faults on a page that has previously been removed need to be served with a zero page rather than a page from the snapshot file. This commit changes the page fault serving logic to: 1. Introduce tracking of the state of every page in the guest's memory mappings. 2. Add logic to handle the new UFFD_EVENT_REMOVE event. 3. Modify the existing logic to take the current page state into account when deciding how to handle each page fault. Signed-off-by: Babis Chalios <babis.chalios@e2b.dev>
1 parent 9dca469 commit aef7cd8

5 files changed

Lines changed: 312 additions & 64 deletions

File tree

packages/orchestrator/internal/sandbox/uffd/memory/mapping.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,15 @@ func (e AddressNotFoundError) Error() string {
1515
return fmt.Sprintf("host virtual address %d not found in any mapping", e.hostVirtAddr)
1616
}
1717

18+
// AddressNotPageAlignedError describes a host virtual address that is not a
// multiple of the page size it was checked against.
type AddressNotPageAlignedError struct {
	// hostVirtAddr is the offending address; pageSize is the page size (in
	// the same units) that the address failed to align to.
	hostVirtAddr, pageSize uintptr
}

// Error implements the error interface, rendering both the address and the
// page size in decimal.
func (e AddressNotPageAlignedError) Error() string {
	const format = "host virtual address %d not aligned in regions page size %d"

	return fmt.Sprintf(format, e.hostVirtAddr, e.pageSize)
}
26+
1827
type OffsetNotFoundError struct {
1928
offset int64
2029
}
@@ -104,3 +113,49 @@ func (m *Mapping) GetHostVirtAddr(off int64) (uintptr, uintptr, error) {
104113

105114
return region.shiftedHostVirtAddr(off), region.PageSize, nil
106115
}
116+
117+
// GetPageState returns the state of the page with the given start address.
118+
func (m *Mapping) GetPageState(hostVirtAddr uintptr) (PageState, error) {
119+
for _, r := range m.Regions {
120+
if r.addrInRegion(hostVirtAddr) {
121+
if hostVirtAddr%r.PageSize != 0 {
122+
return "", AddressNotPageAlignedError{hostVirtAddr: hostVirtAddr, pageSize: r.PageSize}
123+
}
124+
125+
return r.pageState(hostVirtAddr), nil
126+
}
127+
}
128+
129+
return "", AddressNotFoundError{hostVirtAddr: hostVirtAddr}
130+
}
131+
132+
// SetPageState sets a new state for a range of pages.
133+
func (m *Mapping) SetPageState(startHostVirtAddr uintptr, endHostVirtAddr uintptr, pageState PageState) error {
134+
for i := range m.Regions {
135+
if m.Regions[i].addrInRegion(startHostVirtAddr) {
136+
// The entirety of the range needs to be in the same region. In other words: we can't have here a region
137+
// [startHostVirtAddr, endHostVirtAddr) that spans across more than one guest memory regions
138+
if !m.Regions[i].addrInRegion(endHostVirtAddr) {
139+
return AddressNotFoundError{hostVirtAddr: endHostVirtAddr}
140+
}
141+
142+
// Both start and end addresses need to be page aligned.
143+
pageSize := m.Regions[i].PageSize
144+
if startHostVirtAddr%pageSize != 0 {
145+
return AddressNotPageAlignedError{hostVirtAddr: startHostVirtAddr, pageSize: pageSize}
146+
}
147+
148+
if endHostVirtAddr%pageSize != 0 {
149+
return AddressNotPageAlignedError{hostVirtAddr: endHostVirtAddr, pageSize: pageSize}
150+
}
151+
152+
for addr := startHostVirtAddr; addr < endHostVirtAddr; addr += pageSize {
153+
m.Regions[i].setPageState(addr, pageState)
154+
}
155+
156+
return nil
157+
}
158+
}
159+
160+
return AddressNotFoundError{hostVirtAddr: startHostVirtAddr}
161+
}

packages/orchestrator/internal/sandbox/uffd/memory/region.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
package memory
22

3+
// PageState describes the states that a guest memory page can be in.
4+
type PageState string
5+
6+
const (
7+
// Page has not been faulted in yet
8+
Unfaulted PageState = "unfaulted"
9+
// Page has been faulted in
10+
Faulted PageState = "faulted"
11+
// Page has been removed
12+
Removed PageState = "removed"
13+
)
14+
315
// Region is a mapping of a region of memory of the guest to a region of memory on the host.
416
// The serialization is based on the Firecracker UFFD protocol communication.
517
// https://github.com/firecracker-microvm/firecracker/blob/ceeca6a14284537ae0b2a192cd2ffef10d3a81e2/src/vmm/src/persist.rs#L96
@@ -9,6 +21,9 @@ type Region struct {
921
Offset uintptr `json:"offset"`
1022
// This field is deprecated in the newer version of the Firecracker with a new field `page_size`.
1123
PageSize uintptr `json:"page_size_kib"` // This is actually in bytes in the deprecated version.
24+
// This field is not used in the serialization. It's metadata we're keeping to track the state of the
25+
// pages within this region
26+
PageState []PageState `json:"-"`
1227
}
1328

1429
// endOffset returns the end offset of the region in bytes.
@@ -32,3 +47,27 @@ func (r *Region) shiftedOffset(addr uintptr) int64 {
3247
func (r *Region) shiftedHostVirtAddr(off int64) uintptr {
3348
return uintptr(off) + r.BaseHostVirtAddr - r.Offset
3449
}
50+
51+
// pageState returns the current PageState of a page in the region.
52+
//
53+
// This assumes that the caller has already checked that `addr` lies
54+
// within this region.
55+
func (r *Region) pageState(addr uintptr) PageState {
56+
page_idx := (addr - r.BaseHostVirtAddr) / r.PageSize
57+
58+
return r.PageState[page_idx]
59+
}
60+
61+
// setPageState sets the current PageState of a page in the region.
62+
//
63+
// This assumes that the caller has already checked that `addr` lies
64+
// within this region.
65+
func (r *Region) setPageState(addr uintptr, state PageState) {
66+
page_idx := (addr - r.BaseHostVirtAddr) / r.PageSize
67+
r.PageState[page_idx] = state
68+
}
69+
70+
// addrInRegion returns true if an address is included in the region
71+
func (r *Region) addrInRegion(addr uintptr) bool {
72+
return addr >= r.BaseHostVirtAddr && addr < r.endHostVirtAddr()
73+
}

packages/orchestrator/internal/sandbox/uffd/uffd.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,15 @@ func (u *Uffd) handle(ctx context.Context, sandboxId string, fdExit *fdexit.FdEx
144144
return fmt.Errorf("failed parsing memory mapping data: %w", err)
145145
}
146146

147+
// Initialize PageState for each region (not included in JSON serialization)
148+
for i := range regions {
149+
numPages := regions[i].Size / regions[i].PageSize
150+
regions[i].PageState = make([]memory.PageState, numPages)
151+
for j := range regions[i].PageState {
152+
regions[i].PageState[j] = memory.Unfaulted
153+
}
154+
}
155+
147156
controlMsgs, err := syscall.ParseSocketControlMessage(uffdBuf[:numBytesFd])
148157
if err != nil {
149158
return fmt.Errorf("failed parsing control messages: %w", err)

packages/orchestrator/internal/sandbox/uffd/userfaultfd/fd.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ struct uffd_pagefault {
1616
__u64 address;
1717
__u32 ptid;
1818
};
19+
20+
struct uffd_remove {
21+
__u64 start;
22+
__u64 end;
23+
};
24+
1925
*/
2026
import "C"
2127

@@ -30,14 +36,18 @@ const (
3036

3137
UFFD_API = C.UFFD_API
3238
UFFD_EVENT_PAGEFAULT = C.UFFD_EVENT_PAGEFAULT
39+
UFFD_EVENT_REMOVE = C.UFFD_EVENT_REMOVE
3340

3441
UFFDIO_REGISTER_MODE_MISSING = C.UFFDIO_REGISTER_MODE_MISSING
3542

3643
UFFDIO_API = C.UFFDIO_API
3744
UFFDIO_REGISTER = C.UFFDIO_REGISTER
3845
UFFDIO_COPY = C.UFFDIO_COPY
46+
UFFDIO_ZEROPAGE = C.UFFDIO_ZEROPAGE
3947

4048
UFFD_PAGEFAULT_FLAG_WRITE = C.UFFD_PAGEFAULT_FLAG_WRITE
49+
UFFD_PAGEFAULT_FLAG_MINOR = C.UFFD_PAGEFAULT_FLAG_MINOR
50+
UFFD_PAGEFAULT_FLAG_WP = C.UFFD_PAGEFAULT_FLAG_WP
4151

4252
UFFD_FEATURE_MISSING_HUGETLBFS = C.UFFD_FEATURE_MISSING_HUGETLBFS
4353
)
@@ -49,11 +59,13 @@ type (
4959

5060
UffdMsg = C.struct_uffd_msg
5161
UffdPagefault = C.struct_uffd_pagefault
62+
UffdRemove = C.struct_uffd_remove
5263

5364
UffdioAPI = C.struct_uffdio_api
5465
UffdioRegister = C.struct_uffdio_register
5566
UffdioRange = C.struct_uffdio_range
5667
UffdioCopy = C.struct_uffdio_copy
68+
UffdioZero = C.struct_uffdio_zeropage
5769
UffdioWriteProtect = C.struct_uffdio_writeprotect
5870
)
5971

@@ -88,6 +100,14 @@ func newUffdioCopy(b []byte, address CULong, pagesize CULong, mode CULong, bytes
88100
}
89101
}
90102

103+
func newUffdioZero(address, pagesize, mode CULong) UffdioZero {
104+
return UffdioZero{
105+
_range: newUffdioRange(address, pagesize),
106+
mode: mode,
107+
zeropage: 0,
108+
}
109+
}
110+
91111
func getMsgEvent(msg *UffdMsg) CUChar {
92112
return msg.event
93113
}
@@ -120,6 +140,21 @@ func (f Fd) copy(addr, pagesize uintptr, data []byte, mode CULong) error {
120140
return nil
121141
}
122142

143+
func (f Fd) zero(addr, pagesize uintptr, mode CULong) error {
144+
zero := newUffdioZero(CULong(addr)&^CULong(pagesize-1), CULong(pagesize), mode)
145+
146+
if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(f), UFFDIO_ZEROPAGE, uintptr(unsafe.Pointer(&zero))); errno != 0 {
147+
return errno
148+
}
149+
150+
// Check if the bytes actually zeroed out by the kernel match the page size
151+
if zero.zeropage != CLong(pagesize) {
152+
return fmt.Errorf("UFFDIO_ZEROPAGE copied %d bytes, expected %d", zero.zeropage, pagesize)
153+
}
154+
155+
return nil
156+
}
157+
123158
func (f Fd) close() error {
124159
return syscall.Close(int(f))
125160
}

0 commit comments

Comments
 (0)