diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index 4e51e64aea..ce2e839798 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -25,11 +25,14 @@ import ( var ( // faultBlockSize is the size used for servicing memory faults. // - // This should be large enough to avoid frequent faults and avoid using - // all available KVM slots (~512), but small enough that KVM does not - // complain about slot sizes (~4GB). See handleBluepillFault for how - // this block is used. - faultBlockSize = uintptr(2 << 30) + // This should be large enough so that the total number of slots + // required to cover the 47-bit virtual address space does not exceed + // the KVM slot limit (e.g. 32764). Linux doesn't allocate virtual + // address space above 47-bit by default. + // It must be small enough to limit the memory overhead associated with + // KVM slot allocation. For example, using a 46-bit address space + // results in an overhead of ~250 MB. + faultBlockSize = uintptr(8 << 30) // faultBlockMask is the mask for the fault blocks. // diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 9784e52041..b114dff4ee 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -66,6 +66,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { pSize := uintptr(1) << ring0.PhysicalAddressBits pSize -= reservedMemory + maxUserAddr := uintptr(0) // Add specifically excluded regions; see excludeVirtualRegion. 
if err := applyVirtualRegions(func(vr virtualRegion) { if excludeVirtualRegion(vr) { @@ -81,10 +82,17 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { }) log.Infof("mmio: virtual [%x,%x)", vr.virtual, vr.virtual+vr.length) } + if vr.filename != "[vsyscall]" { + maxUserAddr = vr.region.virtual + vr.region.length + } }); err != nil { panic(fmt.Sprintf("error parsing /proc/self/maps: %v", err)) } + var archRegions []specialVirtualRegion + vSize, archRegions = archSpecialRegions(vSize, maxUserAddr) + specialRegions = append(specialRegions, archRegions...) + // Do we need any more work? if vSize < pSize { return specialRegions @@ -109,7 +117,7 @@ func fillAddressSpace() (specialRegions []specialVirtualRegion) { current := required // Attempted mmap size. filled := uintptr(0) suggestedAddr := uintptr(0) - if ring0.VirtualAddressBits > 48 { + if extendedAddressSpaceAllowed && ring0.VirtualAddressBits > 48 { // Pass a hint address above 47 bits to indicate to the kernel that // we can handle, and want, mappings above 47 bits: // https://docs.kernel.org/arch/x86/x86_64/5level-paging.html#user-space-and-large-virtual-address-space. diff --git a/pkg/sentry/platform/kvm/physical_map_amd64.go b/pkg/sentry/platform/kvm/physical_map_amd64.go index c5adfb577f..d3824e432e 100644 --- a/pkg/sentry/platform/kvm/physical_map_amd64.go +++ b/pkg/sentry/platform/kvm/physical_map_amd64.go @@ -14,9 +14,62 @@ package kvm +import ( + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/ring0" +) + const ( // reservedMemory is a chunk of physical memory reserved starting at // physical address zero. There are some special pages in this region, // so we just call the whole thing off. reservedMemory = 0x100000000 ) + +const ( + // defaultAddressSpaceSize is the default limit for the user virtual + // address space, which is 47-bits (2^47 bytes). The mmap syscall + // respects this limit by default, even with 5-level page tables + // enabled. 
+ defaultAddressSpaceSize = uintptr(1) << 47 + + // extendedAddressSpaceAllowed controls address space usage beyond + // the default 47-bit limit. It is set to 'false' for several reasons: + // * There are no known use cases requiring the extended address space. + // * By restricting the size, we avoid the overhead of: + // a) Aligning the virtual address space size to the physical + // address space size. + // b) Creating unnecessary page table entries for the unused + // extended range. + // * The memory slot size is currently configured only to cover + // the default 47-bit address space. + // * 5-level page table support was primarily introduced to work around + // a specific kernel bug where the vDSO could be mapped above the 47-bit + // boundary (v6.9-rc1~186^2~7). + extendedAddressSpaceAllowed = false +) + +// archSpecialRegions returns special regions that are excluded from the virtual +// address space. Linux doesn't map VMAs above the 47-bit boundary by default. +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + var specialRegions []specialVirtualRegion + if extendedAddressSpaceAllowed || vSize <= defaultAddressSpaceSize { + return vSize, nil + } + // This is a workaround for the kernel bug where the vDSO can be + // mapped above the 47-bit address space boundary. 
+ if defaultAddressSpaceSize > maxUserAddr { + maxUserAddr = defaultAddressSpaceSize + } + r := region{ + virtual: maxUserAddr, + length: ring0.MaximumUserAddress - defaultAddressSpaceSize, + } + specialRegions = append(specialRegions, specialVirtualRegion{ + region: r, + }) + vSize -= r.length + log.Infof("excluded: virtual [%x,%x)", r.virtual, r.virtual+r.length) + + return vSize, specialRegions +} diff --git a/pkg/sentry/platform/kvm/physical_map_arm64.go b/pkg/sentry/platform/kvm/physical_map_arm64.go index 4d85614539..da6105a6e5 100644 --- a/pkg/sentry/platform/kvm/physical_map_arm64.go +++ b/pkg/sentry/platform/kvm/physical_map_arm64.go @@ -16,4 +16,10 @@ package kvm const ( reservedMemory = 0 + // 5-level page tables are not implemented on arm64. + extendedAddressSpaceAllowed = false ) + +func archSpecialRegions(vSize uintptr, maxUserAddr uintptr) (uintptr, []specialVirtualRegion) { + return vSize, nil +}