1

Merge branch 'kvm-memslot-zap-quirk' into HEAD

Today whenever a memslot is moved or deleted, KVM invalidates the entire
page tables and generates fresh ones based on the new memslot layout.

This behavior traditionally was kept because of a bug which was never
fully investigated and caused VM instability with assigned GeForce
GPUs.  It generally does not have a huge overhead, because the old
MMU is able to reuse cached page tables and the new one is more
scalable and can resolve EPT violations/nested page faults in parallel,
but it has worse performance if the guest frequently deletes and
adds small memslots, and it's entirely not viable for TDX.  This is
because TDX requires re-accepting of private pages after page dropping.

For non-TDX VMs, this series therefore introduces the
KVM_X86_QUIRK_SLOT_ZAP_ALL quirk, enabling users to control the behavior
of memslot zapping when a memslot is moved/deleted.  The quirk is turned
on by default, leading to the zapping of all SPTEs when a memslot is
moved/deleted; users however have the option to turn off the quirk,
which limits the zapping only to those SPTEs that lie within the range
of the memslot being moved/deleted.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
Paolo Bonzini 2024-09-12 11:07:15 -04:00
commit 55f50b2f86
7 changed files with 101 additions and 13 deletions

View File

@ -8082,6 +8082,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
guest CPUID on writes to MISC_ENABLE if
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
disabled.
KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
a fast way when a memslot is moved or deleted
and the VM type is KVM_X86_DEFAULT_VM.
When this quirk is disabled or when the VM type
is other than KVM_X86_DEFAULT_VM, KVM zaps
only leaf SPTEs that are within the range of
the memslot being moved or deleted.
=================================== ============================================
7.32 KVM_CAP_MAX_VCPU_ID

View File

@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
KVM_X86_QUIRK_SLOT_ZAP_ALL)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was

View File

@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1

View File

@ -6999,10 +6999,50 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
/*
* Zapping leaf SPTEs with memslot range when a memslot is moved/deleted.
*
* Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
* case scenario we'll have unused shadow pages lying around until they
* are recycled due to age or when the VM is destroyed.
*/
static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_gfn_range range = {
.slot = slot,
.start = slot->base_gfn,
.end = slot->base_gfn + slot->npages,
.may_block = true,
};
bool flush = false;
write_lock(&kvm->mmu_lock);
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap);
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);
if (flush)
kvm_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
{
return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
}
/*
 * Flush MMU state for @slot.  When kvm_memslot_flush_zap_all() returns true
 * everything is zapped via kvm_mmu_zap_all_fast(); otherwise only the leaf
 * SPTEs within @slot's range are zapped.
 */
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	if (!kvm_memslot_flush_zap_all(kvm)) {
		/* Range-limited path: touch only this memslot's leaf SPTEs. */
		kvm_mmu_zap_memslot_leafs(kvm, slot);
		return;
	}

	kvm_mmu_zap_all_fast(kvm);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)

View File

@ -79,6 +79,7 @@ struct test_params {
useconds_t delay;
uint64_t nr_iterations;
bool partition_vcpu_memory_access;
bool disable_slot_zap_quirk;
};
static void run_test(enum vm_guest_mode mode, void *arg)
@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
VM_MEM_SRC_ANONYMOUS,
p->partition_vcpu_memory_access);
#ifdef __x86_64__
if (p->disable_slot_zap_quirk)
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ?
"disabled" : "enabled");
#endif
pr_info("Finished creating vCPUs\n");
@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n"
" [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
guest_modes_help();
printf(" -d: add a delay between each iteration of adding and\n"
" deleting a memslot in usec.\n");
printf(" -q: Disable memslot zap quirk.\n");
printf(" -b: specify the size of the memory region which should be\n"
" accessed by each vCPU. e.g. 10M or 3G.\n"
" Default: 1G\n");
@ -137,7 +146,7 @@ int main(int argc, char *argv[])
guest_modes_append_default();
while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@ -160,6 +169,12 @@ int main(int argc, char *argv[])
case 'i':
p.nr_iterations = atoi_positive("Number of iterations", optarg);
break;
case 'q':
p.disable_slot_zap_quirk = true;
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
KVM_X86_QUIRK_SLOT_ZAP_ALL);
break;
case 'h':
default:
help(argv[0]);

View File

@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
static sem_t vcpu_ready;
static bool map_unmap_verify;
static bool disable_slot_zap_quirk;
static bool verbose;
#define pr_info_v(...) \
@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data,
uint32_t guest_page_size = data->vm->page_size;
uint64_t movesrcgpa, movetestgpa;
if (disable_slot_zap_quirk)
vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
if (isactive) {
@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs)
pr_info(" -h: print this help screen.\n");
pr_info(" -v: enable verbose mode (not for benchmarking).\n");
pr_info(" -d: enable extra debug checks.\n");
pr_info(" -q: Disable memslot zap quirk during memslot move.\n");
pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
targs->nslots);
pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[],
uint32_t max_mem_slots;
int opt;
while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) {
switch (opt) {
case 'h':
default:
@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[],
case 'd':
map_unmap_verify = true;
break;
case 'q':
disable_slot_zap_quirk = true;
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
KVM_X86_QUIRK_SLOT_ZAP_ALL);
break;
case 's':
targs->nslots = atoi_paranoid(optarg);
if (targs->nslots <= 1 && targs->nslots != -1) {

View File

@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void)
GUEST_DONE();
}
static void test_move_memory_region(void)
static void test_move_memory_region(bool disable_slot_zap_quirk)
{
pthread_t vcpu_thread;
struct kvm_vcpu *vcpu;
@ -184,6 +184,9 @@ static void test_move_memory_region(void)
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region);
if (disable_slot_zap_quirk)
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
hva = addr_gpa2hva(vm, MEM_REGION_GPA);
/*
@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void)
GUEST_ASSERT(0);
}
static void test_delete_memory_region(void)
static void test_delete_memory_region(bool disable_slot_zap_quirk)
{
pthread_t vcpu_thread;
struct kvm_vcpu *vcpu;
@ -276,6 +279,9 @@ static void test_delete_memory_region(void)
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region);
if (disable_slot_zap_quirk)
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
/* Delete the memory region, the guest should not die. */
vm_mem_region_delete(vm, MEM_REGION_SLOT);
wait_for_vcpu();
@ -553,7 +559,10 @@ int main(int argc, char *argv[])
{
#ifdef __x86_64__
int i, loops;
int j, disable_slot_zap_quirk = 0;
if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL)
disable_slot_zap_quirk = 1;
/*
* FIXME: the zero-memslot test fails on aarch64 and s390x because
* KVM_RUN fails with ENOEXEC or EFAULT.
@ -579,13 +588,17 @@ int main(int argc, char *argv[])
else
loops = 10;
pr_info("Testing MOVE of in-use region, %d loops\n", loops);
for (j = 0; j <= disable_slot_zap_quirk; j++) {
pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n",
loops, j ? "disabled" : "enabled");
for (i = 0; i < loops; i++)
test_move_memory_region();
test_move_memory_region(!!j);
pr_info("Testing DELETE of in-use region, %d loops\n", loops);
pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n",
loops, j ? "disabled" : "enabled");
for (i = 0; i < loops; i++)
test_delete_memory_region();
test_delete_memory_region(!!j);
}
#endif
return 0;