Merge branch 'kvm-memslot-zap-quirk' into HEAD
Today whenever a memslot is moved or deleted, KVM invalidates the entire page tables and generates fresh ones based on the new memslot layout. This behavior traditionally was kept because of a bug which was never fully investigated and caused VM instability with assigned GeForce GPUs. It generally does not have a huge overhead, because the old MMU is able to reuse cached page tables and the new one is more scalable and can resolve EPT violations/nested page faults in parallel, but it has worse performance if the guest frequently deletes and adds small memslots, and it's not viable at all for TDX. This is because TDX requires re-accepting of private pages after page dropping. For non-TDX VMs, this series therefore introduces the KVM_X86_QUIRK_SLOT_ZAP_ALL quirk, enabling users to control the behavior of memslot zapping when a memslot is moved/deleted. The quirk is turned on by default, leading to the zapping of all SPTEs when a memslot is moved/deleted; users however have the option to turn off the quirk, which limits the zapping only to those SPTEs that lie within the range of the memslot being moved/deleted. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
commit
55f50b2f86
@ -8082,6 +8082,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
|
||||
guest CPUID on writes to MISC_ENABLE if
|
||||
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
|
||||
disabled.
|
||||
|
||||
KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
|
||||
fast way for memslot deletion when VM type
|
||||
is KVM_X86_DEFAULT_VM.
|
||||
When this quirk is disabled or when VM type
|
||||
is other than KVM_X86_DEFAULT_VM, KVM zaps
|
||||
only leaf SPTEs that are within the range of
|
||||
the memslot being deleted.
|
||||
=================================== ============================================
|
||||
|
||||
7.32 KVM_CAP_MAX_VCPU_ID
|
||||
|
@ -2345,7 +2345,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
|
||||
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
|
||||
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
|
||||
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
|
||||
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
|
||||
KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
|
||||
KVM_X86_QUIRK_SLOT_ZAP_ALL)
|
||||
|
||||
/*
|
||||
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
|
||||
|
@ -439,6 +439,7 @@ struct kvm_sync_regs {
|
||||
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
|
||||
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
|
||||
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
|
||||
#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
|
||||
|
||||
#define KVM_STATE_NESTED_FORMAT_VMX 0
|
||||
#define KVM_STATE_NESTED_FORMAT_SVM 1
|
||||
|
@ -6999,10 +6999,50 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
|
||||
kvm_mmu_zap_all(kvm);
|
||||
}
|
||||
|
||||
/*
|
||||
* Zap only the leaf SPTEs within the memslot's range when a memslot is moved/deleted.
|
||||
*
|
||||
* Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst
|
||||
* case scenario we'll have unused shadow pages lying around until they
|
||||
* are recycled due to age or when the VM is destroyed.
|
||||
*/
|
||||
static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
|
||||
{
|
||||
struct kvm_gfn_range range = {
|
||||
.slot = slot,
|
||||
.start = slot->base_gfn,
|
||||
.end = slot->base_gfn + slot->npages,
|
||||
.may_block = true,
|
||||
};
|
||||
bool flush = false;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
|
||||
if (kvm_memslots_have_rmaps(kvm))
|
||||
flush = kvm_handle_gfn_range(kvm, &range, kvm_zap_rmap);
|
||||
|
||||
if (tdp_mmu_enabled)
|
||||
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, &range, flush);
|
||||
|
||||
if (flush)
|
||||
kvm_flush_remote_tlbs_memslot(kvm, slot);
|
||||
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
}
|
||||
|
||||
static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
|
||||
{
|
||||
return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
|
||||
kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
}
|
||||
|
||||
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
|
||||
struct kvm_memory_slot *slot)
|
||||
{
|
||||
kvm_mmu_zap_all_fast(kvm);
|
||||
if (kvm_memslot_flush_zap_all(kvm))
|
||||
kvm_mmu_zap_all_fast(kvm);
|
||||
else
|
||||
kvm_mmu_zap_memslot_leafs(kvm, slot);
|
||||
}
|
||||
|
||||
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
|
||||
|
@ -79,6 +79,7 @@ struct test_params {
|
||||
useconds_t delay;
|
||||
uint64_t nr_iterations;
|
||||
bool partition_vcpu_memory_access;
|
||||
bool disable_slot_zap_quirk;
|
||||
};
|
||||
|
||||
static void run_test(enum vm_guest_mode mode, void *arg)
|
||||
@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
|
||||
vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
|
||||
VM_MEM_SRC_ANONYMOUS,
|
||||
p->partition_vcpu_memory_access);
|
||||
#ifdef __x86_64__
|
||||
if (p->disable_slot_zap_quirk)
|
||||
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
|
||||
pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ?
|
||||
"disabled" : "enabled");
|
||||
#endif
|
||||
|
||||
pr_info("Finished creating vCPUs\n");
|
||||
|
||||
@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
|
||||
static void help(char *name)
|
||||
{
|
||||
puts("");
|
||||
printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
|
||||
printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n"
|
||||
" [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
|
||||
guest_modes_help();
|
||||
printf(" -d: add a delay between each iteration of adding and\n"
|
||||
" deleting a memslot in usec.\n");
|
||||
printf(" -q: Disable memslot zap quirk.\n");
|
||||
printf(" -b: specify the size of the memory region which should be\n"
|
||||
" accessed by each vCPU. e.g. 10M or 3G.\n"
|
||||
" Default: 1G\n");
|
||||
@ -137,7 +146,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
guest_modes_append_default();
|
||||
|
||||
while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) {
|
||||
switch (opt) {
|
||||
case 'm':
|
||||
guest_modes_cmdline(optarg);
|
||||
@ -160,6 +169,12 @@ int main(int argc, char *argv[])
|
||||
case 'i':
|
||||
p.nr_iterations = atoi_positive("Number of iterations", optarg);
|
||||
break;
|
||||
case 'q':
|
||||
p.disable_slot_zap_quirk = true;
|
||||
|
||||
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
|
||||
KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
break;
|
||||
case 'h':
|
||||
default:
|
||||
help(argv[0]);
|
||||
|
@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
|
||||
static sem_t vcpu_ready;
|
||||
|
||||
static bool map_unmap_verify;
|
||||
static bool disable_slot_zap_quirk;
|
||||
|
||||
static bool verbose;
|
||||
#define pr_info_v(...) \
|
||||
@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data,
|
||||
uint32_t guest_page_size = data->vm->page_size;
|
||||
uint64_t movesrcgpa, movetestgpa;
|
||||
|
||||
if (disable_slot_zap_quirk)
|
||||
vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
|
||||
movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
|
||||
|
||||
if (isactive) {
|
||||
@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs)
|
||||
pr_info(" -h: print this help screen.\n");
|
||||
pr_info(" -v: enable verbose mode (not for benchmarking).\n");
|
||||
pr_info(" -d: enable extra debug checks.\n");
|
||||
pr_info(" -q: Disable memslot zap quirk during memslot move.\n");
|
||||
pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
|
||||
targs->nslots);
|
||||
pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
|
||||
@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[],
|
||||
uint32_t max_mem_slots;
|
||||
int opt;
|
||||
|
||||
while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
|
||||
while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) {
|
||||
switch (opt) {
|
||||
case 'h':
|
||||
default:
|
||||
@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[],
|
||||
case 'd':
|
||||
map_unmap_verify = true;
|
||||
break;
|
||||
case 'q':
|
||||
disable_slot_zap_quirk = true;
|
||||
TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
|
||||
KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
break;
|
||||
case 's':
|
||||
targs->nslots = atoi_paranoid(optarg);
|
||||
if (targs->nslots <= 1 && targs->nslots != -1) {
|
||||
|
@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void)
|
||||
GUEST_DONE();
|
||||
}
|
||||
|
||||
static void test_move_memory_region(void)
|
||||
static void test_move_memory_region(bool disable_slot_zap_quirk)
|
||||
{
|
||||
pthread_t vcpu_thread;
|
||||
struct kvm_vcpu *vcpu;
|
||||
@ -184,6 +184,9 @@ static void test_move_memory_region(void)
|
||||
|
||||
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region);
|
||||
|
||||
if (disable_slot_zap_quirk)
|
||||
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
|
||||
hva = addr_gpa2hva(vm, MEM_REGION_GPA);
|
||||
|
||||
/*
|
||||
@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void)
|
||||
GUEST_ASSERT(0);
|
||||
}
|
||||
|
||||
static void test_delete_memory_region(void)
|
||||
static void test_delete_memory_region(bool disable_slot_zap_quirk)
|
||||
{
|
||||
pthread_t vcpu_thread;
|
||||
struct kvm_vcpu *vcpu;
|
||||
@ -276,6 +279,9 @@ static void test_delete_memory_region(void)
|
||||
|
||||
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region);
|
||||
|
||||
if (disable_slot_zap_quirk)
|
||||
vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
|
||||
|
||||
/* Delete the memory region, the guest should not die. */
|
||||
vm_mem_region_delete(vm, MEM_REGION_SLOT);
|
||||
wait_for_vcpu();
|
||||
@ -553,7 +559,10 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
#ifdef __x86_64__
|
||||
int i, loops;
|
||||
int j, disable_slot_zap_quirk = 0;
|
||||
|
||||
if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL)
|
||||
disable_slot_zap_quirk = 1;
|
||||
/*
|
||||
* FIXME: the zero-memslot test fails on aarch64 and s390x because
|
||||
* KVM_RUN fails with ENOEXEC or EFAULT.
|
||||
@ -579,13 +588,17 @@ int main(int argc, char *argv[])
|
||||
else
|
||||
loops = 10;
|
||||
|
||||
pr_info("Testing MOVE of in-use region, %d loops\n", loops);
|
||||
for (i = 0; i < loops; i++)
|
||||
test_move_memory_region();
|
||||
for (j = 0; j <= disable_slot_zap_quirk; j++) {
|
||||
pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n",
|
||||
loops, j ? "disabled" : "enabled");
|
||||
for (i = 0; i < loops; i++)
|
||||
test_move_memory_region(!!j);
|
||||
|
||||
pr_info("Testing DELETE of in-use region, %d loops\n", loops);
|
||||
for (i = 0; i < loops; i++)
|
||||
test_delete_memory_region();
|
||||
pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n",
|
||||
loops, j ? "disabled" : "enabled");
|
||||
for (i = 0; i < loops; i++)
|
||||
test_delete_memory_region(!!j);
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
Loading…
Reference in New Issue
Block a user