diff --git a/MAINTAINERS b/MAINTAINERS index d74a07f8b010..c77800006e72 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12463,7 +12463,7 @@ F: virt/kvm/* KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64) M: Marc Zyngier M: Oliver Upton -R: James Morse +R: Joey Gouly R: Suzuki K Poulose R: Zenghui Yu L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 329619c6fa96..94cff508874b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1441,11 +1441,6 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); sign_extend64(__val, id##_##fld##_WIDTH - 1); \ }) -#define expand_field_sign(id, fld, val) \ - (id##_##fld##_SIGNED ? \ - __expand_field_sign_signed(id, fld, val) : \ - __expand_field_sign_unsigned(id, fld, val)) - #define get_idreg_field_unsigned(kvm, id, fld) \ ({ \ u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id); \ @@ -1461,20 +1456,26 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define get_idreg_field_enum(kvm, id, fld) \ get_idreg_field_unsigned(kvm, id, fld) -#define get_idreg_field(kvm, id, fld) \ +#define kvm_cmp_feat_signed(kvm, id, fld, op, limit) \ + (get_idreg_field_signed((kvm), id, fld) op __expand_field_sign_signed(id, fld, limit)) + +#define kvm_cmp_feat_unsigned(kvm, id, fld, op, limit) \ + (get_idreg_field_unsigned((kvm), id, fld) op __expand_field_sign_unsigned(id, fld, limit)) + +#define kvm_cmp_feat(kvm, id, fld, op, limit) \ (id##_##fld##_SIGNED ? \ - get_idreg_field_signed(kvm, id, fld) : \ - get_idreg_field_unsigned(kvm, id, fld)) + kvm_cmp_feat_signed(kvm, id, fld, op, limit) : \ + kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)) #define kvm_has_feat(kvm, id, fld, limit) \ - (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, limit)) + kvm_cmp_feat(kvm, id, fld, >=, limit) #define kvm_has_feat_enum(kvm, id, fld, val) \ - (get_idreg_field_unsigned((kvm), id, fld) == __expand_field_sign_unsigned(id, fld, val)) + kvm_cmp_feat_unsigned(kvm, id, fld, ==, val) #define kvm_has_feat_range(kvm, id, fld, min, max) \ - (get_idreg_field((kvm), id, fld) >= expand_field_sign(id, fld, min) && \ - get_idreg_field((kvm), id, fld) <= expand_field_sign(id, fld, max)) + (kvm_cmp_feat(kvm, id, fld, >=, min) && \ + kvm_cmp_feat(kvm, id, fld, <=, max)) /* Check for a given level of PAuth support */ #define kvm_has_pauth(k, l) \ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 46d52e8a3df3..5310fe1da616 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -338,7 +338,7 @@ static inline void __hyp_sve_save_host(void) struct cpu_sve_state *sve_state = *host_data_ptr(sve_state); sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR); - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), &sve_state->fpsr, true); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 87692b566d90..fefc89209f9e 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -33,7 +33,7 @@ static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) */ sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); __sve_save_state(vcpu_sve_pffr(vcpu), &vcpu->arch.ctxt.fp_regs.fpsr, true); - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); } static void __hyp_sve_restore_host(void) @@ -45,10 +45,11 @@ static void __hyp_sve_restore_host(void) * the host. The layout of the data when saving the sve state depends * on the VL, so use a consistent (i.e., the maximum) host VL. * - * Setting ZCR_EL2 to ZCR_ELx_LEN_MASK sets the effective length - * supported by the system (or limited at EL3). + * Note that this constrains the PE to the maximum shared VL + * that was discovered, if we wish to use larger VLs this will + * need to be revisited. */ - write_sysreg_s(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2); __sve_restore_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl), &sve_state->fpsr, true); @@ -488,7 +489,8 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) case ESR_ELx_EC_SVE: cpacr_clear_set(0, CPACR_ELx_ZEN); isb(); - sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); + sve_cond_update_zcr_vq(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, + SYS_ZCR_EL2); break; case ESR_ELx_EC_IABT_LOW: case ESR_ELx_EC_DABT_LOW: diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 187a5f4d56c0..077d4098548d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -574,12 +574,14 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, unlock: hyp_spin_unlock(&vm_table_lock); - if (ret) + if (ret) { unmap_donated_memory(hyp_vcpu, sizeof(*hyp_vcpu)); + return ret; + } hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); - return ret; + return 0; } static void diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index c02183d3cdd7..ecd58ea9a837 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -26,7 +26,7 @@ void __noreturn machine_real_restart(unsigned int type); #define MRR_APM 1 typedef void (cpu_emergency_virt_cb)(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback); void cpu_emergency_disable_virtualization(void); @@ -34,7 +34,7 @@ void cpu_emergency_disable_virtualization(void); static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {} static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {} static inline void cpu_emergency_disable_virtualization(void) {} -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ typedef void (*nmi_shootdown_cb)(int, struct pt_regs*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0e0a4cf6b5eb..615922838c51 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -530,7 +530,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -600,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void) } #else static void emergency_reboot_disable_virtualization(void) { } -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ void __attribute__((weak)) mach_reboot_fixups(void) { diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 730c2f34d347..f09f13c01c6b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -17,8 +17,8 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM - tristate "Kernel-based Virtual Machine (KVM) support" +config KVM_X86 + def_tristate KVM if KVM_INTEL || KVM_AMD depends on X86_LOCAL_APIC select KVM_COMMON select KVM_GENERIC_MMU_NOTIFIER @@ -44,7 +44,11 @@ config KVM select HAVE_KVM_PM_NOTIFIER if PM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY + select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM select KVM_WERROR if WERROR + +config KVM + tristate "Kernel-based Virtual Machine (KVM) support" help Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -77,7 +81,6 @@ config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM && X86_64 - select KVM_GENERIC_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 5494669a055a..f9dddb8cb466 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -32,7 +32,7 @@ kvm-intel-y += vmx/vmx_onhyperv.o vmx/hyperv_evmcs.o kvm-amd-y += svm/svm_onhyperv.o endif -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_X86) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e52f990548df..a9a23e058555 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1884,10 +1884,14 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) if (is_obsolete_sp((_kvm), (_sp))) { \ } else -#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ - if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else + if ((_sp)->gfn != (_gfn)) {} else + +#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ + for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ + if (!sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -7047,14 +7051,42 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) kvm_mmu_zap_all(kvm); } -/* - * Zapping leaf SPTEs with memslot range when a memslot is moved/deleted. - * - * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required, worst - * case scenario we'll have unused shadow pages lying around until they - * are recycled due to age or when the VM is destroyed. - */ -static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot) +static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, + struct kvm_memory_slot *slot, + bool flush) +{ + LIST_HEAD(invalid_list); + unsigned long i; + + if (list_empty(&kvm->arch.active_mmu_pages)) + goto out_flush; + + /* + * Since accounting information is stored in struct kvm_arch_memory_slot, + * shadow pages deletion (e.g. unaccount_shadowed()) requires that all + * gfns with a shadow page have a corresponding memslot. Do so before + * the memslot goes away. + */ + for (i = 0; i < slot->npages; i++) { + struct kvm_mmu_page *sp; + gfn_t gfn = slot->base_gfn + i; + + for_each_gfn_valid_sp(kvm, sp, gfn) + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); + flush = false; + cond_resched_rwlock_write(&kvm->mmu_lock); + } + } + +out_flush: + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); +} + +static void kvm_mmu_zap_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { struct kvm_gfn_range range = { .slot = slot, @@ -7062,11 +7094,11 @@ static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *s .end = slot->base_gfn + slot->npages, .may_block = true, }; + bool flush; write_lock(&kvm->mmu_lock); - if (kvm_unmap_gfn_range(kvm, &range)) - kvm_flush_remote_tlbs_memslot(kvm, slot); - + flush = kvm_unmap_gfn_range(kvm, &range); + kvm_mmu_zap_memslot_pages_and_flush(kvm, slot, flush); write_unlock(&kvm->mmu_lock); } @@ -7082,7 +7114,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, if (kvm_memslot_flush_zap_all(kvm)) kvm_mmu_zap_all_fast(kvm); else - kvm_mmu_zap_memslot_leafs(kvm, slot); + kvm_mmu_zap_memslot(kvm, slot); } void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index e3343f0df9e1..c81a84990eab 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -169,12 +169,14 @@ int main(int argc, char *argv[]) case 'i': p.nr_iterations = atoi_positive("Number of iterations", optarg); break; +#ifdef __x86_64__ case 'q': p.disable_slot_zap_quirk = true; TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL); break; +#endif case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 893366982f77..989ffe0d047f 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -113,7 +113,9 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless"); static sem_t vcpu_ready; static bool map_unmap_verify; +#ifdef __x86_64__ static bool disable_slot_zap_quirk; +#endif static bool verbose; #define pr_info_v(...) \ @@ -579,8 +581,10 @@ static bool test_memslot_move_prepare(struct vm_data *data, uint32_t guest_page_size = data->vm->page_size; uint64_t movesrcgpa, movetestgpa; +#ifdef __x86_64__ if (disable_slot_zap_quirk) vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL); +#endif movesrcgpa = vm_slot2gpa(data, data->nslots - 1); @@ -971,11 +975,13 @@ static bool parse_args(int argc, char *argv[], case 'd': map_unmap_verify = true; break; +#ifdef __x86_64__ case 'q': disable_slot_zap_quirk = true; TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL); break; +#endif case 's': targs->nslots = atoi_paranoid(optarg); if (targs->nslots <= 1 && targs->nslots != -1) {