diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 09126bb8cc9f..1b52b1b7bbc4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2648,6 +2648,23 @@
 			Default is Y (on).
 
+	kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86]
+			If enabled, KVM will enable virtualization in hardware
+			when KVM is loaded, and disable virtualization when KVM
+			is unloaded (if KVM is built as a module).
+
+			If disabled, KVM will dynamically enable and disable
+			virtualization on-demand when creating and destroying
+			VMs, i.e. on the 0=>1 and 1=>0 transitions of the
+			number of VMs.
+
+			Enabling virtualization at module load avoids potential
+			latency for creation of the 0=>1 VM, as KVM serializes
+			virtualization enabling across all online CPUs.  The
+			"cost" of enabling virtualization when KVM is loaded
+			is that doing so may interfere with using out-of-tree
+			hypervisors that want to "own" virtualization hardware.
+
 	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
 			Default is false (don't support).
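As an illustration (not part of the patch itself): like any KVM module parameter, enable_virt_at_load can be given on the kernel command line or, when KVM is modular, at load time:

	kvm.enable_virt_at_load=0		# kernel command line
	modprobe kvm enable_virt_at_load=0	# module load time

Because the parameter is registered with 0444 permissions (see the virt/kvm/kvm_main.c hunk below), the current value can be read back from /sys/module/kvm/parameters/enable_virt_at_load but not changed at runtime.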
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 02880d5552d5..be3c323888b1 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -11,6 +11,8 @@ The acquisition orders for mutexes are as follows:
 
 - cpus_read_lock() is taken outside kvm_lock
 
+- kvm_usage_lock is taken outside cpus_read_lock()
+
 - kvm->lock is taken outside vcpu->mutex
 
 - kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock
@@ -24,6 +26,12 @@ The acquisition orders for mutexes are as follows:
   are taken on the waiting side when modifying memslots, so MMU notifiers
   must not take either kvm->slots_lock or kvm->slots_arch_lock.
 
+cpus_read_lock() vs kvm_lock:
+- Taking cpus_read_lock() outside of kvm_lock is problematic, despite that
+  being the official ordering, as it is quite easy to unknowingly trigger
+  cpus_read_lock() while holding kvm_lock.  Use caution when walking vm_list,
+  e.g. avoid complex operations when possible.
+
 For SRCU:
 
 - ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections
@@ -227,10 +235,16 @@ time it will be set using the Dirty tracking mechanism described above.
 :Type:		mutex
 :Arch:		any
 :Protects:	- vm_list
-		- kvm_usage_count
+
+``kvm_usage_lock``
+^^^^^^^^^^^^^^^^^^
+
+:Type:		mutex
+:Arch:		any
+:Protects:	- kvm_usage_count
 		- hardware virtualization enable/disable
-:Comment:	KVM also disables CPU hotplug via cpus_read_lock() during
-		enable/disable.
+:Comment:	Exists to allow taking cpus_read_lock() while kvm_usage_count is
+		protected, which simplifies the virtualization enabling logic.
 
 ``kvm->mn_invalidate_lock``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -290,11 +304,12 @@ time it will be set using the Dirty tracking mechanism described above.
   wakeup.
 
 ``vendor_module_lock``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^
 :Type:		mutex
 :Arch:		x86
 :Protects:	loading a vendor module (kvm_amd or kvm_intel)
-:Comment:	Exists because using kvm_lock leads to deadlock.  cpu_hotplug_lock is
-    taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and
-    many operations need to take cpu_hotplug_lock when loading a vendor module,
-    e.g. updating static calls.
+:Comment:	Exists because using kvm_lock leads to deadlock.  kvm_lock is taken
+    in notifiers, e.g. __kvmclock_cpufreq_notifier(), that may be invoked while
+    cpu_hotplug_lock is held, e.g. from cpufreq_boost_trigger_state(), and many
+    operations need to take cpu_hotplug_lock when loading a vendor module, e.g.
+    updating static calls.
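To make the new ordering concrete, here is a minimal sketch (illustrative only, not part of the patch; example_usage_lock stands in for kvm_usage_lock) of the sanctioned nesting versus the pattern the comment above warns against:

	#include <linux/cpu.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_usage_lock);	/* stands in for kvm_usage_lock */

	static void example_enable_path(void)
	{
		/* Sanctioned: kvm_usage_lock is taken outside cpus_read_lock(). */
		mutex_lock(&example_usage_lock);
		cpus_read_lock();

		/*
		 * By contrast, acquiring cpus_read_lock(), directly or via a
		 * helper such as cpuhp_setup_state(), while holding kvm_lock
		 * is the easy-to-trip hazard that locking.rst now documents.
		 */

		cpus_read_unlock();
		mutex_unlock(&example_usage_lock);
	}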
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index fe0764173cd0..a0d01c46e408 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2164,7 +2164,7 @@ static void cpu_hyp_uninit(void *discard)
 	}
 }
 
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
 {
 	/*
 	 * Most calls to this function are made with migration
@@ -2184,7 +2184,7 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
 {
 	kvm_timer_cpu_down();
 	kvm_vgic_cpu_down();
@@ -2380,7 +2380,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
 
 	/*
 	 * The stub hypercalls are now disabled, so set our local flag to
-	 * prevent a later re-init attempt in kvm_arch_hardware_enable().
+	 * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
 	 */
 	__this_cpu_write(kvm_hyp_initialized, 1);
 	preempt_enable();
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 844736b99d38..27e9b94c0a0b 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -ENOIOCTLCMD;
 }
 
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
 {
 	unsigned long env, gcfg = 0;
 
@@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
 {
 	write_csr_gcfg(0);
 	write_csr_gstat(0);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6743a57c1ab4..f7222eb594ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -728,8 +728,8 @@ struct kvm_mips_callbacks {
 	int (*handle_fpe)(struct kvm_vcpu *vcpu);
 	int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
 	int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
-	int (*hardware_enable)(void);
-	void (*hardware_disable)(void);
+	int (*enable_virtualization_cpu)(void);
+	void (*disable_virtualization_cpu)(void);
 	int (*check_extension)(struct kvm *kvm, long ext);
 	int (*vcpu_init)(struct kvm_vcpu *vcpu);
 	void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index b5de770b092e..60b43ea85c12 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -125,14 +125,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
 {
-	return kvm_mips_callbacks->hardware_enable();
+	return kvm_mips_callbacks->enable_virtualization_cpu();
 }
 
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
 {
-	kvm_mips_callbacks->hardware_disable();
+	kvm_mips_callbacks->disable_virtualization_cpu();
 }
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 99d5a71e4300..ccab4d76b126 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
 	return ret + 1;
 }
 
-static int kvm_vz_hardware_enable(void)
+static int kvm_vz_enable_virtualization_cpu(void)
 {
 	unsigned int mmu_size, guest_mmu_size, ftlb_size;
 	u64 guest_cvmctl, cvmvmconfig;
@@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void)
 	return 0;
 }
 
-static void kvm_vz_hardware_disable(void)
+static void kvm_vz_disable_virtualization_cpu(void)
 {
 	u64 cvmvmconfig;
 	unsigned int mmu_size;
@@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
 	.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
 	.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
 
-	.hardware_enable = kvm_vz_hardware_enable,
-	.hardware_disable = kvm_vz_hardware_disable,
+	.enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu,
+	.disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu,
 	.check_extension = kvm_vz_check_extension,
 	.vcpu_init = kvm_vz_vcpu_init,
 	.vcpu_uninit = kvm_vz_vcpu_uninit,
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index bab2ec34cd87..f3427f6de608 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -EINVAL;
 }
 
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
 {
 	csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT);
 	csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT);
@@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
 {
 	kvm_riscv_aia_disable();
 
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 68ad4f923664..03b7e13f15bb 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,8 +14,8 @@ BUILD_BUG_ON(1)
 * be __static_call_return0.
 */
KVM_X86_OP(check_processor_compatibility)
-KVM_X86_OP(hardware_enable)
-KVM_X86_OP(hardware_disable)
+KVM_X86_OP(enable_virtualization_cpu)
+KVM_X86_OP(disable_virtualization_cpu)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e4fc362ba3da..52443ccda320 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
 #include <asm/kvm_page_track.h>
 #include <asm/kvm_vcpu_regs.h>
 #include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
 
 #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
 
@@ -1629,8 +1630,10 @@ struct kvm_x86_ops {
 
 	int (*check_processor_compatibility)(void);
 
-	int (*hardware_enable)(void);
-	void (*hardware_disable)(void);
+	int (*enable_virtualization_cpu)(void);
+	void (*disable_virtualization_cpu)(void);
+	cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
+
 	void (*hardware_unsetup)(void);
 	bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
 	void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 6536873f8fc0..d0ef2a678d66 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,8 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
 #define MRR_BIOS	0
 #define MRR_APM		1
 
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
 typedef void (cpu_emergency_virt_cb)(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
 void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
 void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
 void cpu_emergency_disable_virtualization(void);
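A note on the reboot.h hunk above: the typedef moves outside the #if so that the kvm_host.h change can embed a cpu_emergency_virt_cb pointer in struct kvm_x86_ops regardless of whether kvm_intel or kvm_amd is being built. A minimal illustration (the struct name is hypothetical, for demonstration only):

	#include <asm/reboot.h>

	/*
	 * Compiles even when neither CONFIG_KVM_INTEL nor CONFIG_KVM_AMD is
	 * set, because the typedef is now unconditional; only the callback
	 * registration functions remain behind the #if.
	 */
	struct example_ops {
		cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
	};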
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5ab2c92c7331..6b804ec3d1dc 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void)
 	}
 }
 
-static void svm_emergency_disable(void)
+static void svm_emergency_disable_virtualization_cpu(void)
 {
 	kvm_rebooting = true;
 
 	kvm_cpu_svm_disable();
 }
 
-static void svm_hardware_disable(void)
+static void svm_disable_virtualization_cpu(void)
 {
 	/* Make sure we clean up behind us */
 	if (tsc_scaling)
@@ -610,7 +610,7 @@ static void svm_hardware_disable(void)
 	amd_pmu_disable_virt();
 }
 
-static int svm_hardware_enable(void)
+static int svm_enable_virtualization_cpu(void)
 {
 	struct svm_cpu_data *sd;
 
@@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 	 * TSC_AUX is always virtualized for SEV-ES guests when the feature is
 	 * available. The user return MSR support is not required in this case
 	 * because TSC_AUX is restored on #VMEXIT from the host save area
-	 * (which has been initialized in svm_hardware_enable()).
+	 * (which has been initialized in svm_enable_virtualization_cpu()).
	 */
 	if (likely(tsc_aux_uret_slot >= 0) &&
 	    (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
@@ -3144,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 		 * feature is available. The user return MSR support is not
 		 * required in this case because TSC_AUX is restored on #VMEXIT
 		 * from the host save area (which has been initialized in
-		 * svm_hardware_enable()).
+		 * svm_enable_virtualization_cpu()).
		 */
		if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
			break;
@@ -4992,8 +4992,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.check_processor_compatibility = svm_check_processor_compat,
 
 	.hardware_unsetup = svm_hardware_unsetup,
-	.hardware_enable = svm_hardware_enable,
-	.hardware_disable = svm_hardware_disable,
+	.enable_virtualization_cpu = svm_enable_virtualization_cpu,
+	.disable_virtualization_cpu = svm_disable_virtualization_cpu,
+	.emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
 
 	.has_emulated_msr = svm_has_emulated_msr,
 
 	.vcpu_create = svm_vcpu_create,
@@ -5425,8 +5426,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
 static void __svm_exit(void)
 {
 	kvm_x86_vendor_exit();
-
-	cpu_emergency_unregister_virt_callback(svm_emergency_disable);
 }
 
 static int __init svm_init(void)
@@ -5442,8 +5441,6 @@ static int __init svm_init(void)
 	if (r)
 		return r;
 
-	cpu_emergency_register_virt_callback(svm_emergency_disable);
-
 	/*
	 * Common KVM initialization _must_ come last, after this, /dev/kvm is
	 * exposed to userspace!
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0bf35ebe8a1b..7e2e78a14257 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -23,8 +23,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
 
 	.hardware_unsetup = vmx_hardware_unsetup,
 
-	.hardware_enable = vmx_hardware_enable,
-	.hardware_disable = vmx_hardware_disable,
+	.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+	.disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+	.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
 	.has_emulated_msr = vmx_has_emulated_msr,
 
 	.vm_size = sizeof(struct kvm_vmx),
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f18c2d8c7476..89682832dded 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -755,7 +755,7 @@ fault:
 	return -EIO;
 }
 
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
 	struct loaded_vmcs *v;
@@ -2844,7 +2844,7 @@ fault:
 	return -EFAULT;
 }
 
-int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2881,7 +2881,7 @@ static void vmclear_local_loaded_vmcss(void)
 		__loaded_vmcs_clear(v);
 }
 
-void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
 {
 	vmclear_local_loaded_vmcss();
 
@@ -8584,8 +8584,6 @@ static void __vmx_exit(void)
 {
 	allow_smaller_maxphyaddr = false;
 
-	cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
 	vmx_cleanup_l1d_flush();
 }
@@ -8632,8 +8630,6 @@ static int __init vmx_init(void)
 		pi_init_cpu(cpu);
 	}
 
-	cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
 	vmx_check_vmcs12_offsets();
 
 	/*
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3221cd1d01..b6a7cfc6ae31 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -13,8 +13,9 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata;
 
 void vmx_hardware_unsetup(void);
 int vmx_check_processor_compat(void);
-int vmx_hardware_enable(void);
-void vmx_hardware_disable(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
 int vmx_vm_init(struct kvm *kvm);
 void vmx_vm_destroy(struct kvm *kvm);
 int vmx_vcpu_precreate(struct kvm *kvm);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c983c8e434b8..0f7342b574f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -355,7 +355,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
 
 	/*
 	 * Disabling irqs at this point since the following code could be
-	 * interrupted and executed through kvm_arch_hardware_disable()
+	 * interrupted and executed through kvm_arch_disable_virtualization_cpu()
	 */
 	local_irq_save(flags);
 	if (msrs->registered) {
@@ -9753,7 +9753,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 
 	guard(mutex)(&vendor_module_lock);
 
-	if (kvm_x86_ops.hardware_enable) {
+	if (kvm_x86_ops.enable_virtualization_cpu) {
 		pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
 		return -EEXIST;
 	}
@@ -9880,7 +9880,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
 	return 0;
 
 out_unwind_ops:
-	kvm_x86_ops.hardware_enable = NULL;
+	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	kvm_x86_call(hardware_unsetup)();
 out_mmu_exit:
 	kvm_mmu_vendor_module_exit();
@@ -9921,7 +9921,7 @@ void kvm_x86_vendor_exit(void)
 	WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
 #endif
 	mutex_lock(&vendor_module_lock);
-	kvm_x86_ops.hardware_enable = NULL;
+	kvm_x86_ops.enable_virtualization_cpu = NULL;
 	mutex_unlock(&vendor_module_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
@@ -12516,7 +12516,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
 
-int kvm_arch_hardware_enable(void)
+void kvm_arch_enable_virtualization(void)
+{
+	cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+void kvm_arch_disable_virtualization(void)
+{
+	cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+int kvm_arch_enable_virtualization_cpu(void)
 {
 	struct kvm *kvm;
 	struct kvm_vcpu *vcpu;
@@ -12532,7 +12542,7 @@ int kvm_arch_hardware_enable(void)
 	if (ret)
 		return ret;
 
-	ret = kvm_x86_call(hardware_enable)();
+	ret = kvm_x86_call(enable_virtualization_cpu)();
 	if (ret != 0)
 		return ret;
 
@@ -12612,9 +12622,9 @@ int kvm_arch_hardware_enable(void)
 	return 0;
 }
 
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
 {
-	kvm_x86_call(hardware_disable)();
+	kvm_x86_call(disable_virtualization_cpu)();
 	drop_user_return_notifiers();
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0d5125a3e31a..db567d26f7b9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1529,8 +1529,22 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
 #endif
 
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-int kvm_arch_hardware_enable(void);
-void kvm_arch_hardware_disable(void);
+/*
+ * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under
+ * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of
+ * kvm_usage_count, i.e. at the beginning of the generic hardware enabling
+ * sequence, and at the end of the generic hardware disabling sequence.
+ */
+void kvm_arch_enable_virtualization(void);
+void kvm_arch_disable_virtualization(void);
+/*
+ * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to
+ * do the actual twiddling of hardware bits.  The hooks are called on all
+ * online CPUs when KVM enables/disables virtualization, and on a single CPU
+ * when that CPU is onlined/offlined (including for Resume/Suspend).
+ */
+int kvm_arch_enable_virtualization_cpu(void);
+void kvm_arch_disable_virtualization_cpu(void);
 #endif
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
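Putting the two pairs of hooks together, the 0=>1 enable flow implemented in virt/kvm/kvm_main.c below can be summarized as follows (simplified sketch, not verbatim code):

	/*
	 * kvm_enable_virtualization()                  // 0=>1 transition
	 *   guard(mutex)(&kvm_usage_lock);
	 *   kvm_arch_enable_virtualization();          // once, on the current CPU;
	 *                                              // on x86 this registers the
	 *                                              // emergency disable callback
	 *   cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, ...);
	 *                                              // runs kvm_online_cpu(), i.e.
	 *                                              // kvm_arch_enable_virtualization_cpu(),
	 *                                              // on every online CPU
	 *   register_syscore_ops(&kvm_syscore_ops);    // suspend/resume/shutdown hooks
	 *
	 * kvm_disable_virtualization() mirrors this in reverse on the 1=>0
	 * transition.
	 */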
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cb2b78e92910..5f9a66df7bfb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -136,8 +136,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
 #define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
 			.open		= kvm_no_compat_open
 #endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
+static int kvm_enable_virtualization(void);
+static void kvm_disable_virtualization(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
@@ -1220,7 +1220,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_arch_destroy_vm;
 
-	r = hardware_enable_all();
+	r = kvm_enable_virtualization();
 	if (r)
 		goto out_err_no_disable;
 
@@ -1263,7 +1263,7 @@ out_no_coalesced_mmio:
 	mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
 #endif
 out_err_no_mmu_notifier:
-	hardware_disable_all();
+	kvm_disable_virtualization();
 out_err_no_disable:
 	kvm_arch_destroy_vm(kvm);
 out_err_no_arch_destroy_vm:
@@ -1360,7 +1360,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 	kvm_arch_free_vm(kvm);
 	preempt_notifier_dec();
-	hardware_disable_all();
+	kvm_disable_virtualization();
 	mmdrop(mm);
 }
 
@@ -5571,137 +5571,67 @@ static struct miscdevice kvm_dev = {
 };
 
 #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+static bool enable_virt_at_load = true;
+module_param(enable_virt_at_load, bool, 0444);
+
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
 
-static DEFINE_PER_CPU(bool, hardware_enabled);
+static DEFINE_PER_CPU(bool, virtualization_enabled);
+static DEFINE_MUTEX(kvm_usage_lock);
 static int kvm_usage_count;
 
-static int __hardware_enable_nolock(void)
+__weak void kvm_arch_enable_virtualization(void)
 {
-	if (__this_cpu_read(hardware_enabled))
+
+}
+
+__weak void kvm_arch_disable_virtualization(void)
+{
+
+}
+
+static int kvm_enable_virtualization_cpu(void)
+{
+	if (__this_cpu_read(virtualization_enabled))
 		return 0;
 
-	if (kvm_arch_hardware_enable()) {
+	if (kvm_arch_enable_virtualization_cpu()) {
 		pr_info("kvm: enabling virtualization on CPU%d failed\n",
 			raw_smp_processor_id());
 		return -EIO;
 	}
 
-	__this_cpu_write(hardware_enabled, true);
+	__this_cpu_write(virtualization_enabled, true);
 	return 0;
 }
 
-static void hardware_enable_nolock(void *failed)
-{
-	if (__hardware_enable_nolock())
-		atomic_inc(failed);
-}
-
 static int kvm_online_cpu(unsigned int cpu)
 {
-	int ret = 0;
-
 	/*
	 * Abort the CPU online process if hardware virtualization cannot
	 * be enabled. Otherwise running VMs would encounter unrecoverable
	 * errors when scheduled to this CPU.
	 */
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		ret = __hardware_enable_nolock();
-	mutex_unlock(&kvm_lock);
-	return ret;
+	return kvm_enable_virtualization_cpu();
 }
 
-static void hardware_disable_nolock(void *junk)
+static void kvm_disable_virtualization_cpu(void *ign)
 {
-	/*
-	 * Note, hardware_disable_all_nolock() tells all online CPUs to disable
-	 * hardware, not just CPUs that successfully enabled hardware!
-	 */
-	if (!__this_cpu_read(hardware_enabled))
+	if (!__this_cpu_read(virtualization_enabled))
 		return;
 
-	kvm_arch_hardware_disable();
+	kvm_arch_disable_virtualization_cpu();
 
-	__this_cpu_write(hardware_enabled, false);
+	__this_cpu_write(virtualization_enabled, false);
 }
 
 static int kvm_offline_cpu(unsigned int cpu)
 {
-	mutex_lock(&kvm_lock);
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
-	mutex_unlock(&kvm_lock);
+	kvm_disable_virtualization_cpu(NULL);
 	return 0;
 }
 
-static void hardware_disable_all_nolock(void)
-{
-	BUG_ON(!kvm_usage_count);
-
-	kvm_usage_count--;
-	if (!kvm_usage_count)
-		on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
-{
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-	hardware_disable_all_nolock();
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
-	atomic_t failed = ATOMIC_INIT(0);
-	int r;
-
-	/*
-	 * Do not enable hardware virtualization if the system is going down.
-	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
-	 * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
-	 * after kvm_reboot() is called.  Note, this relies on system_state
-	 * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
-	 * hook instead of registering a dedicated reboot notifier (the latter
-	 * runs before system_state is updated).
-	 */
-	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
-	    system_state == SYSTEM_RESTART)
-		return -EBUSY;
-
-	/*
-	 * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
-	 * is called, and so on_each_cpu() between them includes the CPU that
-	 * is being onlined.  As a result, hardware_enable_nolock() may get
-	 * invoked before kvm_online_cpu(), which also enables hardware if the
-	 * usage count is non-zero.  Disable CPU hotplug to avoid attempting to
-	 * enable hardware multiple times.
-	 */
-	cpus_read_lock();
-	mutex_lock(&kvm_lock);
-
-	r = 0;
-
-	kvm_usage_count++;
-	if (kvm_usage_count == 1) {
-		on_each_cpu(hardware_enable_nolock, &failed, 1);
-
-		if (atomic_read(&failed)) {
-			hardware_disable_all_nolock();
-			r = -EBUSY;
-		}
-	}
-
-	mutex_unlock(&kvm_lock);
-	cpus_read_unlock();
-
-	return r;
-}
-
 static void kvm_shutdown(void)
 {
 	/*
@@ -5717,34 +5647,32 @@ static void kvm_shutdown(void)
	 */
 	pr_info("kvm: exiting hardware virtualization\n");
 	kvm_rebooting = true;
-	on_each_cpu(hardware_disable_nolock, NULL, 1);
+	on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
 }
 
 static int kvm_suspend(void)
 {
 	/*
	 * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
-	 * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
-	 * is stable.  Assert that kvm_lock is not held to ensure the system
-	 * isn't suspended while KVM is enabling hardware.  Hardware enabling
-	 * can be preempted, but the task cannot be frozen until it has dropped
-	 * all locks (userspace tasks are frozen via a fake signal).
+	 * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
+	 * count is stable.  Assert that kvm_usage_lock is not held to ensure
+	 * the system isn't suspended while KVM is enabling hardware.  Hardware
+	 * enabling can be preempted, but the task cannot be frozen until it has
+	 * dropped all locks (userspace tasks are frozen via a fake signal).
	 */
-	lockdep_assert_not_held(&kvm_lock);
+	lockdep_assert_not_held(&kvm_usage_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		hardware_disable_nolock(NULL);
+	kvm_disable_virtualization_cpu(NULL);
 	return 0;
 }
 
 static void kvm_resume(void)
 {
-	lockdep_assert_not_held(&kvm_lock);
+	lockdep_assert_not_held(&kvm_usage_lock);
 	lockdep_assert_irqs_disabled();
 
-	if (kvm_usage_count)
-		WARN_ON_ONCE(__hardware_enable_nolock());
+	WARN_ON_ONCE(kvm_enable_virtualization_cpu());
 }
 
 static struct syscore_ops kvm_syscore_ops = {
@@ -5752,13 +5680,95 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 	.shutdown = kvm_shutdown,
 };
+
+static int kvm_enable_virtualization(void)
+{
+	int r;
+
+	guard(mutex)(&kvm_usage_lock);
+
+	if (kvm_usage_count++)
+		return 0;
+
+	kvm_arch_enable_virtualization();
+
+	r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+			      kvm_online_cpu, kvm_offline_cpu);
+	if (r)
+		goto err_cpuhp;
+
+	register_syscore_ops(&kvm_syscore_ops);
+
+	/*
+	 * Undo virtualization enabling and bail if the system is going down.
+	 * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+	 * possible for an in-flight operation to enable virtualization after
+	 * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+	 * invoked.  Note, this relies on system_state being set _before_
+	 * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+	 * or this CPU observes the impending shutdown.  Which is why KVM uses
+	 * a syscore ops hook instead of registering a dedicated reboot
+	 * notifier (the latter runs before system_state is updated).
+	 */
+	if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+	    system_state == SYSTEM_RESTART) {
+		r = -EBUSY;
+		goto err_rebooting;
+	}
+
+	return 0;
+
+err_rebooting:
+	unregister_syscore_ops(&kvm_syscore_ops);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+err_cpuhp:
+	kvm_arch_disable_virtualization();
+	--kvm_usage_count;
+	return r;
+}
+
+static void kvm_disable_virtualization(void)
+{
+	guard(mutex)(&kvm_usage_lock);
+
+	if (--kvm_usage_count)
+		return;
+
+	unregister_syscore_ops(&kvm_syscore_ops);
+	cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+	kvm_arch_disable_virtualization();
+}
+
+static int kvm_init_virtualization(void)
+{
+	if (enable_virt_at_load)
+		return kvm_enable_virtualization();
+
+	return 0;
+}
+
+static void kvm_uninit_virtualization(void)
+{
+	if (enable_virt_at_load)
+		kvm_disable_virtualization();
+}
 #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int hardware_enable_all(void)
+static int kvm_enable_virtualization(void)
 {
 	return 0;
 }
 
-static void hardware_disable_all(void)
+static int kvm_init_virtualization(void)
+{
+	return 0;
+}
+
+static void kvm_disable_virtualization(void)
+{
+
+}
+
+static void kvm_uninit_virtualization(void)
 {
 
 }
@@ -6460,15 +6470,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	int r;
 	int cpu;
 
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
-				      kvm_online_cpu, kvm_offline_cpu);
-	if (r)
-		return r;
-
-	register_syscore_ops(&kvm_syscore_ops);
-#endif
-
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
 	if (!vcpu_align)
 		vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6479,10 +6480,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 					   offsetofend(struct kvm_vcpu, stats_id)
 					   - offsetof(struct kvm_vcpu, arch),
 					   NULL);
-	if (!kvm_vcpu_cache) {
-		r = -ENOMEM;
-		goto err_vcpu_cache;
-	}
+	if (!kvm_vcpu_cache)
+		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
 		if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6516,6 +6515,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 
 	kvm_gmem_init(module);
 
+	r = kvm_init_virtualization();
+	if (r)
+		goto err_virt;
+
 	/*
	 * Registration _must_ be the very last thing done, as this exposes
	 * /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6529,6 +6532,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
 	return 0;
 
 err_register:
+	kvm_uninit_virtualization();
+err_virt:
 	kvm_vfio_ops_exit();
 err_vfio:
 	kvm_async_pf_deinit();
@@ -6539,11 +6544,6 @@ err_cpu_kick_mask:
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -6559,16 +6559,14 @@ void kvm_exit(void)
	 */
 	misc_deregister(&kvm_dev);
 
+	kvm_uninit_virtualization();
+
 	debugfs_remove_recursive(kvm_debugfs_dir);
 	for_each_possible_cpu(cpu)
 		free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
 	kmem_cache_destroy(kvm_vcpu_cache);
 	kvm_vfio_ops_exit();
 	kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-	unregister_syscore_ops(&kvm_syscore_ops);
-	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
 	kvm_irqfd_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
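As a closing illustration (not part of the patch), a tiny userspace program can confirm whether virtualization was enabled at load; the sysfs path assumes KVM is built as the kvm module and that the kernel carries this change:

	#include <stdio.h>

	int main(void)
	{
		/* 0444 module param: readable, but not writable at runtime. */
		FILE *f = fopen("/sys/module/kvm/parameters/enable_virt_at_load", "r");
		int c;

		if (!f) {
			perror("enable_virt_at_load");	/* e.g. kernel without this patch */
			return 1;
		}
		c = fgetc(f);
		fclose(f);
		printf("kvm.enable_virt_at_load = %c\n", c);	/* 'Y' or 'N' */
		return 0;
	}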