KVM x86 MTRR virtualization removal
Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD hack, and instead always honor guest PAT on CPUs that support self-snoop. -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmaRuwAACgkQOlYIJqCj N/32Gg/+Nnnz6TCRno2vursPJme7gvtLdqSxjazAj3u2ZO8IApGYWMyfVpS+ymC9 Wdpj6gRe2ukSxgTsUI2CYoy5V2NxDaA9YgdTPZUVQvqwujVrqZCJ7L393iPYYnC9 No3LXZ+SOYRmomiCzknjC6GOlT2hAZHzQsyaXDlEYok7NAA2L6XybbLonEdA4RYi V1mS62W5PaA4tUesuxkJjPujXo1nXRWD/aXOruJWjPESdSFSALlx7reFAf2Nwn7K Uw8yZqhq6vWAZSph0Nz8OrZOS/kULKA3q2zl1B/qJJ0ToAt2VdXS6abXky52RExf KvP+jBAWMO5kHbIqaMRtCHjbIkbhH8RdUIYNJQEUQ5DdydM5+/RDa+KprmLPcmUn qvJq+3uyH0MEENtneGegs8uxR+sn6fT32cGMIw790yIywddh562+IJ4Z+C3BuYJi yszD71odqKT8+knUd2CaZjE9UZyoQNDfj2OCCTzzZOC/6TuJWCh9CYQ1csssHbQR KcvZCKE6ht8tWwi+2HWj0laOdg1reX2kV869k3xH4uCwEaFIj2Wk+/Bw/lg2Tn5h 5uTnQ01dx5XhAV1klr6IY3VXJ/A8G8895wRfkZEelsA9Wj8qZvNgXhsoXReIUIrn aR0ppsFcbqHzC50qE2JT4juTD1EPx95LL9zKT8pI9mGKwxCAxUM= =yb10 -----END PGP SIGNATURE----- Merge tag 'kvm-x86-mtrrs-6.11' of https://github.com/kvm-x86/linux into HEAD KVM x86 MTRR virtualization removal Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD hack, and instead always honor guest PAT on CPUs that support self-snoop.
This commit is contained in:
commit
5c5ddf7107
@ -8025,7 +8025,11 @@ The valid bits in cap.args[0] are:
|
||||
When this quirk is disabled, the reset value
|
||||
is 0x10000 (APIC_LVT_MASKED).
|
||||
|
||||
KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW.
|
||||
KVM_X86_QUIRK_CD_NW_CLEARED By default, KVM clears CR0.CD and CR0.NW on
|
||||
AMD CPUs to work around buggy guest firmware
|
||||
that runs in perpetuity with CR0.CD, i.e.
|
||||
with caches in "no fill" mode.
|
||||
|
||||
When this quirk is disabled, KVM does not
|
||||
change the value of CR0.CD and CR0.NW.
|
||||
|
||||
|
@ -48,3 +48,21 @@ have the same physical APIC ID, KVM will deliver events targeting that APIC ID
|
||||
only to the vCPU with the lowest vCPU ID. If KVM_X2APIC_API_USE_32BIT_IDS is
|
||||
not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs
|
||||
matching the target APIC ID receive the interrupt).
|
||||
|
||||
MTRRs
|
||||
-----
|
||||
KVM does not virtualize guest MTRR memory types. KVM emulates accesses to MTRR
|
||||
MSRs, i.e. {RD,WR}MSR in the guest will behave as expected, but KVM does not
|
||||
honor guest MTRRs when determining the effective memory type, and instead
|
||||
treats all of guest memory as having Writeback (WB) MTRRs.
|
||||
|
||||
CR0.CD
|
||||
------
|
||||
KVM does not virtualize CR0.CD on Intel CPUs. Similar to MTRR MSRs, KVM
|
||||
emulates CR0.CD accesses so that loads and stores from/to CR0 behave as
|
||||
expected, but setting CR0.CD=1 has no impact on the cacheability of guest
|
||||
memory.
|
||||
|
||||
Note, this erratum does not affect AMD CPUs, which fully virtualize CR0.CD in
|
||||
hardware, i.e. put the CPU caches into "no fill" mode when CR0.CD=1, even when
|
||||
running in the guest.
|
@ -160,7 +160,6 @@
|
||||
#define KVM_MIN_FREE_MMU_PAGES 5
|
||||
#define KVM_REFILL_PAGES 25
|
||||
#define KVM_MAX_CPUID_ENTRIES 256
|
||||
#define KVM_NR_FIXED_MTRR_REGION 88
|
||||
#define KVM_NR_VAR_MTRR 8
|
||||
|
||||
#define ASYNC_PF_PER_VCPU 64
|
||||
@ -605,18 +604,12 @@ enum {
|
||||
KVM_DEBUGREG_WONT_EXIT = 2,
|
||||
};
|
||||
|
||||
struct kvm_mtrr_range {
|
||||
u64 base;
|
||||
u64 mask;
|
||||
struct list_head node;
|
||||
};
|
||||
|
||||
struct kvm_mtrr {
|
||||
struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
|
||||
mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
|
||||
u64 var[KVM_NR_VAR_MTRR * 2];
|
||||
u64 fixed_64k;
|
||||
u64 fixed_16k[2];
|
||||
u64 fixed_4k[8];
|
||||
u64 deftype;
|
||||
|
||||
struct list_head head;
|
||||
};
|
||||
|
||||
/* Hyper-V SynIC timer */
|
||||
|
@ -221,12 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
|
||||
return -(u32)fault & errcode;
|
||||
}
|
||||
|
||||
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
|
||||
|
||||
static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
|
||||
{
|
||||
return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
|
||||
}
|
||||
bool kvm_mmu_may_ignore_guest_pat(void);
|
||||
|
||||
int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
|
||||
|
||||
|
@ -4671,38 +4671,23 @@ out_unlock:
|
||||
}
|
||||
#endif
|
||||
|
||||
bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
|
||||
bool kvm_mmu_may_ignore_guest_pat(void)
|
||||
{
|
||||
/*
|
||||
* If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
|
||||
* VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
|
||||
* to honor the memtype from the guest's MTRRs so that guest accesses
|
||||
* to memory that is DMA'd aren't cached against the guest's wishes.
|
||||
*
|
||||
* Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
|
||||
* e.g. KVM will force UC memtype for host MMIO.
|
||||
* When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
|
||||
* not support self-snoop (or is affected by an erratum), and the VM
|
||||
* has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
|
||||
* honor the memtype from the guest's PAT so that guest accesses to
|
||||
* memory that is DMA'd aren't cached against the guest's wishes. As a
|
||||
* result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
|
||||
* KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
|
||||
* bits in response to non-coherent device (un)registration.
|
||||
*/
|
||||
return vm_has_noncoherent_dma && shadow_memtype_mask;
|
||||
return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
|
||||
}
|
||||
|
||||
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
|
||||
{
|
||||
/*
|
||||
* If the guest's MTRRs may be used to compute the "real" memtype,
|
||||
* restrict the mapping level to ensure KVM uses a consistent memtype
|
||||
* across the entire mapping.
|
||||
*/
|
||||
if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
|
||||
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
|
||||
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
|
||||
gfn_t base = gfn_round_for_level(fault->gfn,
|
||||
fault->max_level);
|
||||
|
||||
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
if (tdp_mmu_enabled)
|
||||
return kvm_tdp_mmu_page_fault(vcpu, fault);
|
||||
|
@ -19,33 +19,21 @@
|
||||
#include <asm/mtrr.h>
|
||||
|
||||
#include "cpuid.h"
|
||||
#include "mmu.h"
|
||||
|
||||
#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
|
||||
#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
|
||||
#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
|
||||
|
||||
static bool is_mtrr_base_msr(unsigned int msr)
|
||||
static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
|
||||
{
|
||||
/* MTRR base MSRs use even numbers, masks use odd numbers. */
|
||||
return !(msr & 0x1);
|
||||
}
|
||||
int index;
|
||||
|
||||
static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
|
||||
unsigned int msr)
|
||||
{
|
||||
int index = (msr - MTRRphysBase_MSR(0)) / 2;
|
||||
|
||||
return &vcpu->arch.mtrr_state.var_ranges[index];
|
||||
}
|
||||
|
||||
static bool msr_mtrr_valid(unsigned msr)
|
||||
{
|
||||
switch (msr) {
|
||||
case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
|
||||
index = msr - MTRRphysBase_MSR(0);
|
||||
return &vcpu->arch.mtrr_state.var[index];
|
||||
case MSR_MTRRfix64K_00000:
|
||||
return &vcpu->arch.mtrr_state.fixed_64k;
|
||||
case MSR_MTRRfix16K_80000:
|
||||
case MSR_MTRRfix16K_A0000:
|
||||
index = msr - MSR_MTRRfix16K_80000;
|
||||
return &vcpu->arch.mtrr_state.fixed_16k[index];
|
||||
case MSR_MTRRfix4K_C0000:
|
||||
case MSR_MTRRfix4K_C8000:
|
||||
case MSR_MTRRfix4K_D0000:
|
||||
@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
|
||||
case MSR_MTRRfix4K_E8000:
|
||||
case MSR_MTRRfix4K_F0000:
|
||||
case MSR_MTRRfix4K_F8000:
|
||||
index = msr - MSR_MTRRfix4K_C0000;
|
||||
return &vcpu->arch.mtrr_state.fixed_4k[index];
|
||||
case MSR_MTRRdefType:
|
||||
return true;
|
||||
return &vcpu->arch.mtrr_state.deftype;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool valid_mtrr_type(unsigned t)
|
||||
@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
int i;
|
||||
u64 mask;
|
||||
|
||||
if (!msr_mtrr_valid(msr))
|
||||
return false;
|
||||
|
||||
if (msr == MSR_MTRRdefType) {
|
||||
if (data & ~0xcff)
|
||||
return false;
|
||||
@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
}
|
||||
|
||||
/* variable MTRRs */
|
||||
WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
|
||||
msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
|
||||
if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
|
||||
msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
|
||||
return false;
|
||||
|
||||
mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
|
||||
if ((msr & 1) == 0) {
|
||||
@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
if (!valid_mtrr_type(data & 0xff))
|
||||
return false;
|
||||
mask |= 0xf00;
|
||||
} else
|
||||
} else {
|
||||
/* MTRR mask */
|
||||
mask |= 0x7ff;
|
||||
}
|
||||
|
||||
return (data & mask) == 0;
|
||||
}
|
||||
|
||||
static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
|
||||
{
|
||||
return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
|
||||
}
|
||||
|
||||
static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
|
||||
{
|
||||
return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
|
||||
}
|
||||
|
||||
static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
|
||||
{
|
||||
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
|
||||
}
|
||||
|
||||
static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
/*
|
||||
* Intel SDM 11.11.2.2: all MTRRs are disabled when
|
||||
* IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
|
||||
* memory type is applied to all of physical memory.
|
||||
*
|
||||
* However, virtual machines can be run with CPUID such that
|
||||
* there are no MTRRs. In that case, the firmware will never
|
||||
* enable MTRRs and it is obviously undesirable to run the
|
||||
* guest entirely with UC memory and we use WB.
|
||||
*/
|
||||
if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
|
||||
return MTRR_TYPE_UNCACHABLE;
|
||||
else
|
||||
return MTRR_TYPE_WRBACK;
|
||||
}
|
||||
|
||||
/*
|
||||
* Three terms are used in the following code:
|
||||
* - segment, it indicates the address segments covered by fixed MTRRs.
|
||||
* - unit, it corresponds to the MSR entry in the segment.
|
||||
* - range, a range is covered in one memory cache type.
|
||||
*/
|
||||
struct fixed_mtrr_segment {
|
||||
u64 start;
|
||||
u64 end;
|
||||
|
||||
int range_shift;
|
||||
|
||||
/* the start position in kvm_mtrr.fixed_ranges[]. */
|
||||
int range_start;
|
||||
};
|
||||
|
||||
static struct fixed_mtrr_segment fixed_seg_table[] = {
|
||||
/* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
|
||||
{
|
||||
.start = 0x0,
|
||||
.end = 0x80000,
|
||||
.range_shift = 16, /* 64K */
|
||||
.range_start = 0,
|
||||
},
|
||||
|
||||
/*
|
||||
* MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
|
||||
* 16K fixed mtrr.
|
||||
*/
|
||||
{
|
||||
.start = 0x80000,
|
||||
.end = 0xc0000,
|
||||
.range_shift = 14, /* 16K */
|
||||
.range_start = 8,
|
||||
},
|
||||
|
||||
/*
|
||||
* MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
|
||||
* 4K fixed mtrr.
|
||||
*/
|
||||
{
|
||||
.start = 0xc0000,
|
||||
.end = 0x100000,
|
||||
.range_shift = 12, /* 4K */
|
||||
.range_start = 24,
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* The size of unit is covered in one MSR, one MSR entry contains
|
||||
* 8 ranges so that unit size is always 8 * 2^range_shift.
|
||||
*/
|
||||
static u64 fixed_mtrr_seg_unit_size(int seg)
|
||||
{
|
||||
return 8 << fixed_seg_table[seg].range_shift;
|
||||
}
|
||||
|
||||
static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
|
||||
{
|
||||
switch (msr) {
|
||||
case MSR_MTRRfix64K_00000:
|
||||
*seg = 0;
|
||||
*unit = 0;
|
||||
break;
|
||||
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
|
||||
*seg = 1;
|
||||
*unit = array_index_nospec(
|
||||
msr - MSR_MTRRfix16K_80000,
|
||||
MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
|
||||
break;
|
||||
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
|
||||
*seg = 2;
|
||||
*unit = array_index_nospec(
|
||||
msr - MSR_MTRRfix4K_C0000,
|
||||
MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
|
||||
u64 unit_size = fixed_mtrr_seg_unit_size(seg);
|
||||
|
||||
*start = mtrr_seg->start + unit * unit_size;
|
||||
*end = *start + unit_size;
|
||||
WARN_ON(*end > mtrr_seg->end);
|
||||
}
|
||||
|
||||
static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
|
||||
|
||||
WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
|
||||
> mtrr_seg->end);
|
||||
|
||||
/* each unit has 8 ranges. */
|
||||
return mtrr_seg->range_start + 8 * unit;
|
||||
}
|
||||
|
||||
static int fixed_mtrr_seg_end_range_index(int seg)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
|
||||
int n;
|
||||
|
||||
n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
|
||||
return mtrr_seg->range_start + n - 1;
|
||||
}
|
||||
|
||||
static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
|
||||
{
|
||||
int seg, unit;
|
||||
|
||||
if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
|
||||
return false;
|
||||
|
||||
fixed_mtrr_seg_unit_range(seg, unit, start, end);
|
||||
return true;
|
||||
}
|
||||
|
||||
static int fixed_msr_to_range_index(u32 msr)
|
||||
{
|
||||
int seg, unit;
|
||||
|
||||
if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
|
||||
return -1;
|
||||
|
||||
return fixed_mtrr_seg_unit_range_index(seg, unit);
|
||||
}
|
||||
|
||||
static int fixed_mtrr_addr_to_seg(u64 addr)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg;
|
||||
int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
|
||||
|
||||
for (seg = 0; seg < seg_num; seg++) {
|
||||
mtrr_seg = &fixed_seg_table[seg];
|
||||
if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
|
||||
return seg;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg;
|
||||
int index;
|
||||
|
||||
mtrr_seg = &fixed_seg_table[seg];
|
||||
index = mtrr_seg->range_start;
|
||||
index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
|
||||
return index;
|
||||
}
|
||||
|
||||
static u64 fixed_mtrr_range_end_addr(int seg, int index)
|
||||
{
|
||||
struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
|
||||
int pos = index - mtrr_seg->range_start;
|
||||
|
||||
return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
|
||||
}
|
||||
|
||||
static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
|
||||
{
|
||||
u64 mask;
|
||||
|
||||
*start = range->base & PAGE_MASK;
|
||||
|
||||
mask = range->mask & PAGE_MASK;
|
||||
|
||||
/* This cannot overflow because writing to the reserved bits of
|
||||
* variable MTRRs causes a #GP.
|
||||
*/
|
||||
*end = (*start | ~mask) + 1;
|
||||
}
|
||||
|
||||
static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
|
||||
gfn_t start, end;
|
||||
|
||||
if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
|
||||
return;
|
||||
|
||||
if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
|
||||
return;
|
||||
|
||||
/* fixed MTRRs. */
|
||||
if (fixed_msr_to_range(msr, &start, &end)) {
|
||||
if (!fixed_mtrr_is_enabled(mtrr_state))
|
||||
return;
|
||||
} else if (msr == MSR_MTRRdefType) {
|
||||
start = 0x0;
|
||||
end = ~0ULL;
|
||||
} else {
|
||||
/* variable range MTRRs. */
|
||||
var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
|
||||
}
|
||||
|
||||
kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
|
||||
}
|
||||
|
||||
static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
|
||||
{
|
||||
return (range->mask & (1 << 11)) != 0;
|
||||
}
|
||||
|
||||
static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
|
||||
struct kvm_mtrr_range *tmp, *cur;
|
||||
|
||||
cur = var_mtrr_msr_to_range(vcpu, msr);
|
||||
|
||||
/* remove the entry if it's in the list. */
|
||||
if (var_mtrr_range_is_valid(cur))
|
||||
list_del(&cur->node);
|
||||
|
||||
/*
|
||||
* Set all illegal GPA bits in the mask, since those bits must
|
||||
* implicitly be 0. The bits are then cleared when reading them.
|
||||
*/
|
||||
if (is_mtrr_base_msr(msr))
|
||||
cur->base = data;
|
||||
else
|
||||
cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
|
||||
|
||||
/* add it to the list if it's enabled. */
|
||||
if (var_mtrr_range_is_valid(cur)) {
|
||||
list_for_each_entry(tmp, &mtrr_state->head, node)
|
||||
if (cur->base >= tmp->base)
|
||||
break;
|
||||
list_add_tail(&cur->node, &tmp->node);
|
||||
}
|
||||
}
|
||||
|
||||
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
|
||||
{
|
||||
int index;
|
||||
u64 *mtrr;
|
||||
|
||||
mtrr = find_mtrr(vcpu, msr);
|
||||
if (!mtrr)
|
||||
return 1;
|
||||
|
||||
if (!kvm_mtrr_valid(vcpu, msr, data))
|
||||
return 1;
|
||||
|
||||
index = fixed_msr_to_range_index(msr);
|
||||
if (index >= 0)
|
||||
*(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
|
||||
else if (msr == MSR_MTRRdefType)
|
||||
vcpu->arch.mtrr_state.deftype = data;
|
||||
else
|
||||
set_var_mtrr_msr(vcpu, msr, data);
|
||||
|
||||
update_mtrr(vcpu, msr);
|
||||
*mtrr = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
|
||||
{
|
||||
int index;
|
||||
u64 *mtrr;
|
||||
|
||||
/* MSR_MTRRcap is a readonly MSR. */
|
||||
if (msr == MSR_MTRRcap) {
|
||||
@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!msr_mtrr_valid(msr))
|
||||
mtrr = find_mtrr(vcpu, msr);
|
||||
if (!mtrr)
|
||||
return 1;
|
||||
|
||||
index = fixed_msr_to_range_index(msr);
|
||||
if (index >= 0) {
|
||||
*pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
|
||||
} else if (msr == MSR_MTRRdefType) {
|
||||
*pdata = vcpu->arch.mtrr_state.deftype;
|
||||
} else {
|
||||
/* Variable MTRRs */
|
||||
if (is_mtrr_base_msr(msr))
|
||||
*pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
|
||||
else
|
||||
*pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
|
||||
|
||||
*pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
|
||||
}
|
||||
|
||||
*pdata = *mtrr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
|
||||
{
|
||||
INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
|
||||
}
|
||||
|
||||
struct mtrr_iter {
|
||||
/* input fields. */
|
||||
struct kvm_mtrr *mtrr_state;
|
||||
u64 start;
|
||||
u64 end;
|
||||
|
||||
/* output fields. */
|
||||
int mem_type;
|
||||
/* mtrr is completely disabled? */
|
||||
bool mtrr_disabled;
|
||||
/* [start, end) is not fully covered in MTRRs? */
|
||||
bool partial_map;
|
||||
|
||||
/* private fields. */
|
||||
union {
|
||||
/* used for fixed MTRRs. */
|
||||
struct {
|
||||
int index;
|
||||
int seg;
|
||||
};
|
||||
|
||||
/* used for var MTRRs. */
|
||||
struct {
|
||||
struct kvm_mtrr_range *range;
|
||||
/* max address has been covered in var MTRRs. */
|
||||
u64 start_max;
|
||||
};
|
||||
};
|
||||
|
||||
bool fixed;
|
||||
};
|
||||
|
||||
static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
|
||||
{
|
||||
int seg, index;
|
||||
|
||||
if (!fixed_mtrr_is_enabled(iter->mtrr_state))
|
||||
return false;
|
||||
|
||||
seg = fixed_mtrr_addr_to_seg(iter->start);
|
||||
if (seg < 0)
|
||||
return false;
|
||||
|
||||
iter->fixed = true;
|
||||
index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
|
||||
iter->index = index;
|
||||
iter->seg = seg;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool match_var_range(struct mtrr_iter *iter,
|
||||
struct kvm_mtrr_range *range)
|
||||
{
|
||||
u64 start, end;
|
||||
|
||||
var_mtrr_range(range, &start, &end);
|
||||
if (!(start >= iter->end || end <= iter->start)) {
|
||||
iter->range = range;
|
||||
|
||||
/*
|
||||
* the function is called when we do kvm_mtrr.head walking.
|
||||
* Range has the minimum base address which interleaves
|
||||
* [looker->start_max, looker->end).
|
||||
*/
|
||||
iter->partial_map |= iter->start_max < start;
|
||||
|
||||
/* update the max address has been covered. */
|
||||
iter->start_max = max(iter->start_max, end);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = iter->mtrr_state;
|
||||
|
||||
list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
|
||||
if (match_var_range(iter, iter->range))
|
||||
return;
|
||||
|
||||
iter->range = NULL;
|
||||
iter->partial_map |= iter->start_max < iter->end;
|
||||
}
|
||||
|
||||
static void mtrr_lookup_var_start(struct mtrr_iter *iter)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = iter->mtrr_state;
|
||||
|
||||
iter->fixed = false;
|
||||
iter->start_max = iter->start;
|
||||
iter->range = NULL;
|
||||
iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
|
||||
|
||||
__mtrr_lookup_var_next(iter);
|
||||
}
|
||||
|
||||
static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
|
||||
{
|
||||
/* terminate the lookup. */
|
||||
if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
|
||||
iter->fixed = false;
|
||||
iter->range = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
iter->index++;
|
||||
|
||||
/* have looked up for all fixed MTRRs. */
|
||||
if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
|
||||
return mtrr_lookup_var_start(iter);
|
||||
|
||||
/* switch to next segment. */
|
||||
if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
|
||||
iter->seg++;
|
||||
}
|
||||
|
||||
static void mtrr_lookup_var_next(struct mtrr_iter *iter)
|
||||
{
|
||||
__mtrr_lookup_var_next(iter);
|
||||
}
|
||||
|
||||
static void mtrr_lookup_start(struct mtrr_iter *iter)
|
||||
{
|
||||
if (!mtrr_is_enabled(iter->mtrr_state)) {
|
||||
iter->mtrr_disabled = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!mtrr_lookup_fixed_start(iter))
|
||||
mtrr_lookup_var_start(iter);
|
||||
}
|
||||
|
||||
static void mtrr_lookup_init(struct mtrr_iter *iter,
|
||||
struct kvm_mtrr *mtrr_state, u64 start, u64 end)
|
||||
{
|
||||
iter->mtrr_state = mtrr_state;
|
||||
iter->start = start;
|
||||
iter->end = end;
|
||||
iter->mtrr_disabled = false;
|
||||
iter->partial_map = false;
|
||||
iter->fixed = false;
|
||||
iter->range = NULL;
|
||||
|
||||
mtrr_lookup_start(iter);
|
||||
}
|
||||
|
||||
static bool mtrr_lookup_okay(struct mtrr_iter *iter)
|
||||
{
|
||||
if (iter->fixed) {
|
||||
iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
|
||||
return true;
|
||||
}
|
||||
|
||||
if (iter->range) {
|
||||
iter->mem_type = iter->range->base & 0xff;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void mtrr_lookup_next(struct mtrr_iter *iter)
|
||||
{
|
||||
if (iter->fixed)
|
||||
mtrr_lookup_fixed_next(iter);
|
||||
else
|
||||
mtrr_lookup_var_next(iter);
|
||||
}
|
||||
|
||||
#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
|
||||
for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
|
||||
mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
|
||||
|
||||
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
|
||||
struct mtrr_iter iter;
|
||||
u64 start, end;
|
||||
int type = -1;
|
||||
const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
|
||||
| (1 << MTRR_TYPE_WRTHROUGH);
|
||||
|
||||
start = gfn_to_gpa(gfn);
|
||||
end = start + PAGE_SIZE;
|
||||
|
||||
mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
|
||||
int curr_type = iter.mem_type;
|
||||
|
||||
/*
|
||||
* Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
|
||||
* Precedences.
|
||||
*/
|
||||
|
||||
if (type == -1) {
|
||||
type = curr_type;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If two or more variable memory ranges match and the
|
||||
* memory types are identical, then that memory type is
|
||||
* used.
|
||||
*/
|
||||
if (type == curr_type)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If two or more variable memory ranges match and one of
|
||||
* the memory types is UC, the UC memory type used.
|
||||
*/
|
||||
if (curr_type == MTRR_TYPE_UNCACHABLE)
|
||||
return MTRR_TYPE_UNCACHABLE;
|
||||
|
||||
/*
|
||||
* If two or more variable memory ranges match and the
|
||||
* memory types are WT and WB, the WT memory type is used.
|
||||
*/
|
||||
if (((1 << type) & wt_wb_mask) &&
|
||||
((1 << curr_type) & wt_wb_mask)) {
|
||||
type = MTRR_TYPE_WRTHROUGH;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* For overlaps not defined by the above rules, processor
|
||||
* behavior is undefined.
|
||||
*/
|
||||
|
||||
/* We use WB for this undefined behavior. :( */
|
||||
return MTRR_TYPE_WRBACK;
|
||||
}
|
||||
|
||||
if (iter.mtrr_disabled)
|
||||
return mtrr_disabled_type(vcpu);
|
||||
|
||||
/* not contained in any MTRRs. */
|
||||
if (type == -1)
|
||||
return mtrr_default_type(mtrr_state);
|
||||
|
||||
/*
|
||||
* We just check one page, partially covered by MTRRs is
|
||||
* impossible.
|
||||
*/
|
||||
WARN_ON(iter.partial_map);
|
||||
|
||||
return type;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
|
||||
|
||||
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
int page_num)
|
||||
{
|
||||
struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
|
||||
struct mtrr_iter iter;
|
||||
u64 start, end;
|
||||
int type = -1;
|
||||
|
||||
start = gfn_to_gpa(gfn);
|
||||
end = gfn_to_gpa(gfn + page_num);
|
||||
mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
|
||||
if (type == -1) {
|
||||
type = iter.mem_type;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (type != iter.mem_type)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (iter.mtrr_disabled)
|
||||
return true;
|
||||
|
||||
if (!iter.partial_map)
|
||||
return true;
|
||||
|
||||
if (type == -1)
|
||||
return true;
|
||||
|
||||
return type == mtrr_default_type(mtrr_state);
|
||||
}
|
||||
|
@ -7670,39 +7670,25 @@ int vmx_vm_init(struct kvm *kvm)
|
||||
|
||||
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
|
||||
{
|
||||
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
|
||||
* memory aliases with conflicting memory types and sometimes MCEs.
|
||||
* We have to be careful as to what are honored and when.
|
||||
*
|
||||
* For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
|
||||
* UC. The effective memory type is UC or WC depending on guest PAT.
|
||||
* This was historically the source of MCEs and we want to be
|
||||
* conservative.
|
||||
*
|
||||
* When there is no need to deal with noncoherent DMA (e.g., no VT-d
|
||||
* or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
|
||||
* EPT memory type is set to WB. The effective memory type is forced
|
||||
* WB.
|
||||
*
|
||||
* Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
|
||||
* EPT memory type is used to emulate guest CD/MTRR.
|
||||
/*
|
||||
* Force UC for host MMIO regions, as allowing the guest to access MMIO
|
||||
* with cacheable accesses will result in Machine Checks.
|
||||
*/
|
||||
|
||||
if (is_mmio)
|
||||
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
|
||||
|
||||
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
|
||||
/*
|
||||
* Force WB and ignore guest PAT if the VM does NOT have a non-coherent
|
||||
* device attached and the CPU doesn't support self-snoop. Letting the
|
||||
* guest control memory types on Intel CPUs without self-snoop may
|
||||
* result in unexpected behavior, and so KVM's (historical) ABI is to
|
||||
* trust the guest to behave only as a last resort.
|
||||
*/
|
||||
if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
|
||||
!kvm_arch_has_noncoherent_dma(vcpu->kvm))
|
||||
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
|
||||
|
||||
if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
|
||||
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
|
||||
return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
|
||||
else
|
||||
return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
|
||||
VMX_EPT_IPAT_BIT;
|
||||
}
|
||||
|
||||
return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
|
||||
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
|
||||
}
|
||||
|
||||
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
|
||||
|
@ -946,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
|
||||
|
||||
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
|
||||
kvm_mmu_reset_context(vcpu);
|
||||
|
||||
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
|
||||
kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
|
||||
!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
|
||||
kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
|
||||
|
||||
@ -11181,6 +11176,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
||||
|
||||
kvm_vcpu_srcu_read_lock(vcpu);
|
||||
|
||||
/*
|
||||
* Call this to ensure WC buffers in guest are evicted after each VM
|
||||
* Exit, so that the evicted WC writes can be snooped across all cpus
|
||||
*/
|
||||
smp_mb__after_srcu_read_lock();
|
||||
|
||||
/*
|
||||
* Profile KVM exit RIPs:
|
||||
*/
|
||||
@ -12264,7 +12265,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
|
||||
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
|
||||
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
|
||||
kvm_xen_init_vcpu(vcpu);
|
||||
kvm_vcpu_mtrr_init(vcpu);
|
||||
vcpu_load(vcpu);
|
||||
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
|
||||
kvm_vcpu_reset(vcpu, false);
|
||||
@ -13528,13 +13528,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
|
||||
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
|
||||
{
|
||||
/*
|
||||
* Non-coherent DMA assignment and de-assignment will affect
|
||||
* whether KVM honors guest MTRRs and cause changes in memtypes
|
||||
* in TDP.
|
||||
* So, pass %true unconditionally to indicate non-coherent DMA was,
|
||||
* or will be involved, and that zapping SPTEs might be necessary.
|
||||
* Non-coherent DMA assignment and de-assignment may affect whether or
|
||||
* not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
|
||||
* due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
|
||||
(or last) non-coherent device is (un)registered so that new SPTEs
|
||||
* with the correct "ignore guest PAT" setting are created.
|
||||
*/
|
||||
if (__kvm_mmu_honors_guest_mtrrs(true))
|
||||
if (kvm_mmu_may_ignore_guest_pat())
|
||||
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
|
||||
}
|
||||
|
||||
|
@ -325,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
|
||||
void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
|
||||
struct kvm_queued_exception *ex);
|
||||
|
||||
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
|
||||
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
|
||||
int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
|
||||
int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
|
||||
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
|
||||
int page_num);
|
||||
bool kvm_vector_hashing_enabled(void);
|
||||
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
|
||||
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
|
||||
|
@ -343,6 +343,20 @@ static inline void smp_mb__after_srcu_read_unlock(void)
|
||||
/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
|
||||
}
|
||||
|
||||
/**
|
||||
* smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
|
||||
*
|
||||
* Converts the preceding srcu_read_lock into a two-way memory barrier.
|
||||
*
|
||||
* Call this after srcu_read_lock, to guarantee that all memory operations
|
||||
* that occur after smp_mb__after_srcu_read_lock will appear to happen after
|
||||
* the preceding srcu_read_lock.
|
||||
*/
|
||||
static inline void smp_mb__after_srcu_read_lock(void)
|
||||
{
|
||||
/* __srcu_read_lock has smp_mb() internally so nothing to do here. */
|
||||
}
|
||||
|
||||
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
|
||||
_T->idx = srcu_read_lock(_T->lock),
|
||||
srcu_read_unlock(_T->lock, _T->idx),
|
||||
|
Loading…
Reference in New Issue
Block a user