KVM x86 MTRR virtualization removal

Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD hack, and instead always honor guest PAT on CPUs that support self-snoop.

-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmaRuwAACgkQOlYIJqCj
N/32Gg/+Nnnz6TCRno2vursPJme7gvtLdqSxjazAj3u2ZO8IApGYWMyfVpS+ymC9
Wdpj6gRe2ukSxgTsUI2CYoy5V2NxDaA9YgdTPZUVQvqwujVrqZCJ7L393iPYYnC9
No3LXZ+SOYRmomiCzknjC6GOlT2hAZHzQsyaXDlEYok7NAA2L6XybbLonEdA4RYi
V1mS62W5PaA4tUesuxkJjPujXo1nXRWD/aXOruJWjPESdSFSALlx7reFAf2Nwn7K
Uw8yZqhq6vWAZSph0Nz8OrZOS/kULKA3q2zl1B/qJJ0ToAt2VdXS6abXky52RExf
KvP+jBAWMO5kHbIqaMRtCHjbIkbhH8RdUIYNJQEUQ5DdydM5+/RDa+KprmLPcmUn
qvJq+3uyH0MEENtneGegs8uxR+sn6fT32cGMIw790yIywddh562+IJ4Z+C3BuYJi
yszD71odqKT8+knUd2CaZjE9UZyoQNDfj2OCCTzzZOC/6TuJWCh9CYQ1csssHbQR
KcvZCKE6ht8tWwi+2HWj0laOdg1reX2kV869k3xH4uCwEaFIj2Wk+/Bw/lg2Tn5h
5uTnQ01dx5XhAV1klr6IY3VXJ/A8G8895wRfkZEelsA9Wj8qZvNgXhsoXReIUIrn
aR0ppsFcbqHzC50qE2JT4juTD1EPx95LL9zKT8pI9mGKwxCAxUM=
=yb10
-----END PGP SIGNATURE-----

Merge tag 'kvm-x86-mtrrs-6.11' of https://github.com/kvm-x86/linux into HEAD

KVM x86 MTRR virtualization removal

Remove support for virtualizing MTRRs on Intel CPUs, along with a nasty CR0.CD hack, and instead always honor guest PAT on CPUs that support self-snoop.
2024-07-16 09:54:57 -04:00 · 2024-07-16 09:54:57 -04:00 · 5c5ddf7107
commit 5c5ddf7107
parent 34b69edecb 377b2f359d
10 changed files with 105 additions and 702 deletions
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@ -8025,7 +8025,11 @@ The valid bits in cap.args[0] are:
                                    When this quirk is disabled, the reset value
                                    is 0x10000 (APIC_LVT_MASKED).
- KVM_X86_QUIRK_CD_NW_CLEARED        By default, KVM clears CR0.CD and CR0.NW.
+ KVM_X86_QUIRK_CD_NW_CLEARED        By default, KVM clears CR0.CD and CR0.NW on
                                     AMD CPUs to work around buggy guest firmware
                                    that runs in perpetuity with CR0.CD, i.e.
                                    with caches in "no fill" mode.
                                    When this quirk is disabled, KVM does not
                                    change the value of CR0.CD and CR0.NW.
--- a/Documentation/virt/kvm/x86/errata.rst
+++ b/Documentation/virt/kvm/x86/errata.rst
@ -48,3 +48,21 @@ have the same physical APIC ID, KVM will deliver events targeting that APIC ID
 only to the vCPU with the lowest vCPU ID.  If KVM_X2APIC_API_USE_32BIT_IDS is
 not enabled, KVM follows x86 architecture when processing interrupts (all vCPUs
 matching the target APIC ID receive the interrupt).
 MTRRs
 -----
 KVM does not virtualize guest MTRR memory types.  KVM emulates accesses to MTRR
 MSRs, i.e. {RD,WR}MSR in the guest will behave as expected, but KVM does not
 honor guest MTRRs when determining the effective memory type, and instead
 treats all of guest memory as having Writeback (WB) MTRRs.
 CR0.CD
 ------
 KVM does not virtualize CR0.CD on Intel CPUs.  Similar to MTRR MSRs, KVM
 emulates CR0.CD accesses so that loads and stores from/to CR0 behave as
 expected, but setting CR0.CD=1 has no impact on the cacheability of guest
 memory.
 Note, this erratum does not affect AMD CPUs, which fully virtualize CR0.CD in
 hardware, i.e. put the CPU caches into "no fill" mode when CR0.CD=1, even when
 running in the guest.
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@ -160,7 +160,6 @@
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 256
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 #define ASYNC_PF_PER_VCPU 64
@ -605,18 +604,12 @@ enum {
 	KVM_DEBUGREG_WONT_EXIT = 2,
 };
 struct kvm_mtrr_range {
 	u64 base;
 	u64 mask;
 	struct list_head node;
 };
 struct kvm_mtrr {
-	struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
+	u64 var[KVM_NR_VAR_MTRR * 2];
-	mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+	u64 fixed_64k;
 	u64 fixed_16k[2];
 	u64 fixed_4k[8];
 	u64 deftype;
 	struct list_head head;
 };
 /* Hyper-V SynIC timer */
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@ -221,12 +221,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return -(u32)fault & errcode;
 }
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma);
+bool kvm_mmu_may_ignore_guest_pat(void);
 static inline bool kvm_mmu_honors_guest_mtrrs(struct kvm *kvm)
 {
 	return __kvm_mmu_honors_guest_mtrrs(kvm_arch_has_noncoherent_dma(kvm));
 }
 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@ -4671,38 +4671,23 @@ out_unlock:
 }
 #endif
-bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
+bool kvm_mmu_may_ignore_guest_pat(void)
 {
 	/*
-	 * If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
+	 * When EPT is enabled (shadow_memtype_mask is non-zero), the CPU does
-	 * VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
+	 * not support self-snoop (or is affected by an erratum), and the VM
-	 * to honor the memtype from the guest's MTRRs so that guest accesses
+	 * has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is to
-	 * to memory that is DMA'd aren't cached against the guest's wishes.
+	 * honor the memtype from the guest's PAT so that guest accesses to
-	 *
+	 * memory that is DMA'd aren't cached against the guest's wishes.  As a
-	 * Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
+	 * result, KVM _may_ ignore guest PAT, whereas without non-coherent DMA,
-	 * e.g. KVM will force UC memtype for host MMIO.
+	 * KVM _always_ ignores or honors guest PAT, i.e. doesn't toggle SPTE
 	 * bits in response to non-coherent device (un)registration.
 	 */
-	return vm_has_noncoherent_dma && shadow_memtype_mask;
+	return !static_cpu_has(X86_FEATURE_SELFSNOOP) && shadow_memtype_mask;
 }
 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
 	/*
 	 * If the guest's MTRRs may be used to compute the "real" memtype,
 	 * restrict the mapping level to ensure KVM uses a consistent memtype
 	 * across the entire mapping.
 	 */
 	if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
 		for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
 			int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
 			gfn_t base = gfn_round_for_level(fault->gfn,
 							 fault->max_level);
 			if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
 				break;
 		}
 	}
 #ifdef CONFIG_X86_64
 	if (tdp_mmu_enabled)
 		return kvm_tdp_mmu_page_fault(vcpu, fault);
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@ -19,33 +19,21 @@
 #include <asm/mtrr.h>
 #include "cpuid.h"
 #include "mmu.h"
-#define IA32_MTRR_DEF_TYPE_E		(1ULL << 11)
+static u64 *find_mtrr(struct kvm_vcpu *vcpu, unsigned int msr)
 #define IA32_MTRR_DEF_TYPE_FE		(1ULL << 10)
 #define IA32_MTRR_DEF_TYPE_TYPE_MASK	(0xff)
 static bool is_mtrr_base_msr(unsigned int msr)
 {
-	/* MTRR base MSRs use even numbers, masks use odd numbers. */
+	int index;
 	return !(msr & 0x1);
 }
 static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu,
 						    unsigned int msr)
 {
 	int index = (msr - MTRRphysBase_MSR(0)) / 2;
 	return &vcpu->arch.mtrr_state.var_ranges[index];
 }
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
 	case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1):
 		index = msr - MTRRphysBase_MSR(0);
 		return &vcpu->arch.mtrr_state.var[index];
 	case MSR_MTRRfix64K_00000:
 		return &vcpu->arch.mtrr_state.fixed_64k;
 	case MSR_MTRRfix16K_80000:
 	case MSR_MTRRfix16K_A0000:
 		index = msr - MSR_MTRRfix16K_80000;
 		return &vcpu->arch.mtrr_state.fixed_16k[index];
 	case MSR_MTRRfix4K_C0000:
 	case MSR_MTRRfix4K_C8000:
 	case MSR_MTRRfix4K_D0000:
@ -54,10 +42,14 @@ static bool msr_mtrr_valid(unsigned msr)
 	case MSR_MTRRfix4K_E8000:
 	case MSR_MTRRfix4K_F0000:
 	case MSR_MTRRfix4K_F8000:
 		index = msr - MSR_MTRRfix4K_C0000;
 		return &vcpu->arch.mtrr_state.fixed_4k[index];
 	case MSR_MTRRdefType:
-		return true;
+		return &vcpu->arch.mtrr_state.deftype;
 	default:
 		break;
 	}
-	return false;
+	return NULL;
 }
 static bool valid_mtrr_type(unsigned t)
@ -70,9 +62,6 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	int i;
 	u64 mask;
 	if (!msr_mtrr_valid(msr))
 		return false;
 	if (msr == MSR_MTRRdefType) {
 		if (data & ~0xcff)
 			return false;
@ -85,8 +74,9 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	}
 	/* variable MTRRs */
-	WARN_ON(!(msr >= MTRRphysBase_MSR(0) &&
+	if (WARN_ON_ONCE(!(msr >= MTRRphysBase_MSR(0) &&
-		  msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1)));
+			   msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))))
 		return false;
 	mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 	if ((msr & 1) == 0) {
@ -94,309 +84,32 @@ static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (!valid_mtrr_type(data & 0xff))
 			return false;
 		mask |= 0xf00;
-	} else
+	} else {
 		/* MTRR mask */
 		mask |= 0x7ff;
 	}
 	return (data & mask) == 0;
 }
 static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
 {
 	return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
 }
 static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
 {
 	return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
 }
 static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
 {
 	return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
 }
 static u8 mtrr_disabled_type(struct kvm_vcpu *vcpu)
 {
 	/*
 	 * Intel SDM 11.11.2.2: all MTRRs are disabled when
 	 * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
 	 * memory type is applied to all of physical memory.
 	 *
 	 * However, virtual machines can be run with CPUID such that
 	 * there are no MTRRs.  In that case, the firmware will never
 	 * enable MTRRs and it is obviously undesirable to run the
 	 * guest entirely with UC memory and we use WB.
 	 */
 	if (guest_cpuid_has(vcpu, X86_FEATURE_MTRR))
 		return MTRR_TYPE_UNCACHABLE;
 	else
 		return MTRR_TYPE_WRBACK;
 }
 /*
 * Three terms are used in the following code:
 * - segment, it indicates the address segments covered by fixed MTRRs.
 * - unit, it corresponds to the MSR entry in the segment.
 * - range, a range is covered in one memory cache type.
 */
 struct fixed_mtrr_segment {
 	u64 start;
 	u64 end;
 	int range_shift;
 	/* the start position in kvm_mtrr.fixed_ranges[]. */
 	int range_start;
 };
 /*
  * Fixed-range MTRR segments, listed in ascending address order.  Each entry
  * gives the segment's [start, end) address span, the log2 size of one range
  * (range_shift), and the index of the segment's first range (range_start).
  * Every MSR ("unit") holds 8 ranges, so the 1, 2 and 8 units below account
  * for 8, 16 and 64 ranges respectively, hence range_start = 0, 8, 24.
  */
 static struct fixed_mtrr_segment fixed_seg_table[] = {
 	/* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
 	{
 		.start = 0x0,
 		.end = 0x80000,
 		.range_shift = 16, /* 64K */
 		.range_start = 0,
 	},
 	/*
 	 * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
 	 * 16K fixed mtrr.
 	 */
 	{
 		.start = 0x80000,
 		.end = 0xc0000,
 		.range_shift = 14, /* 16K */
 		.range_start = 8,
 	},
 	/*
 	 * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
 	 * 4K fixed mtrr.
 	 */
 	{
 		.start = 0xc0000,
 		.end = 0x100000,
 		.range_shift = 12, /* 4K */
 		.range_start = 24,
 	}
 };
 /*
 * The size of unit is covered in one MSR, one MSR entry contains
 * 8 ranges so that unit size is always 8 * 2^range_shift.
 */
 static u64 fixed_mtrr_seg_unit_size(int seg)
 {
 	return 8 << fixed_seg_table[seg].range_shift;
 }
 static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
 {
 	switch (msr) {
 	case MSR_MTRRfix64K_00000:
 		*seg = 0;
 		*unit = 0;
 		break;
 	case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
 		*seg = 1;
 		*unit = array_index_nospec(
 			msr - MSR_MTRRfix16K_80000,
 			MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
 		break;
 	case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
 		*seg = 2;
 		*unit = array_index_nospec(
 			msr - MSR_MTRRfix4K_C0000,
 			MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
 		break;
 	default:
 		return false;
 	}
 	return true;
 }
 static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
 {
 	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
 	u64 unit_size = fixed_mtrr_seg_unit_size(seg);
 	*start = mtrr_seg->start + unit * unit_size;
 	*end = *start + unit_size;
 	WARN_ON(*end > mtrr_seg->end);
 }
 static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
 {
 	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
 	WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
 		> mtrr_seg->end);
 	/* each unit has 8 ranges. */
 	return mtrr_seg->range_start + 8 * unit;
 }
 static int fixed_mtrr_seg_end_range_index(int seg)
 {
 	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
 	int n;
 	n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
 	return mtrr_seg->range_start + n - 1;
 }
 static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
 {
 	int seg, unit;
 	if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
 		return false;
 	fixed_mtrr_seg_unit_range(seg, unit, start, end);
 	return true;
 }
 static int fixed_msr_to_range_index(u32 msr)
 {
 	int seg, unit;
 	if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
 		return -1;
 	return fixed_mtrr_seg_unit_range_index(seg, unit);
 }
 static int fixed_mtrr_addr_to_seg(u64 addr)
 {
 	struct fixed_mtrr_segment *mtrr_seg;
 	int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
 	for (seg = 0; seg < seg_num; seg++) {
 		mtrr_seg = &fixed_seg_table[seg];
 		if (mtrr_seg->start <= addr && addr < mtrr_seg->end)
 			return seg;
 	}
 	return -1;
 }
 static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
 {
 	struct fixed_mtrr_segment *mtrr_seg;
 	int index;
 	mtrr_seg = &fixed_seg_table[seg];
 	index = mtrr_seg->range_start;
 	index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
 	return index;
 }
 static u64 fixed_mtrr_range_end_addr(int seg, int index)
 {
 	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
 	int pos = index - mtrr_seg->range_start;
 	return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
 }
 static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
 {
 	u64 mask;
 	*start = range->base & PAGE_MASK;
 	mask = range->mask & PAGE_MASK;
 	/* This cannot overflow because writing to the reserved bits of
 	 * variable MTRRs causes a #GP.
 	 */
 	*end = (*start | ~mask) + 1;
 }
 static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
 {
 	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
 	gfn_t start, end;
 	if (!kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
 		return;
 	if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
 		return;
 	/* fixed MTRRs. */
 	if (fixed_msr_to_range(msr, &start, &end)) {
 		if (!fixed_mtrr_is_enabled(mtrr_state))
 			return;
 	} else if (msr == MSR_MTRRdefType) {
 		start = 0x0;
 		end = ~0ULL;
 	} else {
 		/* variable range MTRRs. */
 		var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
 	}
 	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
 }
 static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
 {
 	return (range->mask & (1 << 11)) != 0;
 }
 static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
 	struct kvm_mtrr_range *tmp, *cur;
 	cur = var_mtrr_msr_to_range(vcpu, msr);
 	/* remove the entry if it's in the list. */
 	if (var_mtrr_range_is_valid(cur))
 		list_del(&cur->node);
 	/*
 	 * Set all illegal GPA bits in the mask, since those bits must
 	 * implicitly be 0.  The bits are then cleared when reading them.
 	 */
 	if (is_mtrr_base_msr(msr))
 		cur->base = data;
 	else
 		cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 	/* add it to the list if it's enabled. */
 	if (var_mtrr_range_is_valid(cur)) {
 		list_for_each_entry(tmp, &mtrr_state->head, node)
 			if (cur->base >= tmp->base)
 				break;
 		list_add_tail(&cur->node, &tmp->node);
 	}
 }
 int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
-	int index;
+	u64 *mtrr;
 	mtrr = find_mtrr(vcpu, msr);
 	if (!mtrr)
 		return 1;
 	if (!kvm_mtrr_valid(vcpu, msr, data))
 		return 1;
-	index = fixed_msr_to_range_index(msr);
+	*mtrr = data;
 	if (index >= 0)
 		*(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
 	else if (msr == MSR_MTRRdefType)
 		vcpu->arch.mtrr_state.deftype = data;
 	else
 		set_var_mtrr_msr(vcpu, msr, data);
 	update_mtrr(vcpu, msr);
 	return 0;
 }
 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
-	int index;
+	u64 *mtrr;
 	/* MSR_MTRRcap is a readonly MSR. */
 	if (msr == MSR_MTRRcap) {
@ -410,311 +123,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		return 0;
 	}
-	if (!msr_mtrr_valid(msr))
+	mtrr = find_mtrr(vcpu, msr);
 	if (!mtrr)
 		return 1;
-	index = fixed_msr_to_range_index(msr);
+	*pdata = *mtrr;
 	if (index >= 0) {
 		*pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
 	} else if (msr == MSR_MTRRdefType) {
 		*pdata = vcpu->arch.mtrr_state.deftype;
 	} else {
 		/* Variable MTRRs */
 		if (is_mtrr_base_msr(msr))
 			*pdata = var_mtrr_msr_to_range(vcpu, msr)->base;
 		else
 			*pdata = var_mtrr_msr_to_range(vcpu, msr)->mask;
 		*pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu);
 	}
 	return 0;
 }
 void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
 {
 	INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
 }
 struct mtrr_iter {
 	/* input fields. */
 	struct kvm_mtrr *mtrr_state;
 	u64 start;
 	u64 end;
 	/* output fields. */
 	int mem_type;
 	/* mtrr is completely disabled? */
 	bool mtrr_disabled;
 	/* [start, end) is not fully covered in MTRRs? */
 	bool partial_map;
 	/* private fields. */
 	union {
 		/* used for fixed MTRRs. */
 		struct {
 			int index;
 			int seg;
 		};
 		/* used for var MTRRs. */
 		struct {
 			struct kvm_mtrr_range *range;
 			/* max address has been covered in var MTRRs. */
 			u64 start_max;
 		};
 	};
 	bool fixed;
 };
 static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
 {
 	int seg, index;
 	if (!fixed_mtrr_is_enabled(iter->mtrr_state))
 		return false;
 	seg = fixed_mtrr_addr_to_seg(iter->start);
 	if (seg < 0)
 		return false;
 	iter->fixed = true;
 	index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
 	iter->index = index;
 	iter->seg = seg;
 	return true;
 }
 /*
  * Test whether variable MTRR @range overlaps the iterator's [start, end)
  * window.  On overlap, record @range as the iterator's current range, update
  * the coverage tracking (partial_map, start_max) and return true.  Return
  * false, leaving @iter untouched, when the two do not overlap.
  */
 static bool match_var_range(struct mtrr_iter *iter,
 			    struct kvm_mtrr_range *range)
 {
 	u64 start, end;
 	var_mtrr_range(range, &start, &end);
 	if (!(start >= iter->end || end <= iter->start)) {
 		iter->range = range;
 		/*
 		 * This function is called while walking kvm_mtrr.head.
 		 * The range is expected to be the one with the lowest base
 		 * address that intersects [iter->start_max, iter->end), so
 		 * a range starting above start_max leaves a gap, i.e. the
 		 * window is only partially covered by variable MTRRs.
 		 */
 		iter->partial_map |= iter->start_max < start;
 		/* Update the highest address covered so far. */
 		iter->start_max = max(iter->start_max, end);
 		return true;
 	}
 	return false;
 }
 static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
 {
 	struct kvm_mtrr *mtrr_state = iter->mtrr_state;
 	list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
 		if (match_var_range(iter, iter->range))
 			return;
 	iter->range = NULL;
 	iter->partial_map |= iter->start_max < iter->end;
 }
 static void mtrr_lookup_var_start(struct mtrr_iter *iter)
 {
 	struct kvm_mtrr *mtrr_state = iter->mtrr_state;
 	iter->fixed = false;
 	iter->start_max = iter->start;
 	iter->range = NULL;
 	iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
 	__mtrr_lookup_var_next(iter);
 }
 static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
 {
 	/* terminate the lookup. */
 	if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
 		iter->fixed = false;
 		iter->range = NULL;
 		return;
 	}
 	iter->index++;
 	/* have looked up for all fixed MTRRs. */
 	if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
 		return mtrr_lookup_var_start(iter);
 	/* switch to next segment. */
 	if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
 		iter->seg++;
 }
 static void mtrr_lookup_var_next(struct mtrr_iter *iter)
 {
 	__mtrr_lookup_var_next(iter);
 }
 static void mtrr_lookup_start(struct mtrr_iter *iter)
 {
 	if (!mtrr_is_enabled(iter->mtrr_state)) {
 		iter->mtrr_disabled = true;
 		return;
 	}
 	if (!mtrr_lookup_fixed_start(iter))
 		mtrr_lookup_var_start(iter);
 }
 static void mtrr_lookup_init(struct mtrr_iter *iter,
 			     struct kvm_mtrr *mtrr_state, u64 start, u64 end)
 {
 	iter->mtrr_state = mtrr_state;
 	iter->start = start;
 	iter->end = end;
 	iter->mtrr_disabled = false;
 	iter->partial_map = false;
 	iter->fixed = false;
 	iter->range = NULL;
 	mtrr_lookup_start(iter);
 }
 static bool mtrr_lookup_okay(struct mtrr_iter *iter)
 {
 	if (iter->fixed) {
 		iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
 		return true;
 	}
 	if (iter->range) {
 		iter->mem_type = iter->range->base & 0xff;
 		return true;
 	}
 	return false;
 }
 static void mtrr_lookup_next(struct mtrr_iter *iter)
 {
 	if (iter->fixed)
 		mtrr_lookup_fixed_next(iter);
 	else
 		mtrr_lookup_var_next(iter);
 }
 #define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
 	for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
 	     mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
 /*
  * Compute the guest MTRR memory type for the single page at @gfn.
  *
  * Walks every MTRR memory type that applies to [gpa, gpa + PAGE_SIZE) and
  * merges them using the precedence rules commented below.  Returns the
  * "MTRRs disabled" type if the iterator reports MTRRs as disabled, the
  * default type from the MTRR state if no MTRR matched, and the merged type
  * otherwise.
  */
 u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
 	struct mtrr_iter iter;
 	u64 start, end;
 	/* -1 means "no MTRR has matched yet". */
 	int type = -1;
 	/* Bitmask, indexed by memory type, that selects WB and WT. */
 	const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
 			       | (1 << MTRR_TYPE_WRTHROUGH);
 	start = gfn_to_gpa(gfn);
 	end = start + PAGE_SIZE;
 	mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
 		int curr_type = iter.mem_type;
 		/*
 		 * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
 		 * Precedences.
 		 */
 		if (type == -1) {
 			type = curr_type;
 			continue;
 		}
 		/*
 		 * If two or more variable memory ranges match and the
 		 * memory types are identical, then that memory type is
 		 * used.
 		 */
 		if (type == curr_type)
 			continue;
 		/*
 		 * If two or more variable memory ranges match and one of
 		 * the memory types is UC, the UC memory type is used.
 		 *
 		 * NOTE(review): only the newly seen type is compared against
 		 * UC here.  If the previously accumulated type is UC and the
 		 * current one differs, control reaches the WB fallback at
 		 * the bottom of the loop -- confirm whether that is intended.
 		 */
 		if (curr_type == MTRR_TYPE_UNCACHABLE)
 			return MTRR_TYPE_UNCACHABLE;
 		/*
 		 * If two or more variable memory ranges match and the
 		 * memory types are WT and WB, the WT memory type is used.
 		 */
 		if (((1 << type) & wt_wb_mask) &&
 		      ((1 << curr_type) & wt_wb_mask)) {
 			type = MTRR_TYPE_WRTHROUGH;
 			continue;
 		}
 		/*
 		 * For overlaps not defined by the above rules, processor
 		 * behavior is undefined.
 		 */
 		/* We use WB for this undefined behavior. :( */
 		return MTRR_TYPE_WRBACK;
 	}
 	if (iter.mtrr_disabled)
 		return mtrr_disabled_type(vcpu);
 	/* Not contained in any MTRR: fall back to the default type. */
 	if (type == -1)
 		return mtrr_default_type(mtrr_state);
 	/*
 	 * Only a single page is checked, so it being partially covered
 	 * by MTRRs is impossible.
 	 */
 	WARN_ON(iter.partial_map);
 	return type;
 }
 EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num)
 {
 	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
 	struct mtrr_iter iter;
 	u64 start, end;
 	int type = -1;
 	start = gfn_to_gpa(gfn);
 	end = gfn_to_gpa(gfn + page_num);
 	mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
 		if (type == -1) {
 			type = iter.mem_type;
 			continue;
 		}
 		if (type != iter.mem_type)
 			return false;
 	}
 	if (iter.mtrr_disabled)
 		return true;
 	if (!iter.partial_map)
 		return true;
 	if (type == -1)
 		return true;
 	return type == mtrr_default_type(mtrr_state);
 }
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@ -7670,39 +7670,25 @@ int vmx_vm_init(struct kvm *kvm)
 u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+	/*
-	 * memory aliases with conflicting memory types and sometimes MCEs.
+	 * Force UC for host MMIO regions, as allowing the guest to access MMIO
-	 * We have to be careful as to what are honored and when.
+	 * with cacheable accesses will result in Machine Checks.
 	 *
 	 * For MMIO, guest CD/MTRR are ignored.  The EPT memory type is set to
 	 * UC.  The effective memory type is UC or WC depending on guest PAT.
 	 * This was historically the source of MCEs and we want to be
 	 * conservative.
 	 *
 	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
 	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored.  The
 	 * EPT memory type is set to WB.  The effective memory type is forced
 	 * WB.
 	 *
 	 * Otherwise, we trust guest.  Guest CD/MTRR/PAT are all honored.  The
 	 * EPT memory type is used to emulate guest CD/MTRR.
 	 */
 	if (is_mmio)
 		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
-	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+	/*
 	 * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
 	 * device attached and the CPU doesn't support self-snoop.  Letting the
 	 * guest control memory types on Intel CPUs without self-snoop may
 	 * result in unexpected behavior, and so KVM's (historical) ABI is to
 	 * trust the guest to behave only as a last resort.
 	 */
 	if (!static_cpu_has(X86_FEATURE_SELFSNOOP) &&
 	    !kvm_arch_has_noncoherent_dma(vcpu->kvm))
 		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
-	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
+	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
 		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 			return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
 		else
 			return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
 				VMX_EPT_IPAT_BIT;
 	}
 	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
 }
 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@ -946,11 +946,6 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
 	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
 		kvm_mmu_reset_context(vcpu);
 	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
 	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
 	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 }
 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
@ -11181,6 +11176,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	kvm_vcpu_srcu_read_lock(vcpu);
 	/*
 	 * Call this to ensure WC buffers in the guest are evicted after each
 	 * VM-Exit, so that the evicted WC writes can be snooped across all CPUs.
 	 */
 	smp_mb__after_srcu_read_lock();
 	/*
 	 * Profile KVM exit RIPs:
 	 */
@ -12264,7 +12265,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
 	kvm_xen_init_vcpu(vcpu);
 	kvm_vcpu_mtrr_init(vcpu);
 	vcpu_load(vcpu);
 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
 	kvm_vcpu_reset(vcpu, false);
@ -13528,13 +13528,13 @@ EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
 static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
 {
 	/*
-	 * Non-coherent DMA assignment and de-assignment will affect
+	 * Non-coherent DMA assignment and de-assignment may affect whether or
-	 * whether KVM honors guest MTRRs and cause changes in memtypes
+	 * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs
-	 * in TDP.
+	 * due to toggling the "ignore PAT" bit.  Zap all SPTEs when the first
-	 * So, pass %true unconditionally to indicate non-coherent DMA was,
+	 * (or last) non-coherent device is (un)registered so that new SPTEs
-	 * or will be involved, and that zapping SPTEs might be necessary.
+	 * with the correct "ignore guest PAT" setting are created.
 	 */
-	if (__kvm_mmu_honors_guest_mtrrs(true))
+	if (kvm_mmu_may_ignore_guest_pat())
 		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
 }
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@ -325,12 +325,8 @@ int handle_ud(struct kvm_vcpu *vcpu);
 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
 				   struct kvm_queued_exception *ex);
 void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
 u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
 bool kvm_vector_hashing_enabled(void);
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@ -343,6 +343,20 @@ static inline void smp_mb__after_srcu_read_unlock(void)
 	/* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
 }
 /**
 * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
 *
 * Converts the preceding srcu_read_lock into a two-way memory barrier.
 *
 * Call this after srcu_read_lock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_lock will appear to happen after
 * the preceding srcu_read_lock.
 */
 static inline void smp_mb__after_srcu_read_lock(void)
 {
 	/* __srcu_read_lock has smp_mb() internally so nothing to do here. */
 }
 DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
 		    _T->idx = srcu_read_lock(_T->lock),
 		    srcu_read_unlock(_T->lock, _T->idx),