1
linux/arch/x86/kernel/cpu/topology.c
Thomas Gleixner 9d22c96316 x86/topology: Handle bogus ACPI tables correctly
The ACPI specification clearly states how the processors should be
enumerated in the MADT:

 "To ensure that the boot processor is supported post initialization,
  two guidelines should be followed. The first is that OSPM should
  initialize processors in the order that they appear in the MADT. The
  second is that platform firmware should list the boot processor as the
  first processor entry in the MADT.
  ...
  Failure of OSPM implementations and platform firmware to abide by
  these guidelines can result in both unpredictable and non optimal
  platform operation."

The kernel relies on that ordering to detect the real BSP on crash kernels,
which is important to avoid sending an INIT IPI to it as that would cause a
full machine reset.

On a Dell XPS 16 9640 the BIOS ignores this rule and enumerates the CPUs in
the wrong order. As a consequence the kernel falsely detects a crash kernel
and disables the corresponding CPU.

Prevent this by checking the IA32_APICBASE MSR for the BSP bit on the boot
CPU. If that bit is set, then the MADT based BSP detection can be safely
ignored. If the kernel detects a mismatch between the BSP bit and the first
enumerated MADT entry then emit a firmware bug message.

This obviously also has to be taken into account when the boot APIC ID and
the first enumerated APIC ID match. If the boot CPU does not have the BSP
bit set in the APICBASE MSR then there is no way for the boot CPU to
determine which of the CPUs is the real BSP. Sending an INIT to the real
BSP would reset the machine so the only sane way to deal with that is to
limit the number of CPUs to one and emit a corresponding warning message.

Fixes: 5c5682b9f8 ("x86/cpu: Detect real BSP on crash kernels")
Reported-by: Carsten Tolkmit <ctolkmit@ennit.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Carsten Tolkmit <ctolkmit@ennit.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/87le48jycb.ffs@tglx
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218837
2024-05-21 14:52:35 +02:00

572 lines
17 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* CPU/APIC topology
*
* The APIC IDs describe the system topology in multiple domain levels.
* The CPUID topology parser provides the information which part of the
* APIC ID is associated to the individual levels:
*
* [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
*
* The root space contains the package (socket) IDs.
*
* Not enumerated levels consume 0 bits space, but conceptually they are
* always represented. If e.g. only CORE and THREAD levels are enumerated
* then the DIE, MODULE and TILE have the same physical ID as the PACKAGE.
*
* If SMT is not supported, then the THREAD domain is still used. It then
* has the same physical ID as the CORE domain and is the only child of
* the core domain.
*
* This allows a unified view on the system independent of the enumerated
* domain levels without requiring any conditionals in the code.
*/
#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>
#include <xen/xen.h>
#include <asm/apic.h>
#include <asm/hypervisor.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
#include "cpu.h"
/*
 * Per-CPU maps from the CPU index to its physical APIC ID and its ACPI ID.
 * Entries start out as BAD_APICID respectively CPU_ACPIID_INVALID.
 */
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
/* Bitmap of physically present CPUs, indexed by APIC ID. */
DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
/*
 * CPU number to APIC ID table. Used for CPU number allocation and parallel
 * CPU bringup. Every slot starts out as BAD_APICID.
 */
u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
/* Bitmaps to mark registered APICs at each topology domain */
static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
/*
 * Keep track of assigned, disabled and rejected CPUs. nr_assigned_cpus is
 * preset with 1 as CPU #0 is reserved for the boot CPU.
 *
 * boot_cpu_apic_id is written by topology_register_boot_apic() and
 * real_bsp_apic_id by check_for_real_bsp(). Both stay BAD_APICID until then.
 */
static struct {
unsigned int nr_assigned_cpus;
unsigned int nr_disabled_cpus;
unsigned int nr_rejected_cpus;
u32 boot_cpu_apic_id;
u32 real_bsp_apic_id;
} topo_info __ro_after_init = {
.nr_assigned_cpus = 1,
.boot_cpu_apic_id = BAD_APICID,
.real_bsp_apic_id = BAD_APICID,
};
/* Number of bits set in the APIC bitmap of topology domain @_dom */
#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
/*
 * Report whether @phys_id equals the APIC ID stored for logical CPU @cpu
 * in cpuid_to_apicid[].
 */
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
	u64 apicid = cpuid_to_apicid[cpu];

	return apicid == phys_id;
}
#ifdef CONFIG_SMP
/*
 * Add @cpu to __cpu_primary_thread_mask when none of the @apicid bits
 * covered by (__max_threads_per_core - 1) is set.
 */
static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
{
	unsigned int thread_bits = apicid & (__max_threads_per_core - 1);

	if (thread_bits)
		return;

	cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
#else
/* Nothing to track on !SMP builds */
static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
#endif
/*
 * Reduce @apicid to the ID of the topology domain @dom by clearing all bits
 * which belong to the levels below @dom. At the SMT level the APIC ID is
 * returned unmodified.
 */
static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
	u32 mask = UINT_MAX;

	/* The shift of the next lower domain marks the first bit of @dom */
	if (dom != TOPO_SMT_DOMAIN)
		mask <<= x86_topo_system.dom_shifts[dom - 1];

	return apicid & mask;
}
/*
 * Search the already assigned CPU numbers for @apic_id.
 *
 * Returns the CPU number on success, -ENODEV if @apic_id has no CPU number.
 */
static int topo_lookup_cpuid(u32 apic_id)
{
	unsigned int cpu = 0;

	/* Once a CPU number is bound to an APIC ID the binding never changes */
	while (cpu < topo_info.nr_assigned_cpus) {
		if (cpuid_to_apicid[cpu] == apic_id)
			return cpu;
		cpu++;
	}
	return -ENODEV;
}
/*
 * Return the CPU number which is already bound to @apic_id. If there is
 * none, hand out the next free CPU number.
 */
static __init int topo_get_cpunr(u32 apic_id)
{
	int cpu = topo_lookup_cpuid(apic_id);

	if (cpu < 0)
		cpu = topo_info.nr_assigned_cpus++;

	return cpu;
}
/*
 * Record @apic_id and @acpi_id for @cpu in the early per-CPU maps and mark
 * @cpu as present. The per-CPU stores are only compiled in when CONFIG_SMP
 * or CONFIG_X86_64 is enabled.
 */
static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
{
#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
set_cpu_present(cpu, true);
}
/*
 * Evaluate whether the firmware enumerated @apic_id is the real BSP of a
 * system on which this kernel runs as a crash kernel.
 *
 * Only the first enumerated APIC ID is evaluated. Once real_bsp_apic_id has
 * been recorded, every further invocation returns false right away.
 *
 * Returns true if @apic_id must not be registered by the caller, false
 * otherwise.
 */
static __init bool check_for_real_bsp(u32 apic_id)
{
/* The APICBASE MSR is only consulted when boot_cpu_data.x86 is 6 or higher */
bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
u64 msr;
/*
 * There is no real good way to detect whether this is a kdump
 * kernel, but except on the Voyager SMP monstrosity which is no
 * longer supported, the real BSP APIC ID is the first one which is
 * enumerated by firmware. That allows to detect whether the boot
 * CPU is the real BSP. If it is not, then do not register the APIC
 * because sending INIT to the real BSP would reset the whole
 * system.
 *
 * The first APIC ID which is enumerated by firmware is detectable
 * because the boot CPU APIC ID is registered before that without
 * invoking this code.
 */
if (topo_info.real_bsp_apic_id != BAD_APICID)
return false;
/*
 * Check whether the enumeration order is broken by evaluating the
 * BSP bit in the APICBASE MSR. If the CPU does not have the
 * APICBASE MSR then the BSP detection is not possible and the
 * kernel must rely on the firmware enumeration order.
 */
if (has_apic_base) {
rdmsrl(MSR_IA32_APICBASE, msr);
is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
}
if (apic_id == topo_info.boot_cpu_apic_id) {
/*
 * If the boot CPU has the APIC BSP bit set then the
 * firmware enumeration is agreeing. If the CPU does not
 * have the APICBASE MSR then the only choice is to trust
 * the enumeration order.
 */
if (is_bsp || !has_apic_base) {
topo_info.real_bsp_apic_id = apic_id;
return false;
}
/*
 * If the boot APIC is enumerated first, but the APICBASE
 * MSR does not have the BSP bit set, then there is no way
 * to discover the real BSP here. Assume a crash kernel and
 * limit the number of CPUs to 1 as an INIT to the real BSP
 * would reset the machine.
 */
pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
set_nr_cpu_ids(1);
goto fwbug;
}
/* The first enumerated APIC ID is not the one of the boot CPU */
pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
topo_info.boot_cpu_apic_id, apic_id);
if (is_bsp) {
/*
 * The boot CPU has the APIC BSP bit set. Use it and complain
 * about the broken firmware enumeration.
 */
topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
goto fwbug;
}
/*
 * The boot CPU is neither enumerated first nor marked as BSP. Treat
 * @apic_id as the real BSP and tell the caller to leave it alone.
 */
pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
topo_info.real_bsp_apic_id = apic_id;
return true;
fwbug:
pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
return false;
}
/*
 * Count the bits set in @map within the APIC ID range which starts at
 * @lvlid and spans one unit of the topology domain @at_level.
 */
static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
				    unsigned long *map)
{
	/* Exclusive upper bound of the range covered by @lvlid */
	unsigned int limit = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
	unsigned int pos = find_next_bit(map, limit, lvlid);
	unsigned int units = 0;

	/* There is no bitmap_weight_range(), so walk the set bits by hand */
	while (pos < limit) {
		units++;
		pos = find_next_bit(map, limit, pos + 1);
	}
	return units;
}
/*
 * Account @apic_id either as a present CPU, which gets a CPU number right
 * away, or as a disabled (not present) one, and set the corresponding bit
 * in the APIC bitmap of every topology domain.
 */
static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	int cpu, dom;

	if (!present) {
		u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);

		/*
		 * On bare metal a not present APIC is refused when its
		 * package already contains present APICs. Guests are
		 * allowed to expose such a setup.
		 */
		if (hypervisor_is_type(X86_HYPER_NATIVE) &&
		    topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
			pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
				     apic_id);
			topo_info.nr_rejected_cpus++;
			return;
		}

		topo_info.nr_disabled_cpus++;
	} else {
		set_bit(apic_id, phys_cpu_present_map);

		/*
		 * The boot CPU APIC may show up a second time here as it
		 * is registered ahead of the enumeration done by firmware
		 * parsers or VM guest mechanisms. It always stays CPU #0.
		 */
		cpu = apic_id == topo_info.boot_cpu_apic_id ? 0 : topo_get_cpunr(apic_id);

		cpuid_to_apicid[cpu] = apic_id;
		topo_set_cpuids(cpu, apic_id, acpi_id);
	}

	/*
	 * Mark the APIC in all domain maps, whether it is present or only
	 * possible. cpu_possible_map is updated after the enumeration in
	 * topology_init_possible_cpus().
	 */
	for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
		set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
}
/**
 * topology_register_apic - Register an APIC in early topology maps
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated to the APIC
 * @present:	True if the corresponding CPU is present
 *
 * The APIC is counted as rejected and not registered when @apic_id is
 * beyond MAX_LOCAL_APIC, when check_for_real_bsp() vetoes it, or when no
 * CPU numbers are left and @apic_id is not the boot CPU APIC ID.
 */
void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
{
	if (apic_id >= MAX_LOCAL_APIC) {
		pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
		goto reject;
	}

	/* May lower nr_cpu_ids, so it has to run before the limit check */
	if (check_for_real_bsp(apic_id))
		goto reject;

	/* Out of CPU numbers? The boot CPU always owns CPU #0. */
	if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
		pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
		goto reject;
	}

	topo_register_apic(apic_id, acpi_id, present);
	return;

reject:
	topo_info.nr_rejected_cpus++;
}
/**
 * topology_register_boot_apic - Register the boot CPU APIC
 * @apic_id:	The APIC ID to set up
 *
 * Kept apart from topology_register_apic() so that the boot CPU ends up as
 * CPU #0. Warns once if a boot CPU APIC ID has been recorded before.
 */
void __init topology_register_boot_apic(u32 apic_id)
{
	bool already_registered = topo_info.boot_cpu_apic_id != BAD_APICID;

	WARN_ON_ONCE(already_registered);

	topo_info.boot_cpu_apic_id = apic_id;
	topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
}
/**
 * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
 * @apicid:		The APIC ID for which to lookup the logical ID
 * @at_level:		The topology domain level to use
 *
 * @apicid has to be a full APIC ID and not the normalized variant. The bits
 * below the domain level given by @at_level may all be clear, which means
 * that real APIC IDs and backshifted normalized APIC IDs both work.
 *
 * Returns:
 *  - >= 0:	The requested logical ID
 *  - -ERANGE:	@apicid is out of range
 *  - -ENODEV:	@apicid is not registered
 */
int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
{
	unsigned long *map = apic_maps[at_level].map;
	/* Strip the bits below @at_level to obtain the level ID of @apicid */
	unsigned int lvlid = topo_apicid(apicid, at_level);

	if (lvlid >= MAX_LOCAL_APIC)
		return -ERANGE;

	/* The logical ID is the number of registered IDs in front of @lvlid */
	if (test_bit(lvlid, map))
		return bitmap_weight(map, lvlid);

	return -ENODEV;
}
EXPORT_SYMBOL_GPL(topology_get_logical_id);
/**
 * topology_unit_count - Retrieve the count of specified units at a given topology domain level
 * @apicid:		The APIC ID which specifies the search range
 * @which_units:	The domain level specifying the units to count
 * @at_level:		The domain level at which @which_units have to be counted
 *
 * The result is the number of possible units according to the enumerated
 * information.
 *
 * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
 * counts the number of possible cores in the package to which @apicid
 * belongs.
 *
 * @at_level has to be greater than @which_units to produce useful results.
 * If @at_level is equal to @which_units the result is 1. If @at_level is
 * less than @which_units the result is undefined by definition and the
 * function returns 0. 0 is also returned when the level ID of @apicid is
 * out of range or not registered at @at_level.
 */
unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
				 enum x86_topology_domains at_level)
{
	/* Strip the bits below @at_level to obtain the level ID of @apicid */
	unsigned int lvlid = topo_apicid(apicid, at_level);

	if (lvlid >= MAX_LOCAL_APIC || !test_bit(lvlid, apic_maps[at_level].map))
		return 0;

	/* Same level counts as one unit, a higher level as none */
	if (which_units >= at_level)
		return which_units == at_level ? 1 : 0;

	return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
}
#ifdef CONFIG_ACPI_HOTPLUG_CPU
/**
 * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
 * @apic_id:	The APIC ID to set up
 * @acpi_id:	The ACPI ID associated to the APIC
 *
 * Returns:
 *  - >= 0:	The CPU number which belongs to @apic_id
 *  - -EINVAL:	@apic_id is beyond MAX_LOCAL_APIC
 *  - -ENODEV:	@apic_id was not registered during enumeration
 *  - -ENOSPC:	No CPU number is assigned to @apic_id
 */
int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
{
	int cpu;

	if (apic_id >= MAX_LOCAL_APIC)
		return -EINVAL;

	/* Only APIC IDs which were seen during enumeration are accepted */
	if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
		return -ENODEV;

	cpu = topo_lookup_cpuid(apic_id);
	if (cpu >= 0) {
		set_bit(apic_id, phys_cpu_present_map);
		topo_set_cpuids(cpu, apic_id, acpi_id);
		cpu_mark_primary_thread(cpu, apic_id);
		return cpu;
	}

	return -ENOSPC;
}
/**
 * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
 * @cpu:	The CPU number for which the APIC ID is removed
 *
 * Does nothing when no APIC ID is recorded for @cpu in cpuid_to_apicid[].
 */
void topology_hotunplug_apic(unsigned int cpu)
{
	u32 apic_id = cpuid_to_apicid[cpu];

	if (apic_id != BAD_APICID) {
		per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
		clear_bit(apic_id, phys_cpu_present_map);
		set_cpu_present(cpu, false);
	}
}
#endif
#ifdef CONFIG_X86_LOCAL_APIC
/* Upper bound set by 'possible_cpus=N'. Defaults to NR_CPUS. */
static unsigned int max_possible_cpus __initdata = NR_CPUS;

/**
 * topology_apply_cmdline_limits_early - Apply topology command line limits early
 *
 * Makes sure that the limits given on the command line are in place
 * before the firmware tables are parsed.
 */
void __init topology_apply_cmdline_limits_early(void)
{
	unsigned int limit;

	/* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */
	if (setup_max_cpus && !ioapic_is_disabled && !apic_is_disabled)
		limit = nr_cpu_ids;
	else
		limit = 1;

	/* 'possible_cpus=N' */
	if (max_possible_cpus < limit)
		limit = max_possible_cpus;

	/* Nothing to do unless the limit is tighter than nr_cpu_ids */
	if (limit >= nr_cpu_ids)
		return;

	pr_info("Limiting to %u possible CPUs\n", limit);
	set_nr_cpu_ids(limit);
}
/*
 * Tell whether the system has to be limited to a single CPU: no SMP
 * configuration was found, the IO/APIC is disabled, or the local APIC is
 * disabled on anything but a XEN PV domain.
 */
static __init bool restrict_to_up(void)
{
	bool no_smp = !smp_found_config || ioapic_is_disabled;

	if (no_smp)
		return true;

	/*
	 * XEN PV does not advertise the local APIC properly, but it
	 * provides a fake topology so that the infrastructure works.
	 * Hence the APIC related restriction is not applied to it.
	 */
	return !xen_pv_domain() && apic_is_disabled;
}
/**
 * topology_init_possible_cpus - Size the possible and present CPU masks after enumeration
 *
 * Clamps the number of assigned and disabled CPUs to nr_cpu_ids, or to a
 * single CPU when restrict_to_up() demands it, derives the __max_* and
 * __num_* topology limits from the APIC domain bitmaps, assigns CPU numbers
 * to the registered but not present APIC IDs and finally populates the
 * possible and present CPU masks.
 */
void __init topology_init_possible_cpus(void)
{
unsigned int assigned = topo_info.nr_assigned_cpus;
unsigned int disabled = topo_info.nr_disabled_cpus;
unsigned int cnta, cntb, cpu, allowed = 1;
unsigned int total = assigned + disabled;
u32 apicid, firstid;
/*
 * If there was no APIC registered, then fake one so that the
 * topology bitmap is populated. That ensures that the code below
 * is valid and the various query interfaces can be used
 * unconditionally. This does not affect the actual APIC code in
 * any way because either the local APIC address has not been
 * registered or the local APIC was disabled on the command line.
 */
if (topo_info.boot_cpu_apic_id == BAD_APICID)
topology_register_boot_apic(0);
/* @allowed stays at 1 when the system is restricted to UP */
if (!restrict_to_up()) {
if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
disabled += assigned - nr_cpu_ids;
assigned = nr_cpu_ids;
}
allowed = min_t(unsigned int, total, nr_cpu_ids);
}
if (total > allowed)
pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
/* Present CPUs take precedence, the remainder is left for disabled ones */
assigned = min_t(unsigned int, allowed, assigned);
disabled = allowed - assigned;
topo_info.nr_assigned_cpus = assigned;
topo_info.nr_disabled_cpus = disabled;
total_cpus = allowed;
set_nr_cpu_ids(allowed);
/* Package and die limits are derived from the weight of the domain maps */
cnta = domain_weight(TOPO_PKG_DOMAIN);
cntb = domain_weight(TOPO_DIE_DOMAIN);
__max_logical_packages = cnta;
__max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
pr_info("Max. logical packages: %3u\n", cnta);
pr_info("Max. logical dies: %3u\n", cntb);
pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
cnta = domain_weight(TOPO_CORE_DOMAIN);
cntb = domain_weight(TOPO_SMT_DOMAIN);
/*
 * Can't use order delta here as order(cnta) can be equal
 * order(cntb) even if cnta != cntb.
 */
__max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
/* The per package counts are taken from the package of the first registered APIC ID */
firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
__num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
__num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
if (topo_info.nr_rejected_cpus)
pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
/* Start over with only CPU #0 in the present and possible masks */
init_cpu_present(cpumask_of(0));
init_cpu_possible(cpumask_of(0));
/*
 * Assign CPU numbers to non-present CPUs, i.e. to the APIC IDs which
 * are set in the SMT domain map but not in phys_cpu_present_map.
 */
for (apicid = 0; disabled; disabled--, apicid++) {
apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
MAX_LOCAL_APIC, apicid);
if (apicid >= MAX_LOCAL_APIC)
break;
cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
}
/*
 * Every allowed CPU number is possible. CPU numbers without an APIC ID
 * are neither marked as primary thread nor as present.
 */
for (cpu = 0; cpu < allowed; cpu++) {
apicid = cpuid_to_apicid[cpu];
set_cpu_possible(cpu, true);
if (apicid == BAD_APICID)
continue;
cpu_mark_primary_thread(cpu, apicid);
set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
}
}
/*
 * Late fallback to UP when the APIC/IOAPIC setup failed after the CPU
 * masks had been sized already. Leaves only CPU #0 in the present and
 * possible masks and only the boot CPU APIC, if one is recorded, in
 * phys_cpu_present_map.
 */
void __init topology_reset_possible_cpus_up(void)
{
	u32 boot_apic = topo_info.boot_cpu_apic_id;

	init_cpu_present(cpumask_of(0));
	init_cpu_possible(cpumask_of(0));

	bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
	if (boot_apic == BAD_APICID)
		return;

	set_bit(boot_apic, phys_cpu_present_map);
}
/*
 * Early parameter handler for 'possible_cpus=N'. Stores the parsed value
 * in max_possible_cpus. Always returns 0.
 */
static int __init setup_possible_cpus(char *str)
{
	char *arg = str;

	get_option(&arg, &max_possible_cpus);
	return 0;
}
early_param("possible_cpus", setup_possible_cpus);
#endif