2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Low-Level PCI Support for PC
|
|
|
|
*
|
|
|
|
* (c) 1999--2000 Martin Mares <mj@ucw.cz>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/pci.h>
|
|
|
|
#include <linux/ioport.h>
|
|
|
|
#include <linux/init.h>
|
2006-02-18 02:36:55 -07:00
|
|
|
#include <linux/dmi.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable easily on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 01:04:11 -07:00
|
|
|
#include <linux/slab.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#include <asm/acpi.h>
|
|
|
|
#include <asm/segment.h>
|
|
|
|
#include <asm/io.h>
|
|
|
|
#include <asm/smp.h>
|
2008-12-27 06:02:28 -07:00
|
|
|
#include <asm/pci_x86.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
|
|
|
|
PCI_PROBE_MMCONF;
|
|
|
|
|
2008-05-22 14:35:11 -07:00
|
|
|
unsigned int pci_early_dump_regs;
|
2006-11-16 05:16:23 -07:00
|
|
|
static int pci_bf_sort;
|
2005-04-16 15:20:36 -07:00
|
|
|
int pci_routeirq;
|
2008-06-11 07:35:14 -07:00
|
|
|
int noioapicquirk;
|
2008-07-15 04:48:55 -07:00
|
|
|
#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
|
|
|
|
int noioapicreroute = 0;
|
|
|
|
#else
|
2008-06-11 07:35:15 -07:00
|
|
|
int noioapicreroute = 1;
|
2008-07-15 04:48:55 -07:00
|
|
|
#endif
|
2005-04-16 15:20:36 -07:00
|
|
|
int pcibios_last_bus = -1;
|
2005-03-21 21:20:42 -07:00
|
|
|
unsigned long pirq_table_addr;
|
|
|
|
struct pci_bus *pci_root_bus;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct pci_raw_ops *raw_pci_ops;
|
2008-02-10 07:45:28 -07:00
|
|
|
struct pci_raw_ops *raw_pci_ext_ops;
|
|
|
|
|
|
|
|
int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn,
|
|
|
|
int reg, int len, u32 *val)
|
|
|
|
{
|
2008-07-11 14:21:17 -07:00
|
|
|
if (domain == 0 && reg < 256 && raw_pci_ops)
|
2008-02-10 07:45:28 -07:00
|
|
|
return raw_pci_ops->read(domain, bus, devfn, reg, len, val);
|
|
|
|
if (raw_pci_ext_ops)
|
|
|
|
return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
|
|
|
|
int reg, int len, u32 val)
|
|
|
|
{
|
2008-07-11 14:21:17 -07:00
|
|
|
if (domain == 0 && reg < 256 && raw_pci_ops)
|
2008-02-10 07:45:28 -07:00
|
|
|
return raw_pci_ops->write(domain, bus, devfn, reg, len, val);
|
|
|
|
if (raw_pci_ext_ops)
|
|
|
|
return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *value)
|
|
|
|
{
|
2008-02-10 07:45:28 -07:00
|
|
|
return raw_pci_read(pci_domain_nr(bus), bus->number,
|
2007-10-11 13:58:30 -07:00
|
|
|
devfn, where, size, value);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 value)
|
|
|
|
{
|
2008-02-10 07:45:28 -07:00
|
|
|
return raw_pci_write(pci_domain_nr(bus), bus->number,
|
2007-10-11 13:58:30 -07:00
|
|
|
devfn, where, size, value);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
struct pci_ops pci_root_ops = {
|
|
|
|
.read = pci_read,
|
|
|
|
.write = pci_write,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This interrupt-safe spinlock protects all accesses to PCI
|
|
|
|
* configuration space.
|
|
|
|
*/
|
2010-02-17 07:35:25 -07:00
|
|
|
DEFINE_RAW_SPINLOCK(pci_config_lock);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-03-27 01:31:18 -07:00
|
|
|
static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
|
|
|
|
{
|
|
|
|
pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
|
|
|
|
printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-12 05:09:57 -07:00
|
|
|
static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __devinitconst = {
|
2008-03-27 01:31:18 -07:00
|
|
|
/*
|
|
|
|
* Systems where PCI IO resource ISA alignment can be skipped
|
|
|
|
* when the ISA enable bit in the bridge control is not set
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
.callback = can_skip_ioresource_align,
|
|
|
|
.ident = "IBM System x3800",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = can_skip_ioresource_align,
|
|
|
|
.ident = "IBM System x3850",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "x3850"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = can_skip_ioresource_align,
|
|
|
|
.ident = "IBM System x3950",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "x3950"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{}
|
|
|
|
};
|
|
|
|
|
|
|
|
void __init dmi_check_skip_isa_align(void)
|
|
|
|
{
|
|
|
|
dmi_check_system(can_skip_pciprobe_dmi_table);
|
|
|
|
}
|
|
|
|
|
2008-05-12 13:57:46 -07:00
|
|
|
static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
|
|
|
|
{
|
|
|
|
struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
|
|
|
|
|
|
|
|
if (pci_probe & PCI_NOASSIGN_ROMS) {
|
|
|
|
if (rom_r->parent)
|
|
|
|
return;
|
|
|
|
if (rom_r->start) {
|
|
|
|
/* we deal with BIOS assigned ROM later */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
rom_r->start = rom_r->end = rom_r->flags = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Called after each bus is probed, but before its children
|
|
|
|
* are examined.
|
|
|
|
*/
|
|
|
|
|
2009-04-16 12:31:10 -07:00
|
|
|
void __devinit pcibios_fixup_bus(struct pci_bus *b)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-05-12 13:57:46 -07:00
|
|
|
struct pci_dev *dev;
|
|
|
|
|
2009-04-18 10:11:25 -07:00
|
|
|
/* root bus? */
|
|
|
|
if (!b->parent)
|
|
|
|
x86_pci_root_bus_res_quirks(b);
|
2005-04-16 15:20:36 -07:00
|
|
|
pci_read_bridge_bases(b);
|
2008-05-12 13:57:46 -07:00
|
|
|
list_for_each_entry(dev, &b->devices, bus_list)
|
|
|
|
pcibios_fixup_device_resources(dev);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
/*
|
|
|
|
* Only use DMI information to set this if nothing was passed
|
|
|
|
* on the kernel command line (which was parsed earlier).
|
|
|
|
*/
|
|
|
|
|
2007-10-03 12:15:40 -07:00
|
|
|
static int __devinit set_bf_sort(const struct dmi_system_id *d)
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
{
|
|
|
|
if (pci_bf_sort == pci_bf_sort_default) {
|
|
|
|
pci_bf_sort = pci_dmi_bf;
|
|
|
|
printk(KERN_INFO "PCI: %s detected, enabling pci=bfsort.\n", d->ident);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-02-18 02:36:55 -07:00
|
|
|
/*
|
|
|
|
* Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
|
|
|
|
*/
|
|
|
|
#ifdef __i386__
|
2007-10-03 12:15:40 -07:00
|
|
|
static int __devinit assign_all_busses(const struct dmi_system_id *d)
|
2006-02-18 02:36:55 -07:00
|
|
|
{
|
|
|
|
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
|
|
|
|
printk(KERN_INFO "%s detected: enabling PCI bus# renumbering"
|
|
|
|
" (pci=assign-busses)\n", d->ident);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2009-03-12 05:09:57 -07:00
|
|
|
static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
#ifdef __i386__
|
2006-02-18 02:36:55 -07:00
|
|
|
/*
|
|
|
|
* Laptops which need pci=assign-busses to see Cardbus cards
|
|
|
|
*/
|
|
|
|
{
|
|
|
|
.callback = assign_all_busses,
|
|
|
|
.ident = "Samsung X20 Laptop",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "SX20S"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
#endif /* __i386__ */
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "Dell PowerEdge 1950",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1950"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "Dell PowerEdge 1955",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1955"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "Dell PowerEdge 2900",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2900"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "Dell PowerEdge 2950",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2950"),
|
|
|
|
},
|
|
|
|
},
|
2007-03-23 21:58:07 -07:00
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "Dell PowerEdge R900",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge R900"),
|
|
|
|
},
|
|
|
|
},
|
2007-02-05 17:36:10 -07:00
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL20p G3",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G3"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL20p G4",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL20p G4"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL30p G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL30p G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL25p G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL25p G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL35p G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL35p G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL45p G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL45p G2",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL45p G2"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL460c G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL460c G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL465c G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL465c G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL480c G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL480c G1"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
|
|
|
.ident = "HP ProLiant BL685c G1",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant BL685c G1"),
|
|
|
|
},
|
|
|
|
},
|
2007-10-17 09:04:35 -07:00
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
2008-05-15 11:40:14 -07:00
|
|
|
.ident = "HP ProLiant DL360",
|
2007-10-17 09:04:35 -07:00
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
2008-05-15 11:40:14 -07:00
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"),
|
2007-10-17 09:04:35 -07:00
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
2008-05-15 11:40:14 -07:00
|
|
|
.ident = "HP ProLiant DL380",
|
2007-10-17 09:04:35 -07:00
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
2008-05-15 11:40:14 -07:00
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"),
|
2007-10-17 09:04:35 -07:00
|
|
|
},
|
|
|
|
},
|
2007-09-13 11:21:34 -07:00
|
|
|
#ifdef __i386__
|
|
|
|
{
|
|
|
|
.callback = assign_all_busses,
|
|
|
|
.ident = "Compaq EVO N800c",
|
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
|
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "EVO N800c"),
|
|
|
|
},
|
|
|
|
},
|
|
|
|
#endif
|
2007-11-26 12:42:19 -07:00
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
2008-07-07 09:55:26 -07:00
|
|
|
.ident = "HP ProLiant DL385 G2",
|
2007-11-26 12:42:19 -07:00
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
2008-07-07 09:55:26 -07:00
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"),
|
2007-11-26 12:42:19 -07:00
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.callback = set_bf_sort,
|
2008-07-07 09:55:26 -07:00
|
|
|
.ident = "HP ProLiant DL585 G2",
|
2007-11-26 12:42:19 -07:00
|
|
|
.matches = {
|
|
|
|
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
|
2008-07-07 09:55:26 -07:00
|
|
|
DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
|
2007-11-26 12:42:19 -07:00
|
|
|
},
|
|
|
|
},
|
2006-02-18 02:36:55 -07:00
|
|
|
{}
|
|
|
|
};
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-04-14 15:40:37 -07:00
|
|
|
void __init dmi_check_pciprobe(void)
|
|
|
|
{
|
|
|
|
dmi_check_system(pciprobe_dmi_table);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
struct pci_bus * __devinit pcibios_scan_root(int busnum)
|
|
|
|
{
|
|
|
|
struct pci_bus *bus = NULL;
|
2007-07-21 14:23:39 -07:00
|
|
|
struct pci_sysdata *sd;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
while ((bus = pci_find_next_bus(bus)) != NULL) {
|
|
|
|
if (bus->number == busnum) {
|
|
|
|
/* Already scanned */
|
|
|
|
return bus;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-07-21 14:23:39 -07:00
|
|
|
/* Allocate per-root-bus (not per bus) arch-specific data.
|
|
|
|
* TODO: leak; this memory is never freed.
|
|
|
|
* It's arguable whether it's worth the trouble to care.
|
|
|
|
*/
|
|
|
|
sd = kzalloc(sizeof(*sd), GFP_KERNEL);
|
|
|
|
if (!sd) {
|
|
|
|
printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
sd->node = get_mp_bus_to_node(busnum);
|
|
|
|
|
2005-11-23 16:44:49 -07:00
|
|
|
printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
|
|
|
|
if (!bus)
|
|
|
|
kfree(sd);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
return bus;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2008-07-02 13:50:29 -07:00
|
|
|
int __init pcibios_init(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct cpuinfo_x86 *c = &boot_cpu_data;
|
|
|
|
|
|
|
|
if (!raw_pci_ops) {
|
2005-11-23 16:44:49 -07:00
|
|
|
printk(KERN_WARNING "PCI: System does not support PCI\n");
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2009-10-14 13:31:39 -07:00
|
|
|
* Set PCI cacheline size to that of the CPU if the CPU has reported it.
|
|
|
|
* (For older CPUs that don't support cpuid, we se it to 32 bytes
|
|
|
|
* It's also good for 386/486s (which actually have 16)
|
2005-04-16 15:20:36 -07:00
|
|
|
* as quite a few PCI devices do not support smaller values.
|
|
|
|
*/
|
2009-10-14 13:31:39 -07:00
|
|
|
if (c->x86_clflush_size > 0) {
|
|
|
|
pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
|
|
|
|
printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
|
|
|
|
pci_dfl_cache_line_size << 2);
|
|
|
|
} else {
|
|
|
|
pci_dfl_cache_line_size = 32 >> 2;
|
|
|
|
printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
pcibios_resource_survey();
|
|
|
|
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
if (pci_bf_sort >= pci_force_bf)
|
|
|
|
pci_sort_breadthfirst();
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
char * __devinit pcibios_setup(char *str)
|
|
|
|
{
|
|
|
|
if (!strcmp(str, "off")) {
|
|
|
|
pci_probe = 0;
|
|
|
|
return NULL;
|
PCI: optionally sort device lists breadth-first
Problem:
New Dell PowerEdge servers have 2 embedded ethernet ports, which are
labeled NIC1 and NIC2 on the chassis, in the BIOS setup screens, and
in the printed documentation. Assuming no other add-in ethernet ports
in the system, Linux 2.4 kernels name these eth0 and eth1
respectively. Many people have come to expect this naming. Linux 2.6
kernels name these eth1 and eth0 respectively (backwards from
expectations). I also have reports that various Sun and HP servers
have similar behavior.
Root cause:
Linux 2.4 kernels walk the pci_devices list, which happens to be
sorted in breadth-first order (or pcbios_find_device order on i386,
which most often is breadth-first also). 2.6 kernels have both the
pci_devices list and the pci_bus_type.klist_devices list, the latter
is what is walked at driver load time to match the pci_id tables; this
klist happens to be in depth-first order.
On systems where, for physical routing reasons, NIC1 appears on a
lower bus number than NIC2, but NIC2's bridge is discovered first in
the depth-first ordering, NIC2 will be discovered before NIC1. If the
list were sorted breadth-first, NIC1 would be discovered before NIC2.
A PowerEdge 1955 system has the following topology which easily
exhibits the difference between depth-first and breadth-first device
lists.
-[0000:00]-+-00.0 Intel Corporation 5000P Chipset Memory Controller Hub
+-02.0-[0000:03-08]--+-00.0-[0000:04-07]--+-00.0-[0000:05-06]----00.0-[0000:06]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC2, 2.4 kernel name eth1, 2.6 kernel name eth0)
+-1c.0-[0000:01-02]----00.0-[0000:02]----00.0 Broadcom Corporation NetXtreme II BCM5708S Gigabit Ethernet (labeled NIC1, 2.4 kernel name eth0, 2.6 kernel name eth1)
Other factors, such as device driver load order and the presence of
PCI slots at various points in the bus hierarchy further complicate
this problem; I'm not trying to solve those here, just restore the
device order, and thus basic behavior, that 2.4 kernels had.
Solution:
The solution can come in multiple steps.
Suggested fix #1: kernel
Patch below optionally sorts the two device lists into breadth-first
ordering to maintain compatibility with 2.4 kernels. It adds two new
command line options:
pci=bfsort
pci=nobfsort
to force the sort order, or not, as you wish. It also adds DMI checks
for the specific Dell systems which exhibit "backwards" ordering, to
make them "right".
Suggested fix #2: udev rules from userland
Many people also have the expectation that embedded NICs are always
discovered before add-in NICs (which this patch does not try to do).
Using the PCI IRQ Routing Table provided by system BIOS, it's easy to
determine which PCI devices are embedded, or if add-in, which PCI slot
they're in. I'm working on a tool that would allow udev to name
ethernet devices in ascending embedded, slot 1 .. slot N order,
subsort by PCI bus/dev/fn breadth-first. It'll be possible to use it
independent of udev as well for those distributions that don't use
udev in their installers.
Suggested fix #3: system board routing rules
One can constrain the system board layout to put NIC1 ahead of NIC2
regardless of breadth-first or depth-first discovery order. This adds
a significant level of complexity to board routing, and may not be
possible in all instances (witness the above systems from several
major manufacturers). I don't want to encourage this particular train
of thought too far, at the expense of not doing #1 or #2 above.
Feedback appreciated. Patch tested on a Dell PowerEdge 1955 blade
with 2.6.18.
You'll also note I took some liberty and temporarily break the klist
abstraction to simplify and speed up the sort algorithm. I think
that's both safe and appropriate in this instance.
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
2006-09-29 13:23:23 -07:00
|
|
|
} else if (!strcmp(str, "bfsort")) {
|
|
|
|
pci_bf_sort = pci_force_bf;
|
|
|
|
return NULL;
|
|
|
|
} else if (!strcmp(str, "nobfsort")) {
|
|
|
|
pci_bf_sort = pci_force_nobf;
|
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
#ifdef CONFIG_PCI_BIOS
|
|
|
|
else if (!strcmp(str, "bios")) {
|
|
|
|
pci_probe = PCI_PROBE_BIOS;
|
|
|
|
return NULL;
|
|
|
|
} else if (!strcmp(str, "nobios")) {
|
|
|
|
pci_probe &= ~PCI_PROBE_BIOS;
|
|
|
|
return NULL;
|
|
|
|
} else if (!strcmp(str, "biosirq")) {
|
|
|
|
pci_probe |= PCI_BIOS_IRQ_SCAN;
|
|
|
|
return NULL;
|
2005-03-21 21:20:42 -07:00
|
|
|
} else if (!strncmp(str, "pirqaddr=", 9)) {
|
|
|
|
pirq_table_addr = simple_strtoul(str+9, NULL, 0);
|
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_PCI_DIRECT
|
|
|
|
else if (!strcmp(str, "conf1")) {
|
|
|
|
pci_probe = PCI_PROBE_CONF1 | PCI_NO_CHECKS;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
else if (!strcmp(str, "conf2")) {
|
|
|
|
pci_probe = PCI_PROBE_CONF2 | PCI_NO_CHECKS;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_PCI_MMCONFIG
|
|
|
|
else if (!strcmp(str, "nommconf")) {
|
|
|
|
pci_probe &= ~PCI_PROBE_MMCONF;
|
|
|
|
return NULL;
|
|
|
|
}
|
2008-04-14 16:08:25 -07:00
|
|
|
else if (!strcmp(str, "check_enable_amd_mmconf")) {
|
|
|
|
pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif
|
|
|
|
else if (!strcmp(str, "noacpi")) {
|
|
|
|
acpi_noirq_set();
|
|
|
|
return NULL;
|
|
|
|
}
|
2006-09-26 01:52:41 -07:00
|
|
|
else if (!strcmp(str, "noearly")) {
|
|
|
|
pci_probe |= PCI_PROBE_NOEARLY;
|
|
|
|
return NULL;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
#ifndef CONFIG_X86_VISWS
|
|
|
|
else if (!strcmp(str, "usepirqmask")) {
|
|
|
|
pci_probe |= PCI_USE_PIRQ_MASK;
|
|
|
|
return NULL;
|
|
|
|
} else if (!strncmp(str, "irqmask=", 8)) {
|
|
|
|
pcibios_irq_mask = simple_strtol(str+8, NULL, 0);
|
|
|
|
return NULL;
|
|
|
|
} else if (!strncmp(str, "lastbus=", 8)) {
|
|
|
|
pcibios_last_bus = simple_strtol(str+8, NULL, 0);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
else if (!strcmp(str, "rom")) {
|
|
|
|
pci_probe |= PCI_ASSIGN_ROMS;
|
|
|
|
return NULL;
|
2008-05-12 13:57:46 -07:00
|
|
|
} else if (!strcmp(str, "norom")) {
|
|
|
|
pci_probe |= PCI_NOASSIGN_ROMS;
|
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
} else if (!strcmp(str, "assign-busses")) {
|
|
|
|
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
|
|
|
|
return NULL;
|
2009-06-24 16:23:03 -07:00
|
|
|
} else if (!strcmp(str, "use_crs")) {
|
|
|
|
pci_probe |= PCI_USE__CRS;
|
2007-10-03 15:56:51 -07:00
|
|
|
return NULL;
|
2010-02-23 10:24:41 -07:00
|
|
|
} else if (!strcmp(str, "nocrs")) {
|
|
|
|
pci_probe |= PCI_ROOT_NO_CRS;
|
|
|
|
return NULL;
|
2008-05-22 14:35:11 -07:00
|
|
|
} else if (!strcmp(str, "earlydump")) {
|
|
|
|
pci_early_dump_regs = 1;
|
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
} else if (!strcmp(str, "routeirq")) {
|
|
|
|
pci_routeirq = 1;
|
|
|
|
return NULL;
|
2008-03-27 01:31:18 -07:00
|
|
|
} else if (!strcmp(str, "skip_isa_align")) {
|
|
|
|
pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
|
|
|
|
return NULL;
|
2008-06-11 07:35:14 -07:00
|
|
|
} else if (!strcmp(str, "noioapicquirk")) {
|
|
|
|
noioapicquirk = 1;
|
|
|
|
return NULL;
|
2008-06-11 07:35:15 -07:00
|
|
|
} else if (!strcmp(str, "ioapicreroute")) {
|
|
|
|
if (noioapicreroute != -1)
|
|
|
|
noioapicreroute = 0;
|
|
|
|
return NULL;
|
2008-07-15 04:48:55 -07:00
|
|
|
} else if (!strcmp(str, "noioapicreroute")) {
|
|
|
|
if (noioapicreroute != -1)
|
|
|
|
noioapicreroute = 1;
|
|
|
|
return NULL;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned int pcibios_assign_all_busses(void)
|
|
|
|
{
|
|
|
|
return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pcibios_enable_device(struct pci_dev *dev, int mask)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2008-03-04 11:57:01 -07:00
|
|
|
if ((err = pci_enable_resources(dev, mask)) < 0)
|
2005-04-16 15:20:36 -07:00
|
|
|
return err;
|
|
|
|
|
2009-01-05 06:50:27 -07:00
|
|
|
if (!pci_dev_msi_enabled(dev))
|
2007-03-28 06:36:09 -07:00
|
|
|
return pcibios_enable_irq(dev);
|
|
|
|
return 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2005-07-27 20:02:00 -07:00
|
|
|
|
|
|
|
void pcibios_disable_device (struct pci_dev *dev)
|
|
|
|
{
|
2009-01-05 06:50:27 -07:00
|
|
|
if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
|
2005-07-27 20:02:00 -07:00
|
|
|
pcibios_disable_irq(dev);
|
|
|
|
}
|
2007-08-10 13:01:19 -07:00
|
|
|
|
2008-11-10 15:30:50 -07:00
|
|
|
int pci_ext_cfg_avail(struct pci_dev *dev)
|
|
|
|
{
|
|
|
|
if (raw_pci_ext_ops)
|
|
|
|
return 1;
|
|
|
|
else
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-04-29 13:38:48 -07:00
|
|
|
struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
|
2007-08-10 13:01:19 -07:00
|
|
|
{
|
|
|
|
struct pci_bus *bus = NULL;
|
|
|
|
struct pci_sysdata *sd;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate per-root-bus (not per bus) arch-specific data.
|
|
|
|
* TODO: leak; this memory is never freed.
|
|
|
|
* It's arguable whether it's worth the trouble to care.
|
|
|
|
*/
|
|
|
|
sd = kzalloc(sizeof(*sd), GFP_KERNEL);
|
|
|
|
if (!sd) {
|
|
|
|
printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno);
|
|
|
|
return NULL;
|
|
|
|
}
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
sd->node = node;
|
|
|
|
bus = pci_scan_bus(busno, ops, sd);
|
2007-08-10 13:01:19 -07:00
|
|
|
if (!bus)
|
|
|
|
kfree(sd);
|
|
|
|
|
|
|
|
return bus;
|
|
|
|
}
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
|
2008-04-29 13:38:48 -07:00
|
|
|
struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno)
|
x86: get mp_bus_to_node early
Currently, on an amd k8 system with multi ht chains, the numa_node of
pci devices under /sys/devices/pci0000:80/* is always 0, even if that
chain is on node 1 or 2 or 3.
Workaround: pcibus_to_node(bus) is used when we want to get the node that
pci_device is on.
In struct device, we already have numa_node member, and we could use
dev_to_node()/set_dev_node() to get and set numa_node in the device.
set_dev_node is called in pci_device_add() with pcibus_to_node(bus),
and pcibus_to_node uses bus->sysdata for nodeid.
The problem is when pci_add_device is called, bus->sysdata is not assigned
correct nodeid yet. The result is that numa_node will always be 0.
pcibios_scan_root and pci_scan_root could take sysdata. So we need to get
mp_bus_to_node mapping before these two are called, and thus
get_mp_bus_to_node could get correct node for sysdata in root bus.
In scanning of the root bus, all child busses will take parent bus sysdata.
So all pci_device->dev.numa_node will be assigned correctly and automatically.
Later we could use dev_to_node(&pci_dev->dev) to get numa_node, and we
could also could make other bus specific device get the correct numa_node
too.
This is an updated version of pci_sysdata and Jeff's pci_domain patch.
[ mingo@elte.hu: build fix ]
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2008-02-19 04:20:09 -07:00
|
|
|
{
|
|
|
|
return pci_scan_bus_on_node(busno, &pci_root_ops, -1);
|
|
|
|
}
|
2009-07-10 14:04:30 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* NUMA info for PCI busses
|
|
|
|
*
|
|
|
|
* Early arch code is responsible for filling in reasonable values here.
|
|
|
|
* A node id of "-1" means "use current node". In other words, if a bus
|
|
|
|
* has a -1 node id, it's not tightly coupled to any particular chunk
|
|
|
|
* of memory (as is the case on some Nehalem systems).
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
|
|
|
|
#define BUS_NR 256
|
|
|
|
|
|
|
|
#ifdef CONFIG_X86_64
|
|
|
|
|
|
|
|
static int mp_bus_to_node[BUS_NR] = {
|
|
|
|
[0 ... BUS_NR - 1] = -1
|
|
|
|
};
|
|
|
|
|
|
|
|
void set_mp_bus_to_node(int busnum, int node)
|
|
|
|
{
|
|
|
|
if (busnum >= 0 && busnum < BUS_NR)
|
|
|
|
mp_bus_to_node[busnum] = node;
|
|
|
|
}
|
|
|
|
|
|
|
|
int get_mp_bus_to_node(int busnum)
|
|
|
|
{
|
|
|
|
int node = -1;
|
|
|
|
|
|
|
|
if (busnum < 0 || busnum > (BUS_NR - 1))
|
|
|
|
return node;
|
|
|
|
|
|
|
|
node = mp_bus_to_node[busnum];
|
|
|
|
|
|
|
|
/*
|
|
|
|
* let numa_node_id to decide it later in dma_alloc_pages
|
|
|
|
* if there is no ram on that node
|
|
|
|
*/
|
|
|
|
if (node != -1 && !node_online(node))
|
|
|
|
node = -1;
|
|
|
|
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
|
|
|
#else /* CONFIG_X86_32 */
|
|
|
|
|
2009-09-18 09:13:57 -07:00
|
|
|
static int mp_bus_to_node[BUS_NR] = {
|
2009-07-10 14:04:30 -07:00
|
|
|
[0 ... BUS_NR - 1] = -1
|
|
|
|
};
|
|
|
|
|
|
|
|
void set_mp_bus_to_node(int busnum, int node)
|
|
|
|
{
|
|
|
|
if (busnum >= 0 && busnum < BUS_NR)
|
|
|
|
mp_bus_to_node[busnum] = (unsigned char) node;
|
|
|
|
}
|
|
|
|
|
|
|
|
int get_mp_bus_to_node(int busnum)
|
|
|
|
{
|
|
|
|
int node;
|
|
|
|
|
|
|
|
if (busnum < 0 || busnum > (BUS_NR - 1))
|
|
|
|
return 0;
|
|
|
|
node = mp_bus_to_node[busnum];
|
|
|
|
return node;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_X86_32 */
|
|
|
|
|
|
|
|
#endif /* CONFIG_NUMA */
|