A crash-kernel boot failure was reported on the openEuler 5.10 series kernel (in an x86 virtual machine with a Hi1822 virtual NIC). The issue was eventually narrowed down to an IRQ vector conflict.
Backport the following patch set from the kernel mailing list to fix this issue: https://lore.kernel.org/linux-pci/20181018183721.27467-3-gpiccoli@canonical....
Guilherme G. Piccoli (2): x86/PCI: Export find_cap() to be used in early PCI code x86/quirks: Add parameter to clear MSIs early on boot
.../admin-guide/kernel-parameters.txt | 6 ++++ arch/x86/include/asm/pci-direct.h | 2 ++ arch/x86/kernel/aperture_64.c | 30 ++--------------- arch/x86/kernel/early-quirks.c | 32 +++++++++++++++++++ arch/x86/pci/common.c | 4 +++ arch/x86/pci/early.c | 25 +++++++++++++++ 6 files changed, 71 insertions(+), 28 deletions(-)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/4091 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...
Feedback: The patch(es) that you sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/4091 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/H...
From: "Guilherme G. Piccoli" gpiccoli@canonical.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8X2RA CVE: NA
Reference: https://lore.kernel.org/linux-pci/20181018183721.27467-2-gpiccoli@canonical....
-------------------------------
This patch exports the function find_cap(), renaming it to pci_early_find_cap(), so that it can be used by the early PCI quirk code added in the next patch.
This is being moved out from AGP code to generic early-PCI code since it's not AGP-specific and can be used for any PCI device. No functional changes intended.
Signed-off-by: Guilherme G. Piccoli gpiccoli@canonical.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/pci-direct.h | 1 + arch/x86/kernel/aperture_64.c | 30 ++---------------------------- arch/x86/pci/early.c | 25 +++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 28 deletions(-)
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h index 94597a3cf3d0..813996305bf5 100644 --- a/arch/x86/include/asm/pci-direct.h +++ b/arch/x86/include/asm/pci-direct.h @@ -10,6 +10,7 @@ extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset); extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset); extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset); +extern u32 pci_early_find_cap(int bus, int slot, int func, int cap); extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 294ed4392a0e..cb25612b322d 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -126,32 +126,6 @@ static u32 __init allocate_aperture(void) }
-/* Find a PCI capability */ -static u32 __init find_cap(int bus, int slot, int func, int cap) -{ - int bytes; - u8 pos; - - if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - pos = read_pci_config_byte(bus, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - /* Read a standard AGPv3 bridge header */ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) { @@ -240,8 +214,8 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp) case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(bus, slot, func, - PCI_CAP_ID_AGP); + cap = pci_early_find_cap(bus, slot, + func, PCI_CAP_ID_AGP); if (!cap) break; *valid_agp = 1; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index f5fc953e5848..f1ba9d781b52 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -51,6 +51,31 @@ void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) outw(val, 0xcfc + (offset&2)); }
+u32 pci_early_find_cap(int bus, int slot, int func, int cap) +{ + int bytes; + u8 pos; + + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(bus, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + int early_pci_allowed(void) { return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) ==
From: "Guilherme G. Piccoli" gpiccoli@canonical.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8X2RA CVE: NA
Reference: https://lore.kernel.org/linux-pci/20181018183721.27467-3-gpiccoli@canonical....
-------------------------------
We observed a kdump failure on x86 that was narrowed down to an MSI IRQ storm coming from a PCI network device. The bug manifests as a lack of progress in the boot process of the kdump kernel, and a flood of kernel messages like:
[...] [ 342.265294] do_IRQ: 0.155 No irq handler for vector [ 342.266916] do_IRQ: 0.155 No irq handler for vector [ 347.258422] do_IRQ: 14053260 callbacks suppressed [...]
The root cause of the issue is that the kexec process of the kdump kernel doesn't ensure that PCI devices are reset or that their MSI capabilities are disabled, so a PCI adapter could produce a huge number of IRQs that would steal all the CPU's processing time (especially since we usually restrict the kdump kernel to a single CPU).
This patch implements the kernel parameter "pci=clearmsi" to clear the MSI/MSI-X enable bits in the Message Control register for all PCI devices during early boot time, thus preventing potential issues in the kexec'ed kernel. PCI spec also supports/enforces this need (see PCI Local Bus spec sections 6.8.1.3 and 6.8.2.3).
Suggested-by: Dan Streetman ddstreet@canonical.com Suggested-by: Gavin Shan shan.gavin@linux.alibaba.com Signed-off-by: Guilherme G. Piccoli gpiccoli@canonical.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 6 ++++ arch/x86/include/asm/pci-direct.h | 1 + arch/x86/kernel/early-quirks.c | 32 +++++++++++++++++++ arch/x86/pci/common.c | 4 +++ 4 files changed, 43 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a3feab139811..f6281e851615 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3905,6 +3905,12 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. + clearmsi [X86] Clears MSI/MSI-X enable bits early in boot + time in order to avoid issues like adapters + screaming irqs and preventing boot progress. + Also, it enforces the PCI Local Bus spec + rule that those bits should be 0 in system reset + events (useful for kexec/kdump cases). noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h index 813996305bf5..ebb3db2eee41 100644 --- a/arch/x86/include/asm/pci-direct.h +++ b/arch/x86/include/asm/pci-direct.h @@ -15,5 +15,6 @@ extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
+extern unsigned int pci_early_clear_msi; extern int early_pci_allowed(void); #endif /* _ASM_X86_PCI_DIRECT_H */ diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 8e27cbefaa4b..d2b0f1af830f 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -28,6 +28,37 @@ #include <asm/irq_remapping.h> #include <asm/early_ioremap.h>
+static void __init early_pci_clear_msi(int bus, int slot, int func) +{ + int pos; + u16 ctrl; + + if (likely(!pci_early_clear_msi)) + return; + + pr_info_once("Clearing MSI/MSI-X enable bits early in boot (quirk)\n"); + + pos = pci_early_find_cap(bus, slot, func, PCI_CAP_ID_MSI); + if (pos) { + ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSI_FLAGS); + ctrl &= ~PCI_MSI_FLAGS_ENABLE; + write_pci_config_16(bus, slot, func, pos + PCI_MSI_FLAGS, ctrl); + + /* Read again to flush previous write */ + ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSI_FLAGS); + } + + pos = pci_early_find_cap(bus, slot, func, PCI_CAP_ID_MSIX); + if (pos) { + ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSIX_FLAGS); + ctrl &= ~PCI_MSIX_FLAGS_ENABLE; + write_pci_config_16(bus, slot, func, pos + PCI_MSIX_FLAGS, ctrl); + + /* Read again to flush previous write */ + ctrl = read_pci_config_16(bus, slot, func, pos + PCI_MSIX_FLAGS); + } +} + static void __init fix_hypertransport_config(int num, int slot, int func) { u32 htcfg; @@ -721,6 +752,7 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, + { PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0, early_pci_clear_msi}, {} };
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3507f456fcd0..13b0dd380aad 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -34,6 +34,7 @@ int noioapicreroute = 1; #endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; +unsigned int pci_early_clear_msi; const struct pci_raw_ops *__read_mostly raw_pci_ops; const struct pci_raw_ops *__read_mostly raw_pci_ext_ops;
@@ -606,6 +607,9 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "skip_isa_align")) { pci_probe |= PCI_CAN_SKIP_ISA_ALIGN; return NULL; + } else if (!strcmp(str, "clearmsi")) { + pci_early_clear_msi = 1; + return NULL; } else if (!strcmp(str, "noioapicquirk")) { noioapicquirk = 1; return NULL;