hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8XW3M
CVE: NA
-------------------------------------------------------------------------
When I tested 'aer-inject' with the following procedure:
1. inject a fatal error into an upstream PCI bridge
2. remove the upstream bridge by sysfs
3. rescan the PCI tree by 'echo 1 > /sys/bus/pci/rescan'
4. execute command 'rmmod aer-inject'
5. remove the upstream bridge by sysfs again
I came across the following Oops.
[ 799.713238] Internal error: Oops: 96000007 [#1] SMP
[ 799.718099] Process bash (pid: 10683, stack limit = 0x00000000125a3b1b)
[ 799.724686] CPU: 108 PID: 10683 Comm: bash Kdump: loaded Not tainted 4.19.36 #2
[ 799.731962] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.05 09/18/2019
[ 799.739325] pstate: 40400009 (nZcv daif +PAN -UAO)
[ 799.744104] pc : pci_remove_bus+0xc0/0x1c0
[ 799.748182] lr : pci_remove_bus+0x94/0x1c0
[ 799.752260] sp : ffffa02e335df940
[ 799.755560] x29: ffffa02e335df940 x28: ffff2000088216a8
[ 799.760849] x27: 1ffff405c66bbfbc x26: ffff20000a9518c0
[ 799.766139] x25: ffffa02dea6ec418 x24: 1ffff405bd4dd883
[ 799.771427] x23: ffffa02e72576628 x22: 1ffff405ce4aecc0
[ 799.776715] x21: ffffa02e72576608 x20: ffff200002e75080
[ 799.782003] x19: ffffa02e72576600 x18: 0000000000000000
[ 799.787291] x17: 0000000000000000 x16: 0000000000000000
[ 799.792578] x15: 0000000000000001 x14: dfff200000000000
[ 799.797866] x13: ffff20000a6dfaf0 x12: 0000000000000000
[ 799.803154] x11: 1fffe4000159b217 x10: ffff04000159b217
[ 799.808442] x9 : dfff200000000000 x8 : ffff20000acd90bf
[ 799.813730] x7 : 0000000000000000 x6 : 0000000000000000
[ 799.819017] x5 : 0000000000000001 x4 : 0000000000000000
[ 799.824306] x3 : 1ffff405dbe62603 x2 : 1fffe400005cea11
[ 799.829593] x1 : dfff200000000000 x0 : ffff200002e75088
[ 799.834882] Call trace:
[ 799.837323] pci_remove_bus+0xc0/0x1c0
[ 799.841056] pci_remove_bus_device+0xd0/0x2f0
[ 799.845392] pci_stop_and_remove_bus_device_locked+0x2c/0x40
[ 799.851028] remove_store+0x1b8/0x1d0
[ 799.854679] dev_attr_store+0x60/0x80
[ 799.858330] sysfs_kf_write+0x104/0x170
[ 799.862149] kernfs_fop_write+0x23c/0x430
[ 799.866143] __vfs_write+0xec/0x4e0
[ 799.869615] vfs_write+0x12c/0x3d0
[ 799.873001] ksys_write+0xd0/0x190
[ 799.876389] __arm64_sys_write+0x70/0xa0
[ 799.880298] el0_svc_common+0xfc/0x278
[ 799.884030] el0_svc_handler+0x50/0xc0
[ 799.887764] el0_svc+0x8/0xc
[ 799.890634] Code: d2c40001 f2fbffe1 91002280 d343fc02 (38e16841)
[ 799.896700] kernel fault(0x1) notification starting on CPU 108
This happens because, when a new bus is allocated during the rescan, the
'pci_ops' of the newly allocated 'pci_bus' is inherited from its parent
PCI bus. However, the 'pci_ops' of the parent bus may have been changed to
'aer_inj_pci_ops' in 'aer_inject()'. When we unload the module
'aer_inject', 'aer_inject_exit' only restores the 'pci_ops' for the PCI bus
of the error-injected device and the root port. After we have unloaded
the module, the 'pci_ops' of the newly allocated PCI bus is still
'aer_inj_pci_ops'. Accessing it then triggers the Oops.
This patch adds a member 'backup_ops' to 'struct pci_bus' to record the
original 'ops'. When we allocate a child PCI bus, we assign the
'backup_ops' of the parent bus to the 'ops' of the child bus.
Maybe the best way would be to not modify the 'pci_ops' in 'struct pci_bus'
at all, but that would require refactoring the 'aer_inject' framework a lot.
I haven't found a better way to handle it.
Signed-off-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
Reviewed-by: Hanjun Guo <guohanjun(a)huawei.com>
Conflicts:
include/linux/pci.h
Signed-off-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
---
drivers/pci/probe.c | 12 +++++++++---
include/linux/pci.h | 1 +
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 43159965e09e..1681a9f454f4 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -897,6 +897,7 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge)
bus->sysdata = bridge->sysdata;
bus->ops = bridge->ops;
+ bus->backup_ops = bus->ops;
bus->number = bus->busn_res.start = bridge->busnr;
#ifdef CONFIG_PCI_DOMAINS_GENERIC
if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
@@ -1098,10 +1099,15 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
child->bus_flags = parent->bus_flags;
host = pci_find_host_bridge(parent);
- if (host->child_ops)
+ if (host->child_ops) {
child->ops = host->child_ops;
- else
- child->ops = parent->ops;
+ } else {
+ if (parent->backup_ops)
+ child->ops = parent->backup_ops;
+ else
+ child->ops = parent->ops;
+ }
+ child->backup_ops = child->ops;
/*
* Initialize some portions of the bus device, but don't register
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b56417276042..9c8d2cddf465 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -657,6 +657,7 @@ struct pci_bus {
struct resource busn_res; /* Bus numbers routed to this bus */
struct pci_ops *ops; /* Configuration access functions */
+ struct pci_ops *backup_ops;
void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */
--
2.20.1
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I8XVM8
CVE: NA
------------------------------------
While running some aer-inject and sysfs remove stress tests, I got the
following use-after-free call trace:
==================================================================
BUG: KASAN: use-after-free in pci_stop_bus_device+0x174/0x178
Read of size 8 at addr fffffc3e2e402218 by task bash/26311
CPU: 38 PID: 26311 Comm: bash Tainted: G W 4.19.105+ #82
Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 CS V5.B161.01 06/10/2021
Call trace:
dump_backtrace+0x0/0x360
show_stack+0x24/0x30
dump_stack+0x130/0x164
print_address_description+0x68/0x278
kasan_report+0x204/0x330
__asan_report_load8_noabort+0x30/0x40
pci_stop_bus_device+0x174/0x178
pci_stop_and_remove_bus_device_locked+0x24/0x40
remove_store+0x1c8/0x1e0
dev_attr_store+0x60/0x80
sysfs_kf_write+0x104/0x170
kernfs_fop_write+0x23c/0x430
__vfs_write+0xec/0x4e0
vfs_write+0x12c/0x3d0
ksys_write+0xe8/0x208
__arm64_sys_write+0x70/0xa0
el0_svc_common+0x10c/0x450
el0_svc_handler+0x50/0xc0
el0_svc+0x10/0x14
Allocated by task 684:
kasan_kmalloc+0xe0/0x190
kmem_cache_alloc_trace+0x110/0x240
pci_alloc_dev+0x4c/0x110
pci_scan_single_device+0x100/0x218
pci_scan_slot+0x8c/0x2d8
pci_scan_child_bus_extend+0x90/0x628
pci_scan_child_bus+0x24/0x30
pci_scan_bridge_extend+0x3b8/0xb28
pci_scan_child_bus_extend+0x350/0x628
pci_rescan_bus+0x24/0x48
pcie_do_fatal_recovery+0x390/0x4b0
handle_error_source+0x124/0x158
aer_isr+0x5a0/0x800
process_one_work+0x598/0x1250
worker_thread+0x384/0xf08
kthread+0x2a4/0x320
ret_from_fork+0x10/0x18
Freed by task 685:
__kasan_slab_free+0x120/0x228
kasan_slab_free+0x10/0x18
kfree+0x88/0x218
pci_release_dev+0xb4/0xd8
device_release+0x6c/0x1c0
kobject_put+0x12c/0x400
put_device+0x24/0x30
pci_dev_put+0x24/0x30
handle_error_source+0x12c/0x158
aer_isr+0x5a0/0x800
process_one_work+0x598/0x1250
worker_thread+0x384/0xf08
kthread+0x2a4/0x320
ret_from_fork+0x10/0x18
The buggy address belongs to the object at fffffc3e2e402200
which belongs to the cache kmalloc-4096 of size 4096
The buggy address is located 24 bytes inside of
4096-byte region [fffffc3e2e402200, fffffc3e2e403200)
The buggy address belongs to the page:
page:ffff7ff0f8b90000 count:1 mapcount:0 mapping:ffffdc365f016e00 index:0x0 compound_mapcount: 0
flags: 0x6ffffe0000008100(slab|head)
raw: 6ffffe0000008100 ffff7f70d83aae00 0000000300000003 ffffdc365f016e00
raw: 0000000000000000 0000000080070007 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
fffffc3e2e402100: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
fffffc3e2e402180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>fffffc3e2e402200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
fffffc3e2e402280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
fffffc3e2e402300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
It is caused by the following race condition:
CPU0 CPU1
remove_store() aer_isr()
device_remove_file_self() handle_error_source()
pci_stop_and_remove_bus_device_locked pcie_do_fatal_recovery()
(blocked) pci_lock_rescan_remove() #CPU1 acquire the lock
pci_stop_and_remove_bus_device()
pci_unlock_rescan_remove() #CPU1 release the lock
pci_lock_rescan_remove() #CPU0 acquire the lock
pci_dev_put() #free pci_dev
pci_stop_and_remove_bus_device()
pci_stop_bus_device() #use-after-free
pci_unlock_rescan_remove()
An AER interrupt is triggered on CPU1, and CPU1 starts to process it. The
work item 'aer_isr()' is scheduled on CPU1. It calls into
pcie_do_fatal_recovery() and acquires the lock 'pci_rescan_remove_lock'.
Before it removes the sysfs entries corresponding to the faulty PCI device,
a sysfs remove operation is executed on CPU0. CPU0 uses
device_remove_file_self() to remove the sysfs attribute and then waits for
the lock to be released. After CPU1 finishes pci_stop_and_remove_bus_device(),
it releases the lock and frees the 'pci_dev' in pci_dev_put(). CPU0 then
acquires the lock and accesses the freed 'pci_dev', which triggers the
use-after-free.
To fix this issue, we increase the reference count in remove_store()
before removing the device and decrease it again at the end.
Signed-off-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
Reviewed-by: Hanjun Guo <guohanjun(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
Conflicts:
drivers/pci/pci-sysfs.c
Signed-off-by: Xiongfeng Wang <wangxiongfeng2(a)huawei.com>
---
drivers/pci/pci-sysfs.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 3317b9354716..e3373cdc5244 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -483,12 +483,17 @@ static ssize_t remove_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
unsigned long val;
+ struct pci_dev *pdev = to_pci_dev(dev);
if (kstrtoul(buf, 0, &val) < 0)
return -EINVAL;
- if (val && device_remove_file_self(dev, attr))
- pci_stop_and_remove_bus_device_locked(to_pci_dev(dev));
+ if (val) {
+ pci_dev_get(pdev);
+ if (device_remove_file_self(dev, attr))
+ pci_stop_and_remove_bus_device_locked(pdev);
+ pci_dev_put(pdev);
+ }
return count;
}
static DEVICE_ATTR_IGNORE_LOCKDEP(remove, 0220, NULL,
--
2.20.1
From: Dan Carpenter <dan.carpenter(a)linaro.org>
mainline inclusion
from mainline-v6.7-rc2
commit c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I8WQRG
CVE: CVE-2024-0607
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
---------------------------
The problem is in nft_byteorder_eval() where we are iterating through a
loop and writing to dst[0], dst[1], dst[2] and so on... On each
iteration we are writing 8 bytes. But dst[] is an array of u32 so each
element only has space for 4 bytes. That means that every iteration
overwrites part of the previous element.
I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter:
nf_tables: prevent OOB access in nft_byteorder_eval") which is a related
issue. I think that the reason we have not detected this bug in testing
is that most of the time we only write one element.
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion")
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Liu Jian <liujian56(a)huawei.com>
Conflicts:
include/net/netfilter/nf_tables.h
net/netfilter/nft_byteorder.c
net/netfilter/nft_meta.c
---
net/netfilter/nft_byteorder.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 46a8f894717c..ba65e1e8732b 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -41,19 +41,20 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned((u64 *)&src[i]);
- put_unaligned_be64(src64, &dst[i]);
+ put_unaligned_be64(src64, &dst64[i]);
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned_be64(&src[i]);
- put_unaligned(src64, (u64 *)&dst[i]);
+ put_unaligned(src64, (u64 *)&dst64[i]);
}
break;
}
--
2.34.1
From: Dan Carpenter <dan.carpenter(a)linaro.org>
mainline inclusion
from mainline-v6.7-rc2
commit c301f0981fdd3fd1ffac6836b423c4d7a8e0eb63
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I8WQRG
CVE: CVE-2024-0607
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
---------------------------
The problem is in nft_byteorder_eval() where we are iterating through a
loop and writing to dst[0], dst[1], dst[2] and so on... On each
iteration we are writing 8 bytes. But dst[] is an array of u32 so each
element only has space for 4 bytes. That means that every iteration
overwrites part of the previous element.
I spotted this bug while reviewing commit caf3ef7468f7 ("netfilter:
nf_tables: prevent OOB access in nft_byteorder_eval") which is a related
issue. I think that the reason we have not detected this bug in testing
is that most of the time we only write one element.
Fixes: ce1e7989d989 ("netfilter: nft_byteorder: provide 64bit le/be conversion")
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Liu Jian <liujian56(a)huawei.com>
Conflicts:
include/net/netfilter/nf_tables.h
net/netfilter/nft_byteorder.c
---
include/net/netfilter/nf_tables.h | 4 ++--
net/netfilter/nft_byteorder.c | 5 +++--
net/netfilter/nft_meta.c | 2 +-
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ea893a6d9b36..7550328080bf 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -132,9 +132,9 @@ static inline u16 nft_reg_load16(const u32 *sreg)
return *(u16 *)sreg;
}
-static inline void nft_reg_store64(u32 *dreg, u64 val)
+static inline void nft_reg_store64(u64 *dreg, u64 val)
{
- put_unaligned(val, (u64 *)dreg);
+ put_unaligned(val, dreg);
}
static inline u64 nft_reg_load64(const u32 *sreg)
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 7b0b8fecb220..9d250bd60bb8 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -38,20 +38,21 @@ void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->size) {
case 8: {
+ u64 *dst64 = (void *)dst;
u64 src64;
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
src64 = nft_reg_load64(&src[i]);
- nft_reg_store64(&dst[i], be64_to_cpu(src64));
+ nft_reg_store64(&dst64[i], be64_to_cpu(src64));
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = (__force __u64)
cpu_to_be64(nft_reg_load64(&src[i]));
- nft_reg_store64(&dst[i], src64);
+ nft_reg_store64(&dst64[i], src64);
}
break;
}
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 44d9b38e5f90..cb5bb0e21b66 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -63,7 +63,7 @@ nft_meta_get_eval_time(enum nft_meta_keys key,
{
switch (key) {
case NFT_META_TIME_NS:
- nft_reg_store64(dest, ktime_get_real_ns());
+ nft_reg_store64((u64 *)dest, ktime_get_real_ns());
break;
case NFT_META_TIME_DAY:
nft_reg_store8(dest, nft_meta_weekday());
--
2.34.1