From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I641XX CVE: NA
--------------------------------
Patch 1378a5ee451a ("mm: store compound_nr as well as compound_order") add a new member compound_nr in struct page, and use this new member insteal of compound_order in hugetlb_cgroup_move_parent() to compute the nr_pages.
In free_hugepage_to_hugetlb(), we reset page->mapping to NULL for each subpage. Since page->mapping and page->compound_nr is union, we reset page->compound_nr too unexpectly. This will finally result the nr_pages incorrect in hugetlb_cgroup_move_parent() and can't release hugetlb_cgroup.
Fix this problem by reset page->compound_nr using set_compound_order().
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Nanyong Sun sunnanyong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index eb9b528b73de..8a985d816c07 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -799,7 +799,8 @@ static int free_hugepage_to_hugetlb(struct dhugetlb_pool *hpool) p->mapping = NULL; } set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - + /* compound_nr and mapping are union in page, reset it. */ + set_compound_order(page, PUD_SHIFT - PAGE_SHIFT); nid = page_to_nid(page); SetHPageFreed(page); list_move(&page->lru, &h->hugepage_freelists[nid]);
From: Ard Biesheuvel ardb@kernel.org
mainline inclusion from mainline-v5.11 commit fe0af09074bfeb46a35357e67635eefe33cdfc49 category: bugfix bugzilla: 187402, https://gitee.com/openeuler/kernel/issues/I61CL6
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This reverts commit 32cf1a12cad43358e47dac8014379c2f33dfbed4.
The 'exisitng buffer' in this case is the firmware provided table, and we should not modify that in place. This fixes a crash on arm64 with initrd table overrides, in which case the DSDT is not mapped with read/write permissions.
Reported-by: Shawn Guo shawn.guo@linaro.org Signed-off-by: Ard Biesheuvel ardb@kernel.org Tested-by: Shawn Guo shawn.guo@linaro.org Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com
Conflicts: drivers/acpi/acpica/nsrepair2.c Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/acpi/acpica/nsrepair2.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index fb4f8f6209bd..125143c41bb8 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -491,8 +491,9 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, union acpi_operand_object **return_object_ptr) { union acpi_operand_object *return_object = *return_object_ptr; - char *dest; + union acpi_operand_object *new_string; char *source; + char *dest;
ACPI_FUNCTION_NAME(ns_repair_HID);
@@ -513,6 +514,13 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, return (AE_OK); }
+ /* It is simplest to always create a new string object */ + + new_string = acpi_ut_create_string_object(return_object->string.length); + if (!new_string) { + return (AE_NO_MEMORY); + } + /* * Remove a leading asterisk if present. For some unknown reason, there * are many machines in the field that contains IDs like this. @@ -522,7 +530,7 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, source = return_object->string.pointer; if (*source == '*') { source++; - return_object->string.length--; + new_string->string.length--;
ACPI_DEBUG_PRINT((ACPI_DB_REPAIR, "%s: Removed invalid leading asterisk\n", @@ -537,11 +545,12 @@ acpi_ns_repair_HID(struct acpi_evaluate_info *info, * "NNNN####" where N is an uppercase letter or decimal digit, and * # is a hex digit. */ - for (dest = return_object->string.pointer; *source; dest++, source++) { + for (dest = new_string->string.pointer; *source; dest++, source++) { *dest = (char)toupper((int)*source); } - return_object->string.pointer[return_object->string.length] = 0;
+ acpi_ut_remove_reference(return_object); + *return_object_ptr = new_string; return (AE_OK); }
From: Jiasheng Jiang jiasheng@iscas.ac.cn
mainline inclusion from mainline-v5.17-rc1 commit 2cea3ec5b0099d0e9dd6752aa86e08bce38d6b32 category: bugfix bugzilla: 187402, https://gitee.com/openeuler/kernel/issues/I61CL6
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Because devres_alloc() may fail, devm_ioremap() may return NULL.
Then, 'clk_data->base' will be assigned to clkdev->data->base in platform_device_register_data().
The PTR_ERR_OR_ZERO() check on clk_data does not cover 'base', so it is better to add an explicit check against NULL after updating it.
Fixes: 3f4ba94e3615 ("ACPI: APD: Add AMD misc clock handler support") Signed-off-by: Jiasheng Jiang jiasheng@iscas.ac.cn [ rjw: Changelog rewrite ] Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Signed-off-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/acpi/acpi_apd.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index f1841d1714be..c1f68fdb31bf 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -96,6 +96,8 @@ static int fch_misc_setup(struct apd_private_data *pdata) resource_size(rentry->res)); break; } + if (!clk_data->base) + return -ENOMEM;
acpi_dev_free_resource_list(&resource_list);
From: Michal Simek michal.simek@xilinx.com
mainline inclusion from mainline-v5.13-rc1 commit b991f8c3622c8c9d01a1ada382682a731932e651 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60OLE CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Right now the handling order depends on how entries are coming which is corresponding with order in DT. We have reached the case with DT overlays where conf and mux descriptions are exchanged which ends up in sequence that firmware has been asked to perform configuration before requesting the pin.
The patch is enforcing the order that pin is requested all the time first followed by pin configuration. This change will ensure that firmware gets requests in the right order.
Signed-off-by: Michal Simek michal.simek@xilinx.com Link: https://lore.kernel.org/r/cfbe01f791c2dd42a596cbda57e15599969b57aa.161536421... Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pinctrl/core.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 840000870d5a..48ee9d1622b5 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1258,13 +1258,34 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
p->state = NULL;
- /* Apply all the settings for the new state */ + /* Apply all the settings for the new state - pinmux first */ list_for_each_entry(setting, &state->settings, node) { switch (setting->type) { case PIN_MAP_TYPE_MUX_GROUP: ret = pinmux_enable_setting(setting); break; case PIN_MAP_TYPE_CONFIGS_PIN: + case PIN_MAP_TYPE_CONFIGS_GROUP: + break; + default: + ret = -EINVAL; + break; + } + + if (ret < 0) + goto unapply_new_state; + + /* Do not link hogs (circular dependency) */ + if (p != setting->pctldev->p) + pinctrl_link_add(setting->pctldev, p->dev); + } + + /* Apply all the settings for the new state - pinconf after */ + list_for_each_entry(setting, &state->settings, node) { + switch (setting->type) { + case PIN_MAP_TYPE_MUX_GROUP: + break; + case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP: ret = pinconf_apply_setting(setting); break;
From: Michal Simek michal.simek@xilinx.com
mainline inclusion from mainline-v5.13-rc1 commit 6a37d750037827d385672acdebf5788fc2ffa633 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I60OLE CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Static analyzer tool found that the ret variable is not initialized but code expects ret value >=0 when pinconf is skipped in the first pinmux loop. The same expectation is for pinmux in a pinconf loop. That's why initialize ret to 0 to avoid uninitialized ret value in first loop or reusing ret value from first loop in second.
Addresses-Coverity: ("Uninitialized variables") Signed-off-by: Michal Simek michal.simek@xilinx.com Cc: Dan Carpenter dan.carpenter@oracle.com Reviewed-by: Colin Ian King colin.king@canonical.com Link: https://lore.kernel.org/r/e5203bae68eb94b4b8b4e67e5e7b4d86bb989724.161553429... Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Yuyao Lin linyuyao1@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/pinctrl/core.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 48ee9d1622b5..a3097f0075aa 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1266,6 +1266,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state) break; case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP: + ret = 0; break; default: ret = -EINVAL; @@ -1284,6 +1285,7 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state) list_for_each_entry(setting, &state->settings, node) { switch (setting->type) { case PIN_MAP_TYPE_MUX_GROUP: + ret = 0; break; case PIN_MAP_TYPE_CONFIGS_PIN: case PIN_MAP_TYPE_CONFIGS_GROUP:
From: Li Lingfeng lilingfeng3@huawei.com
hulk inclusion category: performance bugzilla: https://gitee.com/openeuler/kernel/issues/I65DCK CVE: NA
-------------------------------
This reverts commit 70779878cbc7bae8e43f7e8446a33a3c2b315f90.
There's no evidence that buffer_uptodate and set_buffer_uptodate are unreliable without barriers. What's more, this patch result in the performance deterioration.
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/buffer_head.h | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 20a2ff1c07a1..6b47f94378c5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -117,6 +117,7 @@ static __always_inline int test_clear_buffer_##name(struct buffer_head *bh) \ * of the form "mark_buffer_foo()". These are higher-level functions which * do something in addition to setting a b_state bit. */ +BUFFER_FNS(Uptodate, uptodate) BUFFER_FNS(Dirty, dirty) TAS_BUFFER_FNS(Dirty, dirty) BUFFER_FNS(Lock, locked) @@ -134,30 +135,6 @@ BUFFER_FNS(Meta, meta) BUFFER_FNS(Prio, prio) BUFFER_FNS(Defer_Completion, defer_completion)
-static __always_inline void set_buffer_uptodate(struct buffer_head *bh) -{ - /* - * make it consistent with folio_mark_uptodate - * pairs with smp_load_acquire in buffer_uptodate - */ - smp_mb__before_atomic(); - set_bit(BH_Uptodate, &bh->b_state); -} - -static __always_inline void clear_buffer_uptodate(struct buffer_head *bh) -{ - clear_bit(BH_Uptodate, &bh->b_state); -} - -static __always_inline int buffer_uptodate(const struct buffer_head *bh) -{ - /* - * make it consistent with folio_test_uptodate - * pairs with smp_mb__before_atomic in set_buffer_uptodate - */ - return (smp_load_acquire(&bh->b_state) & (1UL << BH_Uptodate)) != 0; -} - #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
/* If we *know* page->private refers to buffer_heads */
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
Writing bandwidth control value to /sys/fs/resctrl/schemata may return an incorrect value.
For instance:
$ cat /sys/fs/resctrl/schemata L3:0=7fff;1=7fff;2=7fff;3=7fff MB:0=100;1=100;2=100;3=100 $ echo 'MB:0=20' > /sys/fs/resctrl/schemata $ cat /sys/fs/resctrl/schemata L3:0=7fff;1=7fff;2=7fff;3=7fff MB:0=18;1=100;2=100;3=100
Assuming that bwa_wd(given in MPAMF_MBW_IDR.BWA_WD) is greater than or equal to 6 (fraction is 64), we can divide mbw_max/min with a granularity of at least 2 in the range between 0 to 100, if we calculate mbw_max/min with this formula1: mbw_max/min = (Input * range) / Scale
Then we turn to ask for Input with this formula2: Input = (mbw_max/min * Scale) / range
But if calculated deviation is too large in formula1, we can no longer use formula2 to get our original value at a granularity of 2. Therefore, we need to correct the calculation result of formula1.
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/mpam/mpam_resctrl.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index e9e77064bdb2..d00746c08922 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -2280,6 +2280,8 @@ mpam_update_from_resctrl_cfg(struct mpam_resctrl_res *res, case QOS_MBA_MAX_EVENT_ID: range = MBW_MAX_BWA_FRACT(res->class->bwa_wd); mpam_cfg->mbw_max = (resctrl_cfg * range) / (MAX_MBA_BW - 1); + /* correct mbw_max if remainder is too large */ + mpam_cfg->mbw_max += ((resctrl_cfg * range) % (MAX_MBA_BW - 1)) / range; mpam_cfg->mbw_max = (mpam_cfg->mbw_max > range) ? range : mpam_cfg->mbw_max; mpam_set_feature(mpam_feat_mbw_max, &mpam_cfg->valid); @@ -2287,6 +2289,8 @@ mpam_update_from_resctrl_cfg(struct mpam_resctrl_res *res, case QOS_MBA_MIN_EVENT_ID: range = MBW_MAX_BWA_FRACT(res->class->bwa_wd); mpam_cfg->mbw_min = (resctrl_cfg * range) / (MAX_MBA_BW - 1); + /* correct mbw_min if remainder is too large */ + mpam_cfg->mbw_min += ((resctrl_cfg * range) % (MAX_MBA_BW - 1)) / range; mpam_cfg->mbw_min = (mpam_cfg->mbw_min > range) ? range : mpam_cfg->mbw_min; mpam_set_feature(mpam_feat_mbw_min, &mpam_cfg->valid);
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
mpam_resctrl_setup_domain() increase dom_num when domain goes online. As a result, mpam_resctrl_cpu_offline() should decrease dom_num when domain goes offline.
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/mpam/mpam_setup.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/mpam/mpam_setup.c b/arch/arm64/kernel/mpam/mpam_setup.c index d30910e0cda2..6e71c99d19b0 100644 --- a/arch/arm64/kernel/mpam/mpam_setup.c +++ b/arch/arm64/kernel/mpam/mpam_setup.c @@ -172,6 +172,8 @@ int mpam_resctrl_cpu_offline(unsigned int cpu) list_del(&d->list); dom = container_of(d, struct mpam_resctrl_dom, resctrl_dom); kfree(dom); + + res->resctrl_res.dom_num--; }
mpam_resctrl_clear_default_cpu(cpu);
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
This fixes indent format error in resctrl_parse_param(): fs/resctrlfs.c:616 resctrl_parse_param() warn: ignoring unreachable code. fs/resctrlfs.c:616 resctrl_parse_param() warn: inconsistent indenting fs/resctrlfs.c:619 resctrl_parse_param() warn: inconsistent indenting
Fixes: 100e2317e9b9 ("arm64/mpam: Use fs_context to parse mount options") Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/resctrlfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index 8956237de47f..c0a84f40dcc0 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -622,12 +622,11 @@ static int resctrl_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_caPrio: ctx->enable_caPrio = true; return 0; + default: + break; + }
- return 0; -} - -return -EINVAL; - + return -EINVAL; }
static void resctrl_fs_context_free(struct fs_context *fc)
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
Do not allow min_bw below the granularity with adjusting mbw_max/min, and with setting mbw_max/min less than min_bw, return 'Invalid argument' directly.
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/mpam/mpam_resctrl.c | 8 ++------ arch/arm64/kernel/mpam/mpam_setup.c | 3 +++ 2 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index d00746c08922..f9a360a4c718 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -338,15 +338,11 @@ parse_bw(char *buf, struct resctrl_resource *r, switch (rr->ctrl_features[type].evt) { case QOS_MBA_MAX_EVENT_ID: case QOS_MBA_PBM_EVENT_ID: - if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) - return -EINVAL; - data = (data < r->mbw.min_bw) ? r->mbw.min_bw : data; - data = roundup(data, r->mbw.bw_gran); - break; case QOS_MBA_MIN_EVENT_ID: if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) return -EINVAL; - /* for mbw min feature, 0 of setting is allowed */ + if (data < r->mbw.min_bw) + return -EINVAL; data = roundup(data, r->mbw.bw_gran); break; default: diff --git a/arch/arm64/kernel/mpam/mpam_setup.c b/arch/arm64/kernel/mpam/mpam_setup.c index 6e71c99d19b0..cef66a5e6f2f 100644 --- a/arch/arm64/kernel/mpam/mpam_setup.c +++ b/arch/arm64/kernel/mpam/mpam_setup.c @@ -419,6 +419,9 @@ static int mpam_resctrl_resource_init(struct mpam_resctrl_res *res) * of 1 would appear too fine to make percentage conversions. */ r->mbw.bw_gran = GRAN_MBA_BW; + /* do not allow mbw_max/min below mbw.bw_gran */ + if (r->mbw.min_bw < r->mbw.bw_gran) + r->mbw.min_bw = r->mbw.bw_gran;
/* We will only pick a class that can monitor and control */ r->alloc_capable = true;
From: Wang ShaoBo bobo.shaobowang@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
Refer to the two commits:
commit fd8d9db3559a ("x86/resctrl: Remove superfluous kernfs_get() calls to prevent refcount leak")
commit 758999246965 ("x86/resctrl: Add necessary kernfs_put() calls to prevent refcount leak")
there are some places where a kernfs_node reference is obtained without a corresponding release. The excessive amount of reference count on kernfs nodes will never be dropped to 0 and the kernfs nodes will never be freed in the call paths of rmdir and umount. It leads to reference count leak and kernfs_node_cache memory leak.
For example, reference count leak is observed in these two cases:
(1) mount -t resctrl none /sys/fs/resctrl mkdir /sys/fs/resctrl/c1 mkdir /sys/fs/resctrl/c1/mon_groups/m1 umount /sys/fs/resctrl
(2) mkdir /sys/fs/resctrl/c1 mkdir /sys/fs/resctrl/c1/mon_groups/m1 rmdir /sys/fs/resctrl/c1
Superfluous kernfs_get() calls are removed from two areas:
(1) In call paths of mount and mkdir, when kernfs nodes for "info", "mon_groups" and "mon_data" directories and sub-directories are created, the reference count of newly created kernfs node is set to 1. But after kernfs_create_dir() returns, superfluous kernfs_get() are called to take an additional reference.
(2) kernfs_get() calls in rmdir call paths.
Necessary kernfs_put() calls are added by following changes:
(1) Introduce rdtgroup removal helper rdtgroup_remove() to wrap up kernfs_put() and kfree().
(2) Call rdtgroup_remove() in rdtgroup removal path where the rdtgroup structure is about to be freed by kfree().
(3) Call rdtgroup_remove() or kernfs_put() as appropriate in the error exit paths of mkdir where an extra reference is taken by kernfs_get().
Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/include/asm/resctrl.h | 18 ++++++++++++++++ arch/arm64/kernel/mpam/mpam_ctrlmon.c | 8 ------- arch/arm64/kernel/mpam/mpam_resctrl.c | 2 +- fs/resctrlfs.c | 31 ++++++--------------------- 4 files changed, 25 insertions(+), 34 deletions(-)
diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h index 1175c3515c92..d3bdb43b662f 100644 --- a/arch/arm64/include/asm/resctrl.h +++ b/arch/arm64/include/asm/resctrl.h @@ -545,5 +545,23 @@ DEFINE_INLINE_CTRL_FEATURE_ENABLE_FUNC(caPbm); DEFINE_INLINE_CTRL_FEATURE_ENABLE_FUNC(caMax); DEFINE_INLINE_CTRL_FEATURE_ENABLE_FUNC(caPrio);
+/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static inline void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + #endif #endif /* _ASM_ARM64_RESCTRL_H */ diff --git a/arch/arm64/kernel/mpam/mpam_ctrlmon.c b/arch/arm64/kernel/mpam/mpam_ctrlmon.c index 724bed6a8e2c..8abbf2823269 100644 --- a/arch/arm64/kernel/mpam/mpam_ctrlmon.c +++ b/arch/arm64/kernel/mpam/mpam_ctrlmon.c @@ -804,7 +804,6 @@ static int resctrl_group_mkdir_info_resdir(struct resctrl_resource *r, if (IS_ERR(kn_subdir)) return PTR_ERR(kn_subdir);
- kernfs_get(kn_subdir); ret = resctrl_group_kn_set_ugid(kn_subdir); if (ret) return ret; @@ -830,7 +829,6 @@ int resctrl_group_create_info_dir(struct kernfs_node *parent_kn, *kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); if (IS_ERR(*kn_info)) return PTR_ERR(*kn_info); - kernfs_get(*kn_info);
ret = resctrl_group_add_files(*kn_info, RF_TOP_INFO); if (ret) @@ -865,12 +863,6 @@ int resctrl_group_create_info_dir(struct kernfs_node *parent_kn, } }
- /* - m This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. - */ - kernfs_get(*kn_info); - ret = resctrl_group_kn_set_ugid(*kn_info); if (ret) goto out_destroy; diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index f9a360a4c718..7370f4dcecce 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -1331,7 +1331,7 @@ static void move_myself(struct callback_head *head) (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; current->rmid = 0; - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); }
preempt_disable(); diff --git a/fs/resctrlfs.c b/fs/resctrlfs.c index c0a84f40dcc0..e02e4769edc0 100644 --- a/fs/resctrlfs.c +++ b/fs/resctrlfs.c @@ -211,8 +211,7 @@ void resctrl_group_kn_unlock(struct kernfs_node *kn) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { kernfs_unbreak_active_protection(kn); - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } else { kernfs_unbreak_active_protection(kn); } @@ -272,12 +271,6 @@ mongroup_create_dir(struct kernfs_node *parent_kn, struct resctrl_group *prgrp, if (dest_kn) *dest_kn = kn;
- /* - * This extra ref will be put in kernfs_remove() and guarantees - * that @rdtgrp->kn is always accessible. - */ - kernfs_get(kn); - ret = resctrl_group_kn_set_ugid(kn); if (ret) goto out_destroy; @@ -399,8 +392,6 @@ static int resctrl_get_tree(struct fs_context *fc) if (ret) goto out_info;
- kernfs_get(kn_mongrp); - ret = mkdir_mondata_all_prepare(&resctrl_group_default); if (ret < 0) goto out_mongrp; @@ -410,7 +401,6 @@ static int resctrl_get_tree(struct fs_context *fc) if (ret) goto out_mongrp;
- kernfs_get(kn_mondata); resctrl_group_default.mon.mon_data_kn = kn_mondata; }
@@ -495,7 +485,7 @@ static void free_all_child_rdtgrp(struct resctrl_group *rdtgrp) /* rmid may not be used */ rmid_free(sentry->mon.rmid); list_del(&sentry->mon.crdtgrp_list); - kfree(sentry); + rdtgroup_remove(sentry); } }
@@ -529,7 +519,7 @@ static void rmdir_all_sub(void)
kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->resctrl_group_list); - kfree(rdtgrp); + rdtgroup_remove(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ update_closid_rmid(cpu_online_mask, &resctrl_group_default); @@ -775,7 +765,7 @@ static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn, * kernfs_remove() will drop the reference count on "kn" which * will free it. But we still need it to stick around for the * resctrl_group_kn_unlock(kn} call below. Take one extra reference - * here, which will be dropped inside resctrl_group_kn_unlock(). + * here, which will be dropped inside rdtgroup_remove(). */ kernfs_get(kn);
@@ -815,6 +805,7 @@ static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn, out_prepare_clean: mkdir_mondata_all_prepare_clean(rdtgrp); out_destroy: + kernfs_put(rdtgrp->kn); kernfs_remove(rdtgrp->kn); out_free_rmid: rmid_free(rdtgrp->mon.rmid); @@ -831,7 +822,7 @@ static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn, static void mkdir_resctrl_prepare_clean(struct resctrl_group *rgrp) { kernfs_remove(rgrp->kn); - kfree(rgrp); + rdtgroup_remove(rgrp); }
/* @@ -996,11 +987,6 @@ static int resctrl_group_rmdir_mon(struct kernfs_node *kn, struct resctrl_group { resctrl_group_rm_mon(rdtgrp, tmpmask);
- /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in resctrl_group_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn);
return 0; @@ -1049,11 +1035,6 @@ static int resctrl_group_rmdir_ctrl(struct kernfs_node *kn, struct resctrl_group { resctrl_group_rm_ctrl(rdtgrp, tmpmask);
- /* - * one extra hold on this, will drop when we kfree(rdtgrp) - * in resctrl_group_kn_unlock() - */ - kernfs_get(kn); kernfs_remove(rdtgrp->kn);
return 0;
From: Jialin Zhang zhangjialin11@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CPK CVE: NA
--------------------------------
Update last_cmd_status to tell the reasons of returning 'Invalid argument' in parse_cache() and parse_bw().
Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/mpam/mpam_resctrl.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 7370f4dcecce..4a1e376ec497 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -310,11 +310,15 @@ parse_cache(char *buf, struct resctrl_resource *r, return -EINVAL; }
- if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) + if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); return -EINVAL; + }
- if (data >= rr->ctrl_features[type].max_wd) + if (data >= rr->ctrl_features[type].max_wd) { + rdt_last_cmd_puts("Mask out of range\n"); return -EINVAL; + }
cfg->new_ctrl[type] = data; cfg->have_new_ctrl = true; @@ -339,20 +343,30 @@ parse_bw(char *buf, struct resctrl_resource *r, case QOS_MBA_MAX_EVENT_ID: case QOS_MBA_PBM_EVENT_ID: case QOS_MBA_MIN_EVENT_ID: - if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) + if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); return -EINVAL; - if (data < r->mbw.min_bw) + } + if (data < r->mbw.min_bw) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", data, + r->mbw.min_bw, rr->ctrl_features[type].max_wd - 1); return -EINVAL; + } data = roundup(data, r->mbw.bw_gran); break; default: - if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) + if (kstrtoul(buf, rr->ctrl_features[type].base, &data)) { + rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); return -EINVAL; + } break; }
- if (data >= rr->ctrl_features[type].max_wd) + if (data >= rr->ctrl_features[type].max_wd) { + rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", data, + r->mbw.min_bw, rr->ctrl_features[type].max_wd - 1); return -EINVAL; + }
cfg->new_ctrl[type] = data; cfg->have_new_ctrl = true;
From: Peter Zijlstra peterz@infradead.org
mainline inclusion from mainline-v5.11-rc1 commit 55d2eba8e7cd439c11cdb204898c2d384227629b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CQ3
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------------------------
When the static_key is part of the module, and the module calls static_key_inc/enable() from it's __init section *AND* has a static_branch_*() user in that very same __init section, things go wobbly.
If the static_key lives outside the module, jump_label_add_module() would append this module's sites to the key and jump_label_update() would take the static_key_linked() branch and all would be fine.
If all the sites are outside of __init, then everything will be fine too.
However, when all is aligned just as described above, jump_label_update() calls __jump_label_update(.init = false) and we'll not update sites in __init text.
Fixes: 19483677684b ("jump_label: Annotate entries that operate on __init code earlier") Reported-by: Dexuan Cui decui@microsoft.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Josh Poimboeuf jpoimboe@redhat.com Tested-by: Jessica Yu jeyu@kernel.org Link: https://lkml.kernel.org/r/20201216135435.GV3092@hirez.programming.kicks-ass.... Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/jump_label.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7470cdc432a0..600dbdab24e7 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -823,6 +823,7 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; + bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; @@ -834,15 +835,16 @@ static void jump_label_update(struct static_key *key)
preempt_disable(); mod = __module_address((unsigned long)key); - if (mod) + if (mod) { stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, - system_state < SYSTEM_RUNNING); + __jump_label_update(key, entry, stop, init); }
#ifdef CONFIG_STATIC_KEYS_SELFTEST
From: Zhouyi Zhou zhouzhouyi@gmail.com
mainline inclusion from mainline-v5.12 commit 0c89d87d1d43d9fa268d1dc489518564d58bf497 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CQ3
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------------------------
Commit 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() static call") tried to provide irqentry_exit_cond_resched() static call in irqentry_exit, but has a typo in macro conditional statement.
Fixes: 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() static call") Signed-off-by: Zhouyi Zhou zhouzhouyi@gmail.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210410073523.5493-1-zhouzhouyi@gmail.com Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index cea3957ebdbc..2228de39bb4f 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -394,7 +394,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
instrumentation_begin(); if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMT_DYNAMIC +#ifdef CONFIG_PREEMPT_DYNAMIC static_call(irqentry_exit_cond_resched)(); #else irqentry_exit_cond_resched();
From: Johan Hovold johan@kernel.org
mainline inclusion from mainline-v5.15-rc6 commit 57116ce17b04fde2fe30f0859df69d8dbe5809f6 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I61CQ3
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
-------------------------------------------------
Console drivers often queue work while holding locks also taken in their console write paths, something which can lead to deadlocks on SMP when dumping workqueue state (e.g. sysrq-t or on suspend failures).
For serial console drivers this could look like:
CPU0 CPU1 ---- ----
show_workqueue_state(); lock(&pool->lock); <IRQ> lock(&port->lock); schedule_work(); lock(&pool->lock); printk(); lock(console_owner); lock(&port->lock);
where workqueues are, for example, used to push data to the line discipline, process break signals and handle modem-status changes. Line disciplines and serdev drivers can also queue work on write-wakeup notifications, etc.
Reworking every console driver to avoid queuing work while holding locks also taken in their write paths would complicate drivers and is neither desirable or feasible.
Instead use the deferred-printk mechanism to avoid printing while holding pool locks when dumping workqueue state.
Note that there are a few WARN_ON() assertions in the workqueue code which could potentially also trigger a deadlock. Hopefully the ongoing printk rework will provide a general solution for this eventually.
This was originally reported after a lockdep splat when executing sysrq-t with the imx serial driver.
Fixes: 3494fc30846d ("workqueue: dump workqueues on sysrq-t") Cc: stable@vger.kernel.org # 4.0 Reported-by: Fabio Estevam festevam@denx.de Tested-by: Fabio Estevam festevam@denx.de Signed-off-by: Johan Hovold johan@kernel.org Reviewed-by: John Ogness john.ogness@linutronix.de Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Wang ShaoBo bobo.shaobowang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/workqueue.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 14d4c072c79b..a83808c20e43 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4809,8 +4809,16 @@ void show_workqueue_state(void)
for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_safe_enter(); show_pwq(pwq); + printk_safe_exit(); + } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. @@ -4828,7 +4836,12 @@ void show_workqueue_state(void) raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; - + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_safe_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4843,6 +4856,7 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); + printk_safe_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /*
From: Lu Wei luwei32@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I62TLE CVE: NA
--------------------------------
This reverts commit 58d6be873c3eedf7a50c317fe0e3b3632b8e58f0.
In order to make iproute compatible with other operation system like SUSE, rethad and etc, put IPVLAN_MODE_L2E last.
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_main.c | 2 +- include/uapi/linux/if_link.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 4e0fa42c18d0..779a84f70914 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -14,7 +14,7 @@ int sysctl_ipvlan_loop_qlen = 131072; int sysctl_ipvlan_loop_delay = 10; static int ipvlan_default_mode = IPVLAN_MODE_L3; module_param(ipvlan_default_mode, int, 0400); -MODULE_PARM_DESC(ipvlan_default_mode, "set ipvlan default mode: 0 for l2, 1 for l3, 2 for l2e, 3 for l3s, others invalid now"); +MODULE_PARM_DESC(ipvlan_default_mode, "set ipvlan default mode: 0 for l2, 1 for l3, 2 for l3s, 3 for l2e, others invalid now");
static struct ctl_table_header *ipvlan_table_hrd; static struct ctl_table ipvlan_table[] = { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0507ecc7275a..50d4705e1cbc 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -689,8 +689,8 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, - IPVLAN_MODE_L2E, IPVLAN_MODE_L3S, + IPVLAN_MODE_L2E, IPVLAN_MODE_MAX };
From: Zhihao Cheng chengzhihao1@huawei.com
mainline inclusion from mainline-next commit 8111964f1b8524c4bb56b02cd9c7a37725ea21fd category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I65I9A CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/d...
-------------------------------
Following concurrent processes:
P1(drop cache) P2(kworker) drop_caches_sysctl_handler drop_slab shrink_slab down_read(&shrinker_rwsem) - LOCK A do_shrink_slab super_cache_scan prune_icache_sb dispose_list evict ext4_evict_inode ext4_clear_inode ext4_discard_preallocations ext4_mb_load_buddy_gfp ext4_mb_init_cache ext4_read_block_bitmap_nowait ext4_read_bh_nowait submit_bh dm_submit_bio do_worker process_deferred_bios commit metadata_operation_failed dm_pool_abort_metadata down_write(&pmd->root_lock) - LOCK B __destroy_persistent_data_objects dm_block_manager_destroy dm_bufio_client_destroy unregister_shrinker down_write(&shrinker_rwsem) thin_map | dm_thin_find_block ↓ down_read(&pmd->root_lock) --> ABBA deadlock
, which triggers hung task:
[ 76.974820] INFO: task kworker/u4:3:63 blocked for more than 15 seconds. [ 76.976019] Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910 [ 76.978521] task:kworker/u4:3 state:D stack:0 pid:63 ppid:2 [ 76.978534] Workqueue: dm-thin do_worker [ 76.978552] Call Trace: [ 76.978564] __schedule+0x6ba/0x10f0 [ 76.978582] schedule+0x9d/0x1e0 [ 76.978588] rwsem_down_write_slowpath+0x587/0xdf0 [ 76.978600] down_write+0xec/0x110 [ 76.978607] unregister_shrinker+0x2c/0xf0 [ 76.978616] dm_bufio_client_destroy+0x116/0x3d0 [ 76.978625] dm_block_manager_destroy+0x19/0x40 [ 76.978629] __destroy_persistent_data_objects+0x5e/0x70 [ 76.978636] dm_pool_abort_metadata+0x8e/0x100 [ 76.978643] metadata_operation_failed+0x86/0x110 [ 76.978649] commit+0x6a/0x230 [ 76.978655] do_worker+0xc6e/0xd90 [ 76.978702] process_one_work+0x269/0x630 [ 76.978714] worker_thread+0x266/0x630 [ 76.978730] kthread+0x151/0x1b0 [ 76.978772] INFO: task test.sh:2646 blocked for more than 15 seconds. [ 76.979756] Not tainted 6.1.0-rc4-00011-g8f17dd350364-dirty #910 [ 76.982111] task:test.sh state:D stack:0 pid:2646 ppid:2459 [ 76.982128] Call Trace: [ 76.982139] __schedule+0x6ba/0x10f0 [ 76.982155] schedule+0x9d/0x1e0 [ 76.982159] rwsem_down_read_slowpath+0x4f4/0x910 [ 76.982173] down_read+0x84/0x170 [ 76.982177] dm_thin_find_block+0x4c/0xd0 [ 76.982183] thin_map+0x201/0x3d0 [ 76.982188] __map_bio+0x5b/0x350 [ 76.982195] dm_submit_bio+0x2b6/0x930 [ 76.982202] __submit_bio+0x123/0x2d0 [ 76.982209] submit_bio_noacct_nocheck+0x101/0x3e0 [ 76.982222] submit_bio_noacct+0x389/0x770 [ 76.982227] submit_bio+0x50/0xc0 [ 76.982232] submit_bh_wbc+0x15e/0x230 [ 76.982238] submit_bh+0x14/0x20 [ 76.982241] ext4_read_bh_nowait+0xc5/0x130 [ 76.982247] ext4_read_block_bitmap_nowait+0x340/0xc60 [ 76.982254] ext4_mb_init_cache+0x1ce/0xdc0 [ 76.982259] ext4_mb_load_buddy_gfp+0x987/0xfa0 [ 76.982263] ext4_discard_preallocations+0x45d/0x830 [ 76.982274] ext4_clear_inode+0x48/0xf0 [ 76.982280] ext4_evict_inode+0xcf/0xc70 [ 76.982285] evict+0x119/0x2b0 [ 76.982290] dispose_list+0x43/0xa0 [ 76.982294] prune_icache_sb+0x64/0x90 [ 76.982298] super_cache_scan+0x155/0x210 [ 76.982303] do_shrink_slab+0x19e/0x4e0 [ 76.982310] shrink_slab+0x2bd/0x450 [ 76.982317] drop_slab+0xcc/0x1a0 [ 76.982323] drop_caches_sysctl_handler+0xb7/0xe0 [ 76.982327] proc_sys_call_handler+0x1bc/0x300 [ 76.982331] proc_sys_write+0x17/0x20 [ 76.982334] vfs_write+0x3d3/0x570 [ 76.982342] ksys_write+0x73/0x160 [ 76.982347] __x64_sys_write+0x1e/0x30 [ 76.982352] do_syscall_64+0x35/0x80 [ 76.982357] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Function metadata_operation_failed() is called when operations failed on dm pool metadata, dm pool will destroy and recreate metadata. So, shrinker will be unregistered and registered, which could down write shrinker_rwsem under pmd_write_lock.
Fix it by allocating dm_block_manager before locking pmd->root_lock and destroying old dm_block_manager after unlocking pmd->root_lock, then old dm_block_manager is replaced with new dm_block_manager under pmd->root_lock. So, shrinker register/unregister could be done without holding pmd->root_lock.
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216676 Fixes: e49e582965b3 ("dm thin: add read only and fail io modes") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-thin-metadata.c | 51 +++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 8 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 842d79e5ea3a..d1e0ae06f04d 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -753,13 +753,15 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f return r; }
-static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd) +static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd, + bool destroy_bm) { dm_sm_destroy(pmd->data_sm); dm_sm_destroy(pmd->metadata_sm); dm_tm_destroy(pmd->nb_tm); dm_tm_destroy(pmd->tm); - dm_block_manager_destroy(pmd->bm); + if (destroy_bm) + dm_block_manager_destroy(pmd->bm); }
static int __begin_transaction(struct dm_pool_metadata *pmd) @@ -966,7 +968,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) } pmd_write_unlock(pmd); if (!pmd->fail_io) - __destroy_persistent_data_objects(pmd); + __destroy_persistent_data_objects(pmd, true);
kfree(pmd); return 0; @@ -1873,19 +1875,52 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd) int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) { int r = -EINVAL; + struct dm_block_manager *old_bm = NULL, *new_bm = NULL; + + /* fail_io is double-checked with pmd->root_lock held below */ + if (unlikely(pmd->fail_io)) + return r; + + /* + * Replacement block manager (new_bm) is created and old_bm destroyed outside of + * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of + * shrinker associated with the block manager's bufio client vs pmd root_lock). + * - must take shrinker_rwsem without holding pmd->root_lock + */ + new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + THIN_MAX_CONCURRENT_LOCKS);
pmd_write_lock(pmd); - if (pmd->fail_io) + if (pmd->fail_io) { + pmd_write_unlock(pmd); goto out; + }
__set_abort_with_changes_flags(pmd); - __destroy_persistent_data_objects(pmd); - r = __create_persistent_data_objects(pmd, false); + __destroy_persistent_data_objects(pmd, false); + old_bm = pmd->bm; + if (IS_ERR(new_bm)) { + DMERR("could not create block manager during abort"); + pmd->bm = NULL; + r = PTR_ERR(new_bm); + goto out_unlock; + } + + pmd->bm = new_bm; + r = __open_or_format_metadata(pmd, false); + if (r) { + pmd->bm = NULL; + goto out_unlock; + } + new_bm = NULL; +out_unlock: if (r) pmd->fail_io = true; - -out: pmd_write_unlock(pmd); + dm_block_manager_destroy(old_bm); +out: + if (new_bm && !IS_ERR(new_bm)) + dm_block_manager_destroy(new_bm);
return r; }
From: ruanjinjie ruanjinjie@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I65T0J?from=project-issue CVE: NA
-------------------------------
In emulation_proc_handler(), read and write operations are performed on insn->current_mode. In the concurrency scenario, mutex only protects writing insn->current_mode, and not protects the read. Suppose there are two concurrent tasks, task1 updates insn->current_mode to INSN_EMULATE in the critical section, the prev_mode of task2 is still the old data INSN_UNDEF of insn->current_mode. As a result, two tasks call update_insn_emulation_mode twice with prev_mode = INSN_UNDEF and current_mode = INSN_EMULATE, then call register_emulation_hooks twice, resulting in a list_add double problem.
Call trace: __list_add_valid+0xd8/0xe4 register_undef_hook+0x94/0x13c update_insn_emulation_mode+0xd0/0x12c emulation_proc_handler+0xd8/0xf4 proc_sys_call_handler+0x140/0x250 proc_sys_write+0x1c/0x2c new_sync_write+0xec/0x18c vfs_write+0x214/0x2ac ksys_write+0x70/0xfc __arm64_sys_write+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180
Fixes: f5cd5dd83d6d ("arm64: fix oops in concurrently setting insn_emulation sysctls") Signed-off-by: ruanjinjie ruanjinjie@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/armv8_deprecated.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 4a0ba2800e45..ab955c249f37 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -208,10 +208,12 @@ static int emulation_proc_handler(struct ctl_table *table, int write, loff_t *ppos) { int ret = 0; - struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode); - enum insn_emulation_mode prev_mode = insn->current_mode; + struct insn_emulation *insn; + enum insn_emulation_mode prev_mode;
mutex_lock(&insn_emulation_mutex); + insn = container_of(table->data, struct insn_emulation, current_mode); + prev_mode = insn->current_mode; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write || prev_mode == insn->current_mode)
From: Hui Tang tanghui20@huawei.com
hulk inclusion category: bugfix bugzilla: 187419, https://gitee.com/openeuler/kernel/issues/I612FD
-------------------------------
do_el0_svc+0x50/0x11c arch/arm64/kernel/syscall.c:217 el0_svc+0x20/0x30 arch/arm64/kernel/entry-common.c:353 el0_sync_handler+0xe4/0x1e0 arch/arm64/kernel/entry-common.c:369 el0_sync+0x148/0x180 arch/arm64/kernel/entry.S:683
================================================================== BUG: KASAN: null-ptr-deref in rq_of kernel/sched/sched.h:1118 [inline] BUG: KASAN: null-ptr-deref in unthrottle_qos_sched_group kernel/sched/fair.c:7619 [inline] BUG: KASAN: null-ptr-deref in free_fair_sched_group+0x124/0x320 kernel/sched/fair.c:12131 Read of size 8 at addr 0000000000000130 by task syz-executor100/223
CPU: 3 PID: 223 Comm: syz-executor100 Not tainted 5.10.0 #6 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x40c arch/arm64/kernel/stacktrace.c:132 show_stack+0x30/0x40 arch/arm64/kernel/stacktrace.c:196 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b4/0x248 lib/dump_stack.c:118 __kasan_report mm/kasan/report.c:551 [inline] kasan_report+0x18c/0x210 mm/kasan/report.c:564 check_memory_region_inline mm/kasan/generic.c:187 [inline] __asan_load8+0x98/0xc0 mm/kasan/generic.c:253 rq_of kernel/sched/sched.h:1118 [inline] unthrottle_qos_sched_group kernel/sched/fair.c:7619 [inline] free_fair_sched_group+0x124/0x320 kernel/sched/fair.c:12131 sched_free_group kernel/sched/core.c:7767 [inline] sched_create_group+0x48/0xc0 kernel/sched/core.c:7798 cpu_cgroup_css_alloc+0x18/0x40 kernel/sched/core.c:7930 css_create+0x7c/0x4a0 kernel/cgroup/cgroup.c:5328 cgroup_apply_control_enable+0x288/0x340 kernel/cgroup/cgroup.c:3135 cgroup_apply_control kernel/cgroup/cgroup.c:3217 [inline] cgroup_subtree_control_write+0x668/0x8b0 kernel/cgroup/cgroup.c:3375 cgroup_file_write+0x1a8/0x37c kernel/cgroup/cgroup.c:3909 kernfs_fop_write_iter+0x220/0x2f4 fs/kernfs/file.c:296 call_write_iter include/linux/fs.h:1960 [inline] new_sync_write+0x260/0x370 fs/read_write.c:515 vfs_write+0x3dc/0x4ac fs/read_write.c:602 ksys_write+0xfc/0x200 fs/read_write.c:655 __do_sys_write fs/read_write.c:667 [inline] __se_sys_write fs/read_write.c:664 [inline] __arm64_sys_write+0x50/0x60 fs/read_write.c:664 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall arch/arm64/kernel/syscall.c:48 [inline] el0_svc_common.constprop.0+0xf4/0x414 arch/arm64/kernel/syscall.c:155 do_el0_svc+0x50/0x11c arch/arm64/kernel/syscall.c:217 el0_svc+0x20/0x30 arch/arm64/kernel/entry-common.c:353 el0_sync_handler+0xe4/0x1e0 arch/arm64/kernel/entry-common.c:369 el0_sync+0x148/0x180 arch/arm64/kernel/entry.S:683
So add check for tg->cfs_rq[i] before unthrottle_qos_sched_group() called.
Signed-off-by: Hui Tang tanghui20@huawei.com Signed-off-by: Zheng Zucheng zhengzucheng@huawei.com Reviewed-by: Zhang Qiao zhangqiao22@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee3d9e2d9c5b..49818f4fc532 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11827,7 +11827,7 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) { #ifdef CONFIG_QOS_SCHED - if (tg->cfs_rq) + if (tg->cfs_rq && tg->cfs_rq[i]) unthrottle_qos_sched_group(tg->cfs_rq[i]); #endif if (tg->cfs_rq)
From: Hangbin Liu liuhangbin@gmail.com
mainline inclusion from mainline-v5.19-rc1 commit dfed913e8b55a0c2c4906f1242fd38fd9a116e49 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I65V65 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, the kernel drops GSO VLAN tagged packet if it's created with socket(AF_PACKET, SOCK_RAW, 0) plus virtio_net_hdr.
The reason is AF_PACKET doesn't adjust the skb network header if there is a VLAN tag. Then after virtio_net_hdr_set_proto() called, the skb->protocol will be set to ETH_P_IP/IPv6. And in later inet/ipv6_gso_segment() the skb is dropped as network header position is invalid.
Let's handle VLAN packets by adjusting network header position in packet_parse_headers(). The adjustment is safe and does not affect the later xmit as tap device also did that.
In packet_snd(), packet_parse_headers() need to be moved before calling virtio_net_hdr_set_proto(), so we can set correct skb->protocol and network header first.
There is no need to update tpacket_snd() as it calls packet_parse_headers() in tpacket_fill_skb(), which is already before calling virtio_net_hdr_* functions.
skb->no_fcs setting is also moved upper to make all skb settings together and keep consistency with function packet_sendmsg_spkt().
Signed-off-by: Hangbin Liu liuhangbin@gmail.com Acked-by: Willem de Bruijn willemb@google.com Acked-by: Michael S. Tsirkin mst@redhat.com Link: https://lore.kernel.org/r/20220425014502.985464-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: Baisong Zhong zhongbaisong@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/packet/af_packet.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5ee600d108a0..5d8b432ebad7 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1885,12 +1885,20 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { + int depth; + if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); }
+ /* Move network header to the right position for VLAN tagged packets */ + if (likely(skb->dev->type == ARPHRD_ETHER) && + eth_type_vlan(skb->protocol) && + __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); + skb_probe_transport_header(skb); }
@@ -3006,6 +3014,11 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time;
+ if (unlikely(extra_len == 4)) + skb->no_fcs = 1; + + packet_parse_headers(skb, sock); + if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) @@ -3014,11 +3027,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) virtio_net_hdr_set_proto(skb, &vnet_hdr); }
- packet_parse_headers(skb, sock); - - if (unlikely(extra_len == 4)) - skb->no_fcs = 1; - err = po->xmit(skb); if (unlikely(err != 0)) { if (err > 0)
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.19-rc1 commit e9d3f80935b6607dcdc5682b00b1d4b28e0a0c5d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I65V65 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
GSO assumes skb->head contains link layer headers.
tun device in some case can provide base 14 bytes, regardless of VLAN being used or not.
After blamed commit, we can end up setting a network header offset of 18+, we better pull the missing bytes to avoid a posible crash in GSO.
syzbot report was: kernel BUG at include/linux/skbuff.h:2699! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 3601 Comm: syz-executor210 Not tainted 5.18.0-syzkaller-11338-g2c5ca23f7414 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__skb_pull include/linux/skbuff.h:2699 [inline] RIP: 0010:skb_mac_gso_segment+0x48f/0x530 net/core/gro.c:136 Code: 00 48 c7 c7 00 96 d4 8a c6 05 cb d3 45 06 01 e8 26 bb d0 01 e9 2f fd ff ff 49 c7 c4 ea ff ff ff e9 f1 fe ff ff e8 91 84 19 fa <0f> 0b 48 89 df e8 97 44 66 fa e9 7f fd ff ff e8 ad 44 66 fa e9 48 RSP: 0018:ffffc90002e2f4b8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000012 RCX: 0000000000000000 RDX: ffff88805bb58000 RSI: ffffffff8760ed0f RDI: 0000000000000004 RBP: 0000000000005dbc R08: 0000000000000004 R09: 0000000000000fe0 R10: 0000000000000fe4 R11: 0000000000000000 R12: 0000000000000fe0 R13: ffff88807194d780 R14: 1ffff920005c5e9b R15: 0000000000000012 FS: 000055555730f300(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200015c0 CR3: 0000000071ff8000 CR4: 0000000000350ee0 Call Trace: <TASK> __skb_gso_segment+0x327/0x6e0 net/core/dev.c:3411 skb_gso_segment include/linux/netdevice.h:4749 [inline] validate_xmit_skb+0x6bc/0xf10 net/core/dev.c:3669 validate_xmit_skb_list+0xbc/0x120 net/core/dev.c:3719 sch_direct_xmit+0x3d1/0xbe0 net/sched/sch_generic.c:327 __dev_xmit_skb net/core/dev.c:3815 [inline] __dev_queue_xmit+0x14a1/0x3a00 net/core/dev.c:4219 packet_snd net/packet/af_packet.c:3071 [inline] packet_sendmsg+0x21cb/0x5550 net/packet/af_packet.c:3102 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2492 ___sys_sendmsg+0xf3/0x170 net/socket.c:2546 __sys_sendmsg net/socket.c:2575 [inline] __do_sys_sendmsg net/socket.c:2584 [inline] __se_sys_sendmsg net/socket.c:2582 [inline] __x64_sys_sendmsg+0x132/0x220 net/socket.c:2582 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f4b95da06c9 Code: 28 c3 e8 4a 15 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffd7defc4c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007ffd7defc4f0 RCX: 00007f4b95da06c9 RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000003 RBP: 0000000000000003 R08: bb1414ac00000050 R09: bb1414ac00000050 R10: 0000000000000004 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd7defc4e0 R14: 00007ffd7defc4d8 R15: 00007ffd7defc4d4 </TASK>
Fixes: dfed913e8b55 ("net/af_packet: add VLAN support for AF_PACKET SOCK_RAW GSO") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Acked-by: Hangbin Liu liuhangbin@gmail.com Acked-by: Willem de Bruijn willemb@google.com Cc: Michael S. Tsirkin mst@redhat.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Baisong Zhong zhongbaisong@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/packet/af_packet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5d8b432ebad7..b61a2c39e2ac 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1896,8 +1896,10 @@ static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) - skb_set_network_header(skb, depth); + __vlan_get_protocol(skb, skb->protocol, &depth) != 0) { + if (pskb_may_pull(skb, depth)) + skb_set_network_header(skb, depth); + }
skb_probe_transport_header(skb); }
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: 187345, https://gitee.com/openeuler/kernel/issues/I5L5ZG CVE: NA
--------------------------------
Otherwise, null pointer crash can be triggered to handle bio in blk_mq_submit_bio() while queue is not initialized.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b97defbe21bb..335eef5c3c05 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2149,12 +2149,16 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
switch (type) { case DM_TYPE_REQUEST_BASED: - md->disk->fops = &dm_rq_blk_dops; r = dm_mq_init_request_queue(md, t); if (r) { DMERR("Cannot initialize queue for request-based dm mapped device"); return r; } + /* + * Change the fops after queue is initialized, so that bio won't + * issued by rq-based path until that. + */ + md->disk->fops = &dm_rq_blk_dops; break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED:
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: 187345, https://gitee.com/openeuler/kernel/issues/I5L5ZG CVE: NA
--------------------------------
Commit 9f8da7587362 ("[Huawei] block: fix that part scan is disabled in device_add_disk()") introduce a regression:
Test procedures: dmsetup create test --notable dmsetup remove test
Test result: dmsetup will stuck forever
Root cause: before: 1) dmsetup creat add_disk_add_disk_no_queue_reg() scan partitions uevent 2) blk_register_queue -> notable will not call this 3) dmsetup remove wait for uevent
after: 1) dmsetup creat add_disk_add_disk_no_queue_reg() 2) blk_register_queue() -> notable will not call this scan_partitions uevent 3) dmsetup remove wait for uevent -> impossible for notable
Fix the problem by moving scan_partitions and uevent from blk_register_queue() to the end of add_disk_add_disk_no_queue_reg().
Fixes: 9f8da7587362 ("block: fix that part scan is disabled in device_add_disk()") Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-sysfs.c | 45 --------------------------------------------- block/genhd.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 45 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 990a3356ee8a..780f02cbda84 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,38 +821,6 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, };
-static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - -static void disk_init_partition(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -942,22 +910,9 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock);
- - /* - * Set the flag at last, so that block devcie can't be opened - * before it's registration is done. - */ - disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); - /* - * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep - * will be confused because it will treat 'bd_mutex' from different - * devices as the same lock. - */ - if (!ret) - disk_init_partition(disk);
return ret; } diff --git a/block/genhd.c b/block/genhd.c index 4f6a0be74d85..a5a6840f2e85 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -736,6 +736,38 @@ static void register_disk(struct device *parent, struct gendisk *disk, } }
+static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk @@ -814,6 +846,13 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk_add_events(disk); blk_integrity_add(disk); + + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. + */ + disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); }
void device_add_disk(struct device *parent, struct gendisk *disk,