From: Tang Yizhou <tangyizhou@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------

1. Fix error handling in sp_free().

2. When sp_alloc(..., SP_HUGEPAGE, ...) falls back to normal pages, we need to call vfs_fallocate() on the hugepage file; otherwise the already-instantiated hugepage memory leaks until the sp group dies (see the sketch after this list).

3. When sp_alloc(..., SP_HUGEPAGE, ...) falls back to normal pages, we also need to clear the SP_HUGEPAGE bit in sp_flags so that the spa_stat interface correctly shows the spa as a normal-page spa.

4. Take the reference count of an spg inside find_or_alloc_sp_group(), since that is where the group is looked up and the refcount logically belongs.
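The leak in item 2 comes from hugepage blocks that were instantiated in the group's hugepage file before the fallback; punching a hole is what returns them. As a reference, a minimal user-space sketch of the same FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE combination the rollback uses (the file path is made up for the demo; the kernel side goes through vfs_fallocate() instead):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	int fd = open("/tmp/spa_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
  	char buf[4096] = { 0 };

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}
  	/* instantiate one page of backing store */
  	if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
  		perror("write");
  		return 1;
  	}
  	/*
  	 * Punch the page back out. KEEP_SIZE leaves the file length
  	 * untouched; only the backing blocks are released, which is
  	 * exactly what the rollback path needs for the hugepage file.
  	 */
  	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
  		      0, sizeof(buf)))
  		perror("fallocate");
  	close(fd);
  	return 0;
  }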
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index d39c2c3d728c..a71f6fb214ce 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -358,7 +358,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
 	spg->hugepage_failures = 0;
 	spg->dvpp_multi_spaces = false;
 	spg->owner = current->group_leader;
-	atomic_set(&spg->use_count, 0);
+	atomic_set(&spg->use_count, 1);
 	INIT_LIST_HEAD(&spg->procs);
 	INIT_LIST_HEAD(&spg->spa_list);
 
@@ -391,6 +391,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
 			ret = PTR_ERR(spg->file_hugetlb);
 			goto out_fput;
 		}
+	} else {
+		if (!spg_valid(spg))
+			return ERR_PTR(-ENODEV);
+		atomic_inc(&spg->use_count);
 	}
 
 	return spg;
@@ -540,12 +544,6 @@ int sp_group_add_task(int pid, int spg_id)
 		goto out_put_task;
 	}
 
-	if (!spg_valid(spg)) {
-		ret = -ENODEV;
-		goto out_put_task;
-	}
-	atomic_inc(&spg->use_count);
-
 	/* access control permission check */
 	if (sysctl_ac_mode == AC_SINGLE_OWNER) {
 		if (spg->owner != current->group_leader) {
@@ -1102,6 +1100,7 @@ int sp_free(unsigned long addr)
 		if (printk_ratelimit())
 			pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n",
 			       (void *)addr);
+		goto drop_spa;
 	}
 
 	if (!spg_valid(spa->spg))
@@ -1312,31 +1311,32 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 			__sp_free(spg, sp_addr, size_aligned,
 				  list_next_entry(mm, sp_node));
 
+			if (printk_ratelimit())
+				pr_warn("share pool: allocation failed due to mm populate failed"
+					"(potential no enough memory when -12): %d\n", ret);
+			p = ERR_PTR(ret);
+
+			mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
+			offset = sp_addr - MMAP_SHARE_POOL_START;
+
+			ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa));
+			if (ret)
+				pr_err("share pool: sp alloc normal page fallocate failed %d\n", ret);
+
 			if (file == spg->file_hugetlb) {
 				spg->hugepage_failures++;
 
 				/* fallback to small pages */
 				if (!(sp_flags & SP_HUGEPAGE_ONLY)) {
 					file = spg->file;
-					spa->is_hugepage = false;
 					size_aligned = ALIGN(size, PAGE_SIZE);
+					sp_flags &= ~SP_HUGEPAGE;
 					__sp_area_drop(spa);
 					mmput(mm);
 					goto try_again;
 				}
 			}
 
-			if (printk_ratelimit())
-				pr_warn("share pool: allocation failed due to mm populate failed"
-					"(potential no enough memory when -12): %d\n", ret);
-			p = ERR_PTR(ret);
-
-			mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
-			offset = sp_addr - MMAP_SHARE_POOL_START;
-			ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa));
-			if (ret)
-				pr_err("share pool: fallocate failed %d\n", ret);
-
 			mmput(mm);
 			break;
 		}
From: Tang Yizhou <tangyizhou@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------

Since we must use "%pK" when printing addresses for security reasons, the memory-leak debug printing is useless and can be removed.

Additionally, change the permission mode of spa_stat and proc_stat to 0400 for the same security reason.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/char/svm.c |  4 ----
 mm/share_pool.c    | 44 +++++++++----------------------------------
 2 files changed, 9 insertions(+), 39 deletions(-)
diff --git a/drivers/char/svm.c b/drivers/char/svm.c
index d36246910925..4cf6f75f7459 100644
--- a/drivers/char/svm.c
+++ b/drivers/char/svm.c
@@ -1919,8 +1919,6 @@ static unsigned long svm_sp_alloc_mem(unsigned long __user *arg)
 		return EFAULT;
 	}
 
-	pr_notice("svm: [sp alloc] caller %s(%d/%d); return addr 0x%pK, size %lu\n",
-		  current->comm, current->tgid, current->pid, addr, spallocinfo.size);
 	sp_dump_stack();
 
 	spallocinfo.addr = (uintptr_t)addr;
@@ -1960,8 +1958,6 @@ static int svm_sp_free_mem(unsigned long __user *arg)
 		return -EFAULT;
 	}
 
-	pr_notice("svm: [sp free] caller %s(%d/%d); addr 0x%pK\n",
-		  current->comm, current->tgid, current->pid, (void *)spallocinfo.addr);
 	sp_dump_stack();
 
 	return 0;
diff --git a/mm/share_pool.c b/mm/share_pool.c
index a71f6fb214ce..215c5c0b0b77 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -1106,10 +1106,6 @@ int sp_free(unsigned long addr)
 	if (!spg_valid(spa->spg))
 		goto drop_spa;
 
-	pr_notice("share pool: [sp free] caller %s(%d/%d); "
-		  "group id %d addr 0x%pK, size %ld\n",
-		  current->comm, current->tgid, current->pid, spa->spg->id,
-		  (void *)spa->va_start, spa->real_size);
 	sp_dump_stack();
 
 	__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
@@ -1356,13 +1352,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 	if (spa && !IS_ERR(spa))
 		__sp_area_drop(spa);
 
-	if (!IS_ERR(p)) {
-		pr_notice("share pool: [sp alloc] caller %s(%d/%d); group id %d; "
-			  "return addr 0x%pK, size %ld\n",
-			  current->comm, current->tgid, current->pid, spa->spg->id,
-			  (void *)spa->va_start, spa->real_size);
-		sp_dump_stack();
-	}
+	sp_dump_stack();
 	return p;
 }
 EXPORT_SYMBOL_GPL(sp_alloc);
@@ -1572,7 +1562,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	unsigned long kva_aligned;
 	unsigned long size_aligned;
 	unsigned int page_size = PAGE_SIZE;
-	enum spa_type type;
 	int ret;
 	struct vm_struct *area;
 
@@ -1599,14 +1588,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	mutex_lock(&sp_mutex);
 	spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
 	if (spg == NULL) {
-		type = SPA_TYPE_K2TASK;
+		/* k2u to task */
 		if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) {
 			mutex_unlock(&sp_mutex);
 			if (printk_ratelimit())
 				pr_err("share pool: k2task invalid spg id %d\n", spg_id);
 			return ERR_PTR(-EINVAL);
 		}
-		spa = sp_alloc_area(size_aligned, sp_flags, NULL, type);
+		spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK);
 		if (IS_ERR(spa)) {
 			mutex_unlock(&sp_mutex);
 			if (printk_ratelimit())
@@ -1618,14 +1607,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 		uva = sp_make_share_kva_to_task(kva_aligned, spa, pid);
 		mutex_unlock(&sp_mutex);
 	} else if (spg_valid(spg)) {
-		type = SPA_TYPE_K2SPG;
+		/* k2u to group */
 		if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) {
 			mutex_unlock(&sp_mutex);
 			if (printk_ratelimit())
 				pr_err("share pool: k2spg invalid spg id %d\n", spg_id);
 			return ERR_PTR(-EINVAL);
 		}
-		spa = sp_alloc_area(size_aligned, sp_flags, spg, type);
+		spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG);
 		if (IS_ERR(spa)) {
 			mutex_unlock(&sp_mutex);
 			if (printk_ratelimit())
@@ -1649,20 +1638,13 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	__sp_area_drop(spa);
 
 	if (!IS_ERR(uva)) {
-		if (spg_valid(spa->spg))
-			spg_id = spa->spg->id;
-		pr_notice("share pool: [sp k2u type %d] caller %s(%d/%d); group id %d; "
-			  "return addr 0x%pK size %ld\n",
-			  type, current->comm, current->tgid, current->pid, spg_id,
-			  (void *)spa->va_start, spa->real_size);
-		sp_dump_stack();
-
 		/* associate vma and spa */
 		area = find_vm_area((void *)kva);
 		if (area)
 			area->flags |= VM_SHAREPOOL;
 		spa->kva = kva;
 	}
+	sp_dump_stack();
 
 	return uva;
 }
@@ -2051,15 +2033,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 		__sp_free(spa->spg, uva_aligned, size_aligned, NULL);
 	}
 
-	if (!ret) {
-		if (spg_valid(spa->spg))
-			spg_id = spa->spg->id;
-		pr_notice("share pool: [sp unshare uva type %d] caller %s(%d/%d); "
-			  "group id %d addr 0x%pK size %ld\n",
-			  spa->type, current->comm, current->tgid, current->pid,
-			  spg_id, (void *)spa->va_start, spa->real_size);
-		sp_dump_stack();
-	}
+	sp_dump_stack();
 
 out_drop_area:
 	/* deassociate vma and spa */
@@ -2538,8 +2512,8 @@ void __init proc_sharepool_init(void)
 	if (!proc_mkdir("sharepool", NULL))
 		return;
 
-	proc_create_single_data("sharepool/proc_stat", 0, NULL, proc_stat_show, NULL);
-	proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL);
+	proc_create_single_data("sharepool/proc_stat", S_IRUSR, NULL, proc_stat_show, NULL);
+	proc_create_single_data("sharepool/spa_stat", S_IRUSR, NULL, spa_stat_show, NULL);
 }
 
 struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Tang Yizhou <t00467064@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------

If an spg id is generated automatically, we must ensure it is freed when something fails in sp_group_add_task().

Note that the spg id is bound to a struct sp_group once find_or_alloc_sp_group() succeeds, and both the spg and its id are then freed in __sp_group_drop_locked(). So we only need to free spg_id explicitly on the failure paths before that binding is established.
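The underlying pattern is plain ida error-path hygiene: whoever allocated the id frees it on every failure path until ownership is handed over. A minimal in-kernel sketch (the demo_* names are hypothetical; only the ida calls are the real kernel API):

  #include <linux/errno.h>
  #include <linux/idr.h>

  static DEFINE_IDA(demo_ida);

  static int demo_attach(int id)	/* hypothetical step that may fail */
  {
  	return id > 50 ? -ENODEV : 0;
  }

  static int demo_setup(void)
  {
  	int ret, id = ida_alloc_range(&demo_ida, 1, 99, GFP_KERNEL);

  	if (id < 0)
  		return id;

  	ret = demo_attach(id);
  	if (ret) {
  		/* id is not yet bound to any object: we still own it */
  		ida_free(&demo_ida, id);
  		return ret;
  	}
  	/* after binding, the object's teardown path frees the id */
  	return 0;
  }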
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 215c5c0b0b77..3d0266490613 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -268,15 +268,20 @@ static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp)
 static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
 			     struct sp_area *spa, unsigned long *populate);
 
+static void free_sp_group_id(unsigned int spg_id)
+{
+	if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) ||
+	    (spg_id >= SPG_ID_DVPP_PASS_THROUGH_MIN &&
+	     spg_id <= SPG_ID_DVPP_PASS_THROUGH_MAX))
+		ida_free(&sp_group_id_ida, spg_id);
+}
+
 static void free_sp_group(struct sp_group *spg)
 {
 	fput(spg->file);
 	fput(spg->file_hugetlb);
 	idr_remove(&sp_group_idr, spg->id);
-	if ((spg->id >= SPG_ID_AUTO_MIN && spg->id <= SPG_ID_AUTO_MAX) ||
-	    (spg->id >= SPG_ID_DVPP_PASS_THROUGH_MIN &&
-	     spg->id <= SPG_ID_DVPP_PASS_THROUGH_MAX))
-		ida_free(&sp_group_id_ida, (unsigned int)spg->id);
+	free_sp_group_id((unsigned int)spg->id);
 	kfree(spg);
 }
 
@@ -535,12 +540,15 @@ int sp_group_add_task(int pid, int spg_id)
 		get_task_struct(tsk);
 
 	rcu_read_unlock();
-	if (ret)
+	if (ret) {
+		free_sp_group_id((unsigned int)spg_id);
 		goto out_unlock;
+	}
 
 	spg = find_or_alloc_sp_group(spg_id);
 	if (IS_ERR(spg)) {
 		ret = PTR_ERR(spg);
+		free_sp_group_id((unsigned int)spg_id);
 		goto out_put_task;
 	}
 
From: Xu Qiang <xuqiang36@huawei.com>
mainline inclusion
from mainline-v5.10-rc6
commit 74cde1a53368aed4f2b4b54bf7030437f64a534b
category: bugfix
bugzilla: NA
CVE: NA
---------------------------------------
On systems without HW-based collections (i.e. anything except GIC-500), we rely on firmware to perform the ITS save/restore. This doesn't really work, as although FW can properly save everything, it cannot fully restore the state of the command queue (the read-side is reset to the head of the queue). This results in the ITS consuming previously processed commands, potentially corrupting the state.
Instead, let's always save the ITS state on suspend, disabling it in the process, and restore the full state on resume. This saves us from broken FW as long as it doesn't enable the ITS by itself (for which we can't do anything).
This amounts to simply dropping the ITS_FLAGS_SAVE_SUSPEND_STATE.
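In outline, suspend now always saves GITS_CTLR and disables the ITS, and resume warns if firmware handed the ITS back enabled before quiescing it again. A condensed sketch of the resulting flow (simplified from the driver; the BASERn/CBASER restore and most error handling are elided):

  /* suspend: unconditionally save the control state and disable the ITS */
  static int its_save_disable(void)
  {
  	struct its_node *its;
  	int err = 0;

  	raw_spin_lock(&its_lock);
  	list_for_each_entry(its, &its_nodes, entry) {
  		its->ctlr_save = readl_relaxed(its->base + GITS_CTLR);
  		err = its_force_quiescent(its->base); /* clears the enable bit */
  		if (err)
  			break;
  	}
  	raw_spin_unlock(&its_lock);
  	return err;
  }

  /* resume: the ITS must still be disabled; broken FW may have enabled it */
  static void its_restore_enable(void)
  {
  	struct its_node *its;

  	raw_spin_lock(&its_lock);
  	list_for_each_entry(its, &its_nodes, entry) {
  		WARN_ON(readl_relaxed(its->base + GITS_CTLR) & GITS_CTLR_ENABLE);
  		if (its_force_quiescent(its->base))
  			continue;
  		/* ... reprogram CBASER/BASERn, then write ctlr_save back ... */
  	}
  	raw_spin_unlock(&its_lock);
  }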
Signed-off-by: Xu Qiang <xuqiang36@huawei.com>
[maz: added warning on resume, rewrote commit message]
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20201107104226.14282-1-xuqiang36@huawei.com
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/irqchip/irq-gic-v3-its.c | 30 +++---------------------------
 1 file changed, 3 insertions(+), 27 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index f95c8ccff768..03b720aaa054 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -54,8 +54,6 @@
 #define ITS_FLAGS_CMDQ_NEEDS_FLUSHING		(1ULL << 0)
 #define ITS_FLAGS_WORKAROUND_CAVIUM_22375	(1ULL << 1)
 #define ITS_FLAGS_WORKAROUND_CAVIUM_23144	(1ULL << 2)
-#define ITS_FLAGS_SAVE_SUSPEND_STATE		(1ULL << 3)
-#define ITS_FLAGS_SAVE_HIBERNATE_STATE		(1ULL << 4)
 
 #define RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING	(1 << 0)
 #define RDIST_FLAGS_RD_TABLES_PREALLOCATED	(1 << 1)
@@ -3617,17 +3615,6 @@ static int its_save_disable(void)
 	raw_spin_lock(&its_lock);
 	list_for_each_entry(its, &its_nodes, entry) {
 		void __iomem *base;
-		u64 flags;
-
-		if (system_in_hibernation())
-			its->flags |= ITS_FLAGS_SAVE_HIBERNATE_STATE;
-
-		flags = its->flags;
-		flags &= (ITS_FLAGS_SAVE_SUSPEND_STATE |
-			  ITS_FLAGS_SAVE_HIBERNATE_STATE);
-
-		if (!flags)
-			continue;
 
 		base = its->base;
 		its->ctlr_save = readl_relaxed(base + GITS_CTLR);
@@ -3647,9 +3634,6 @@ static int its_save_disable(void)
 		list_for_each_entry_continue_reverse(its, &its_nodes, entry) {
 			void __iomem *base;
 
-			if (!(its->flags & ITS_FLAGS_SAVE_SUSPEND_STATE))
-				continue;
-
 			base = its->base;
 			writel_relaxed(its->ctlr_save, base + GITS_CTLR);
 		}
@@ -3667,16 +3651,8 @@ static void its_restore_enable(void)
 	raw_spin_lock(&its_lock);
 	list_for_each_entry(its, &its_nodes, entry) {
 		void __iomem *base;
-		u64 flags;
 		int i;
 
-		flags = its->flags;
-		flags &= (ITS_FLAGS_SAVE_SUSPEND_STATE |
-			  ITS_FLAGS_SAVE_HIBERNATE_STATE);
-		if (!flags)
-			continue;
-
-		its->flags &= ~ITS_FLAGS_SAVE_HIBERNATE_STATE;
 		base = its->base;
 
 		/*
@@ -3684,7 +3660,10 @@ static void its_restore_enable(void)
 		 * don't restore it since writing to CBASER or BASER<n>
 		 * registers is undefined according to the GIC v3 ITS
 		 * Specification.
+		 *
+		 * Firmware resuming with the ITS enabled is terminally broken.
 		 */
+		WARN_ON(readl_relaxed(base + GITS_CTLR) & GITS_CTLR_ENABLE);
 		ret = its_force_quiescent(base);
 		if (ret) {
 			pr_err("ITS@%pa: failed to quiesce on resume: %d\n",
@@ -3950,9 +3929,6 @@ static int __init its_probe_one(struct resource *res,
 	ctlr |= GITS_CTLR_ImDe;
 	writel_relaxed(ctlr, its->base + GITS_CTLR);
 
-	if (GITS_TYPER_HCC(typer))
-		its->flags |= ITS_FLAGS_SAVE_SUSPEND_STATE;
-
 	err = its_init_domain(handle, its);
 	if (err)
 		goto out_free_tables;
From: Victor Gladkov <victor.gladkov@kioxia.com>
driver inclusion
category: bugfix
bugzilla: NA
CVE: NA
Link: https://gitee.com/openeuler/kernel/issues/I1WGZE
-------------------------------------------------
Commands get stuck while the host NVMe-oF controller is in the reconnect state. The NVMe controller enters the reconnect state when it loses the connection with the target, and it then tries to reconnect every 10 seconds until it succeeds or the reconnect timeout is reached. The default reconnect timeout is 10 minutes.

Applications expect commands to complete with success or error within a certain timeout (30 seconds by default). The NVMe host enforces that timeout while it is connected; nevertheless, during reconnection the timeout is not enforced and commands may get stuck for a long period or even forever.

To fix this long delay due to the default timeout, we introduce a new session parameter, "fast_io_fail_tmo". The timeout is measured in seconds from the start of the controller reconnect; any command outstanding beyond that timeout is rejected. The new parameter value may be passed during 'connect'. The default value of -1 means no timeout (similar to the current behavior).

We add a new controller flag, NVME_CTRL_FAILFAST_EXPIRED, and a corresponding delayed work item that sets it.

When the controller enters the CONNECTING state, we schedule the delayed work based on the failfast timeout value. On a transition out of CONNECTING, we cancel the delayed work item and make sure the flag is cleared. If the delayed work item expires, the NVME_CTRL_FAILFAST_EXPIRED flag is set.

We also update the nvmf_fail_nonready_command() and nvme_available_path() functions to check the NVME_CTRL_FAILFAST_EXPIRED controller flag.
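With the new token wired into opt_tokens, the parameter rides along in the options string that a user writes to /dev/nvme-fabrics. A minimal user-space sketch of such a connect (the transport, addresses, and NQN are placeholders for a real target; requires root):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
  	/* placeholder target; adjust transport/traddr/nqn for a real setup */
  	const char *opts =
  		"transport=rdma,traddr=192.168.1.10,trsvcid=4420,"
  		"nqn=nqn.2020-01.io.example:subsys1,fast_io_fail_tmo=30";
  	int fd = open("/dev/nvme-fabrics", O_RDWR);

  	if (fd < 0) {
  		perror("open /dev/nvme-fabrics");
  		return 1;
  	}
  	/* the fabrics layer parses this via nvmf_parse_options() */
  	if (write(fd, opts, strlen(opts)) < 0)
  		perror("connect");
  	close(fd);
  	return 0;
  }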
Signed-off-by: Victor Gladkov <victor.gladkov@kioxia.com>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Chao Leng <lengchao@huawei.com>
Reviewed-by: Jike Cheng <chengjike.cheng@huawei.com>
Signed-off-by: Ruozhu Li <liruozhu@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/nvme/host/core.c      | 51 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/fabrics.c   | 28 ++++++++++++++++---
 drivers/nvme/host/fabrics.h   |  5 ++++
 drivers/nvme/host/multipath.c |  2 ++
 drivers/nvme/host/nvme.h      |  3 +++
 5 files changed, 83 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c9faa824de26..5f500ee424a3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -129,6 +129,37 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl)
 	queue_work(nvme_wq, &ctrl->scan_work);
 }
 
+static void nvme_failfast_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+			struct nvme_ctrl, failfast_work);
+
+	if (ctrl->state != NVME_CTRL_CONNECTING)
+		return;
+
+	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+	dev_info(ctrl->device, "failfast expired\n");
+	nvme_kick_requeue_lists(ctrl);
+}
+
+static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
+		return;
+
+	schedule_delayed_work(&ctrl->failfast_work,
+			      ctrl->opts->fast_io_fail_tmo * HZ);
+}
+
+static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
+{
+	if (!ctrl->opts)
+		return;
+
+	cancel_delayed_work_sync(&ctrl->failfast_work);
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
+}
+
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
 {
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -384,8 +415,21 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 	ctrl->state = new_state;
 
 	spin_unlock_irqrestore(&ctrl->lock, flags);
-	if (changed && ctrl->state == NVME_CTRL_LIVE)
-		nvme_kick_requeue_lists(ctrl);
+	if (changed) {
+		switch (ctrl->state) {
+		case NVME_CTRL_LIVE:
+			if (old_state == NVME_CTRL_CONNECTING)
+				nvme_stop_failfast_work(ctrl);
+			nvme_kick_requeue_lists(ctrl);
+			break;
+		case NVME_CTRL_CONNECTING:
+			if (old_state == NVME_CTRL_RESETTING)
+				nvme_start_failfast_work(ctrl);
+			break;
+		default:
+			break;
+		}
+	}
 	return changed;
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
@@ -3696,6 +3740,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
 	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
+	nvme_stop_failfast_work(ctrl);
 	flush_work(&ctrl->async_event_work);
 	cancel_work_sync(&ctrl->fw_act_work);
 	if (ctrl->ops->stop_ctrl)
@@ -3761,6 +3806,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	int ret;
 
 	ctrl->state = NVME_CTRL_NEW;
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
@@ -3776,6 +3822,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
 
 	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
 			PAGE_SIZE);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index a509d90e520f..86e344b4ee46 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -550,6 +550,7 @@ blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
 {
 	if (ctrl->state != NVME_CTRL_DELETING &&
 	    ctrl->state != NVME_CTRL_DEAD &&
+	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
 	    !blk_noretry_request(rq) &&
 	    !(rq->cmd_flags & REQ_NVME_MPATH))
 		return BLK_STS_RESOURCE;
@@ -606,6 +607,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_HOST_TRADDR,		"host_traddr=%s"	},
 	{ NVMF_OPT_HOST_ID,		"hostid=%s"		},
 	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
+	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -625,6 +627,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
 	opts->kato = NVME_DEFAULT_KATO;
 	opts->duplicate_connect = false;
+	opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO;
 
 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
@@ -749,6 +752,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 				pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
 			ctrl_loss_tmo = token;
 			break;
+		case NVMF_OPT_FAIL_FAST_TMO:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			if (token >= 0)
+				pr_warn("I/O will fail on after %d sec reconnect\n",
+					token);
+			opts->fast_io_fail_tmo = token;
+			break;
 		case NVMF_OPT_HOSTNQN:
 			if (opts->host) {
 				pr_err("hostnqn already user-assigned: %s\n",
@@ -829,11 +843,17 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		opts->nr_io_queues = 0;
 		opts->duplicate_connect = true;
 	}
-	if (ctrl_loss_tmo < 0)
+
+	if (ctrl_loss_tmo < 0) {
 		opts->max_reconnects = -1;
-	else
+	} else {
 		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
 						opts->reconnect_delay);
+		if (ctrl_loss_tmo < opts->fast_io_fail_tmo)
+			pr_warn("failfast tmo (%d) > ctrl_loss_tmo (%d)\n",
+				opts->fast_io_fail_tmo,
+				ctrl_loss_tmo);
+	}
 
 	if (!opts->host) {
 		kref_get(&nvmf_default_host->ref);
@@ -902,8 +922,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 #define NVMF_REQUIRED_OPTS	(NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
 #define NVMF_ALLOWED_OPTS	(NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
-				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
-
+				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
+				 NVMF_OPT_FAIL_FAST_TMO)
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
 {
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 188ebbeec32c..a7a3100714b1 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -24,6 +24,8 @@
 /* default to 600 seconds of reconnect attempts before giving up */
 #define NVMF_DEF_CTRL_LOSS_TMO		600
 #define NVMF_DEF_RECONNECT_FOREVER	-1
+/* set default fail fast timeout to 150s */
+#define NVMF_DEF_FAIL_FAST_TMO		150
 
 /*
  * Define a host as seen by the target. We allocate one at boot, but also
@@ -59,6 +61,7 @@ enum {
 	NVMF_OPT_CTRL_LOSS_TMO	= 1 << 11,
 	NVMF_OPT_HOST_ID	= 1 << 12,
 	NVMF_OPT_DUP_CONNECT	= 1 << 13,
+	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
 };
 
 /**
@@ -86,6 +89,7 @@ enum {
 * @max_reconnects: maximum number of allowed reconnect attempts before removing
 *	the controller, (-1) means reconnect forever, zero means remove
 *	immediately;
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
 */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -102,6 +106,7 @@ struct nvmf_ctrl_options {
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	int			fast_io_fail_tmo;
 };
 
 /*
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e1fc9ffbd3ee..8df0bf238455 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -199,6 +199,8 @@ static bool nvme_available_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
+			continue;
 		switch (ns->ctrl->state) {
 		case NVME_CTRL_LIVE:
 		case NVME_CTRL_RESETTING:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 5699397d3a5d..04c2d9ffd004 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -204,6 +204,7 @@ struct nvme_ctrl {
 	struct work_struct scan_work;
 	struct work_struct async_event_work;
 	struct delayed_work ka_work;
+	struct delayed_work failfast_work;
 	struct nvme_command ka_cmd;
 	struct work_struct fw_act_work;
 	unsigned long events;
@@ -239,6 +240,8 @@ struct nvme_ctrl {
 	u16 icdoff;
 	u16 maxcmd;
 	int nr_reconnects;
+	unsigned long flags;
+#define NVME_CTRL_FAILFAST_EXPIRED	0
 	struct nvmf_ctrl_options *opts;
 
 	struct page *discard_page;
From: Suzuki K Poulose <suzuki.poulose@arm.com>
mainline inclusion
from mainline-v5.0-rc1
commit c9460dcb06ee68af1c75f9232603ece071901abe
category: bugfix
bugzilla: 46773
CVE: NA
References: https://gitee.com/src-openeuler/kernel/issues/I235Y8
---------------------------
We have two entries for the ARM64_WORKAROUND_CLEAN_CACHE capability:

1) ARM Errata 826319, 827319, 824069, 819472 on A53 r0p[012]
2) ARM Errata 819472 on A53 r0p[01]

Both have the same workaround. Merge these entries to avoid duplicate entries for a single capability. Add a new Kconfig entry that controls the capability entry, which makes it easier to handle combinations of the CONFIGs.
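Distilled from the cpu_errata.c hunk below, the merge pattern is one Kconfig-gated midr_range list referenced by a single capability entry (a condensed sketch of the resulting table, not a standalone program):

  #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
  /* one midr list covers every affected Cortex-A53 revision range */
  static const struct midr_range workaround_clean_cache[] = {
  	MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 2),	/* r0p0 - r0p2 */
  	MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 1),	/* r0p0 - r0p1 */
  	{},
  };
  #endif

  const struct arm64_cpu_capabilities arm64_errata[] = {
  #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
  	{	/* a single entry replaces the former duplicates */
  		.desc = "ARM errata 826319, 827319, 824069, 819472",
  		.capability = ARM64_WORKAROUND_CLEAN_CACHE,
  		ERRATA_MIDR_RANGE_LIST(workaround_clean_cache),
  		.cpu_enable = cpu_enable_cache_maint_trap,
  	},
  #endif
  	{}
  };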
Cc: Will Deacon <will.deacon@arm.com>
Cc: Andre Przywara <andre.przywara@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig               |  7 +++++++
 arch/arm64/include/asm/cputype.h |  1 +
 arch/arm64/kernel/cpu_errata.c   | 28 ++++++++++++++++------------
 3 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index be2f23c8fdf9..c22d1d187f16 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -322,9 +322,13 @@ menu "Kernel Features"
 
 menu "ARM errata workarounds via the alternatives framework"
 
+config ARM64_WORKAROUND_CLEAN_CACHE
+	def_bool n
+
 config ARM64_ERRATUM_826319
 	bool "Cortex-A53: 826319: System might deadlock if a write cannot complete until read data is accepted"
 	default y
+	select ARM64_WORKAROUND_CLEAN_CACHE
 	help
 	  This option adds an alternative code sequence to work around ARM
 	  erratum 826319 on Cortex-A53 parts up to r0p2 with an AMBA 4 ACE or
@@ -346,6 +350,7 @@ config ARM64_ERRATUM_826319
 config ARM64_ERRATUM_827319
 	bool "Cortex-A53: 827319: Data cache clean instructions might cause overlapping transactions to the interconnect"
 	default y
+	select ARM64_WORKAROUND_CLEAN_CACHE
 	help
 	  This option adds an alternative code sequence to work around ARM
 	  erratum 827319 on Cortex-A53 parts up to r0p2 with an AMBA 5 CHI
@@ -367,6 +372,7 @@ config ARM64_ERRATUM_827319
 config ARM64_ERRATUM_824069
 	bool "Cortex-A53: 824069: Cache line might not be marked as clean after a CleanShared snoop"
 	default y
+	select ARM64_WORKAROUND_CLEAN_CACHE
 	help
 	  This option adds an alternative code sequence to work around ARM
 	  erratum 824069 on Cortex-A53 parts up to r0p2 when it is connected
@@ -389,6 +395,7 @@ config ARM64_ERRATUM_824069
 config ARM64_ERRATUM_819472
 	bool "Cortex-A53: 819472: Store exclusive instructions might cause data corruption"
 	default y
+	select ARM64_WORKAROUND_CLEAN_CACHE
 	help
 	  This option adds an alternative code sequence to work around ARM
 	  erratum 819472 on Cortex-A53 parts up to r0p1 with an L2 cache
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 557d838f829c..6e33bee39934 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -150,6 +150,7 @@ struct midr_range {
 	.rv_max = MIDR_CPU_VAR_REV(v_max, r_max),	\
 }
 
+#define MIDR_REV_RANGE(m, v, r_min, r_max) MIDR_RANGE(m, v, r_min, v, r_max)
 #define MIDR_ALL_VERSIONS(m) MIDR_RANGE(m, 0, 0, 0xf, 0xf)
 
 static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min,
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 216126da5bb4..a9c772e85272 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -643,24 +643,28 @@ static const struct midr_range arm64_harden_el2_vectors[] = {
 
 #endif
 
-const struct arm64_cpu_capabilities arm64_errata[] = {
+#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
+static const struct midr_range workaround_clean_cache[] = {
 #if	defined(CONFIG_ARM64_ERRATUM_826319) || \
	defined(CONFIG_ARM64_ERRATUM_827319) || \
	defined(CONFIG_ARM64_ERRATUM_824069)
-	{
-	/* Cortex-A53 r0p[012] */
-		.desc = "ARM errata 826319, 827319, 824069",
-		.capability = ARM64_WORKAROUND_CLEAN_CACHE,
-		ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 2),
-		.cpu_enable = cpu_enable_cache_maint_trap,
-	},
+	/* Cortex-A53 r0p[012]: ARM errata 826319, 827319, 824069 */
+	MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 2),
+#endif
+#ifdef	CONFIG_ARM64_ERRATUM_819472
+	/* Cortex-A53 r0p[01] : ARM errata 819472 */
+	MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 1),
 #endif
-#ifdef CONFIG_ARM64_ERRATUM_819472
+	{},
+};
+#endif
+
+const struct arm64_cpu_capabilities arm64_errata[] = {
+#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
 	{
-	/* Cortex-A53 r0p[01] */
-		.desc = "ARM errata 819472",
+		.desc = "ARM errata 826319, 827319, 824069, 819472",
 		.capability = ARM64_WORKAROUND_CLEAN_CACHE,
-		ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A53, 0, 0, 1),
+		ERRATA_MIDR_RANGE_LIST(workaround_clean_cache),
 		.cpu_enable = cpu_enable_cache_maint_trap,
 	},
 #endif
From: Suzuki K Poulose <suzuki.poulose@arm.com>
mainline inclusion
from mainline-v5.0-rc1
commit f58cdf7e3cab33306efd999c23b4fb606184abf3
category: bugfix
bugzilla: 46773
CVE: NA
References: https://gitee.com/src-openeuler/kernel/issues/I235Y8
---------------------------
Merge duplicate entries for a single capability using the midr range list for Cavium errata 30115 and 27456.
Cc: Andrew Pinski <apinski@cavium.com>
Cc: David Daney <david.daney@cavium.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Tested-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/include/asm/cputype.h |  1 +
 arch/arm64/kernel/cpu_errata.c   | 50 +++++++++++++++-----------------
 2 files changed, 25 insertions(+), 26 deletions(-)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 6e33bee39934..ac266b64a75c 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -151,6 +151,7 @@ struct midr_range {
 }
 
 #define MIDR_REV_RANGE(m, v, r_min, r_max) MIDR_RANGE(m, v, r_min, v, r_max)
+#define MIDR_REV(m, v, r) MIDR_RANGE(m, v, r, v, r)
 #define MIDR_ALL_VERSIONS(m) MIDR_RANGE(m, 0, 0, 0xf, 0xf)
 
 static inline bool midr_is_cpu_model_range(u32 midr, u32 model, u32 rv_min,
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index a9c772e85272..a2984afcf9f6 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -643,6 +643,28 @@ static const struct midr_range arm64_harden_el2_vectors[] = {
 
 #endif
 
+#ifdef CONFIG_CAVIUM_ERRATUM_27456
+static const struct midr_range cavium_erratum_27456_cpus[] = {
+	/* Cavium ThunderX, T88 pass 1.x - 2.1 */
+	MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
+	/* Cavium ThunderX, T81 pass 1.0 */
+	MIDR_REV(MIDR_THUNDERX_81XX, 0, 0),
+	{},
+};
+#endif
+
+#ifdef CONFIG_CAVIUM_ERRATUM_30115
+static const struct midr_range cavium_erratum_30115_cpus[] = {
+	/* Cavium ThunderX, T88 pass 1.x - 2.2 */
+	MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 2),
+	/* Cavium ThunderX, T81 pass 1.0 - 1.2 */
+	MIDR_REV_RANGE(MIDR_THUNDERX_81XX, 0, 0, 2),
+	/* Cavium ThunderX, T83 pass 1.0 */
+	MIDR_REV(MIDR_THUNDERX_83XX, 0, 0),
+	{},
+};
+#endif
+
 #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
 static const struct midr_range workaround_clean_cache[] = {
 #if	defined(CONFIG_ARM64_ERRATUM_826319) || \
@@ -715,40 +737,16 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 #endif
 #ifdef CONFIG_CAVIUM_ERRATUM_27456
 	{
-	/* Cavium ThunderX, T88 pass 1.x - 2.1 */
 		.desc = "Cavium erratum 27456",
 		.capability = ARM64_WORKAROUND_CAVIUM_27456,
-		ERRATA_MIDR_RANGE(MIDR_THUNDERX,
-				  0, 0,
-				  1, 1),
-	},
-	{
-	/* Cavium ThunderX, T81 pass 1.0 */
-		.desc = "Cavium erratum 27456",
-		.capability = ARM64_WORKAROUND_CAVIUM_27456,
-		ERRATA_MIDR_REV(MIDR_THUNDERX_81XX, 0, 0),
+		ERRATA_MIDR_RANGE_LIST(cavium_erratum_27456_cpus),
 	},
 #endif
 #ifdef CONFIG_CAVIUM_ERRATUM_30115
 	{
-	/* Cavium ThunderX, T88 pass 1.x - 2.2 */
-		.desc = "Cavium erratum 30115",
-		.capability = ARM64_WORKAROUND_CAVIUM_30115,
-		ERRATA_MIDR_RANGE(MIDR_THUNDERX,
-				  0, 0,
-				  1, 2),
-	},
-	{
-	/* Cavium ThunderX, T81 pass 1.0 - 1.2 */
-		.desc = "Cavium erratum 30115",
-		.capability = ARM64_WORKAROUND_CAVIUM_30115,
-		ERRATA_MIDR_REV_RANGE(MIDR_THUNDERX_81XX, 0, 0, 2),
-	},
-	{
-	/* Cavium ThunderX, T83 pass 1.0 */
 		.desc = "Cavium erratum 30115",
 		.capability = ARM64_WORKAROUND_CAVIUM_30115,
-		ERRATA_MIDR_REV(MIDR_THUNDERX_83XX, 0, 0),
+		ERRATA_MIDR_RANGE_LIST(cavium_erratum_30115_cpus),
 	},
 #endif
 	{
From: Suzuki K Poulose <suzuki.poulose@arm.com>
mainline inclusion
from mainline-v5.0-rc1
commit a3dcea2c85129716f323d504b087a04200687242
category: bugfix
bugzilla: 46773
CVE: NA
References: https://gitee.com/src-openeuler/kernel/issues/I235Y8
---------------------------
Remove duplicate entries for Qualcomm erratum 1003. Since the entries are not purely based on generic MIDR checks, use the multi_cap_entry type to merge the entries.
Cc: Christopher Covington <cov@codeaurora.org>
Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Vladimir Murzin <vladimir.murzin@arm.com>
Tested-by: Vladimir Murzin <vladimir.murzin@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Conflicts:
	arch/arm64/kernel/cpu_errata.c
[Zheng Zengkai: use cpucap_multi_entry_cap_matches instead of
 multi_entry_cap_matches, since the following commit is skipped:
 arm64: cpufeature: Rework ptr auth hwcaps using multi_entry_cap_matches]
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/kernel/cpu_errata.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index a2984afcf9f6..ec7bb83fabfb 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -665,6 +665,19 @@ static const struct midr_range cavium_erratum_30115_cpus[] = {
 };
 #endif
 
+#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
+static const struct arm64_cpu_capabilities qcom_erratum_1003_list[] = {
+	{
+		ERRATA_MIDR_REV(MIDR_QCOM_FALKOR_V1, 0, 0),
+	},
+	{
+		.midr_range.model = MIDR_QCOM_KRYO,
+		.matches = is_kryo_midr,
+	},
+	{},
+};
+#endif
+
 #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
 static const struct midr_range workaround_clean_cache[] = {
 #if	defined(CONFIG_ARM64_ERRATUM_826319) || \
@@ -765,16 +778,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003
 	{
-		.desc = "Qualcomm Technologies Falkor erratum 1003",
+		.desc = "Qualcomm Technologies Falkor/Kryo erratum 1003",
 		.capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003,
-		ERRATA_MIDR_REV(MIDR_QCOM_FALKOR_V1, 0, 0),
-	},
-	{
-		.desc = "Qualcomm Technologies Kryo erratum 1003",
-		.capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003,
-		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
-		.midr_range.model = MIDR_QCOM_KRYO,
-		.matches = is_kryo_midr,
+		.matches = cpucap_multi_entry_cap_matches,
+		.match_list = qcom_erratum_1003_list,
 	},
 #endif
 #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009
From: Tang Yizhou <tangyizhou@huawei.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------

v2: Fix a compilation warning.

1. Set errno on failure in sp_free().

2. Remove the redundant deassociation of uva and kva in sp_unshare_kva(). One reason is that this makes the vmalloc + k2u + unshare-kva sequence legal.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 3d0266490613..9487918731b5 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -895,6 +895,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	atomic_set(&spa->use_count, 1);
 	spa->type = type;
 	spa->mm = NULL;
+	spa->kva = 0;	/* NULL pointer */
 
 	if (spa_inc_usage(type, size, (flags & SP_DVPP))) {
 		err = ERR_PTR(-EINVAL);
@@ -1105,6 +1106,7 @@ int sp_free(unsigned long addr)
 	}
 
 	if (spa->type != SPA_TYPE_ALLOC) {
+		ret = -EINVAL;
 		if (printk_ratelimit())
 			pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n",
 			       (void *)addr);
@@ -2062,7 +2064,6 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size)
 	unsigned long step;
 	bool is_hugepage = true;
 	int ret;
-	struct vm_struct *area;
 
 	ret = is_vmap_hugepage(kva);
 	if (ret > 0) {
@@ -2098,11 +2099,6 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size)
 			       (void *)addr);
 	}
 
-	/* deassociate vma and spa */
-	area = find_vm_area((void *)kva_aligned);
-	if (area)
-		area->flags &= ~VM_SHAREPOOL;
-
 	vunmap((void *)kva_aligned);
 
 	return 0;
From: Weilong Chen <chenweilong@huawei.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
The ascend memory features depend on the hugepage feature, but the dependency is not expressed in Kconfig, so the build fails when CONFIG_HUGETLBFS is turned off.

Add the missing dependencies to the config entries.
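Besides the Kconfig dependencies, the hugetlb.h hunk adds a !CONFIG_HUGETLB_PAGE stub so that hugetlb_insert_hugepage_pte_by_pa() callers keep compiling. The general config-gated-stub pattern, as a self-contained sketch (CONFIG_DEMO and demo_insert() are invented names, not the real interfaces):

  #include <stdio.h>

  /* normally set by Kconfig; comment out to exercise the stub */
  #define CONFIG_DEMO 1

  #ifdef CONFIG_DEMO
  static inline int demo_insert(unsigned long addr)
  {
  	printf("real implementation: insert at %#lx\n", addr);
  	return 0;
  }
  #else
  /* stub keeps callers compiling when the feature is configured out */
  static inline int demo_insert(unsigned long addr)
  {
  	(void)addr;
  	return 0;
  }
  #endif

  int main(void)
  {
  	return demo_insert(0x200000UL);
  }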
Acked-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig      | 3 +++
 drivers/char/Kconfig    | 2 +-
 include/linux/hugetlb.h | 7 +++++++
 3 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c22d1d187f16..cfde721a5961 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1523,6 +1523,7 @@ config ASCEND_IOPF_HIPRI
 
 config ASCEND_CHARGE_MIGRATE_HUGEPAGES
 	bool "Enable support for migrate hugepages"
+	depends on HUGETLBFS
 	default y
 	help
 	  When reseved hugepages are used up, we attempts to apply for migrate
@@ -1544,6 +1545,7 @@ config ASCEND_WATCHDOG_SYSFS_CONFIGURE
 
 config ASCEND_AUTO_TUNING_HUGEPAGE
 	bool "Enable support for the auto-tuning hugepage"
+	depends on HUGETLBFS
 	default y
 	help
 	  The hugepage auto-tuning means the kernel dynamically manages the number of
@@ -1554,6 +1556,7 @@ config ASCEND_SHARE_POOL
 	default n
 	select ARCH_USES_HIGH_VMA_FLAGS
 	select MM_OWNER
+	depends on HUGETLBFS
 	help
 	  This feature allows multiple processes to share virtual memory both
 	  in kernel and user level, which is only enabled for ascend platform.
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 0c25d37ca1be..60e05b27b2a3 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -554,7 +554,7 @@ config ADI
 
 config HISI_SVM
 	bool "Hisilicon svm driver"
-	depends on ARM64 && ARM_SMMU_V3 && MMU_NOTIFIER
+	depends on ARM64 && ARM_SMMU_V3 && MMU_NOTIFIER && HUGETLBFS
 	default y
 	help
 	  This driver provides character-level access to Hisilicon
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index debd4603991e..5c0dabc8e4e8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -636,6 +636,13 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 					pte_t *ptep, pte_t pte, unsigned long sz)
 {
 }
+
+static inline int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm,
+						    unsigned long vir_addr,
+						    pgprot_t prot, unsigned long phy_addr)
+{
+	return 0;
+}
 #endif /* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
From: Zhou Guanghui <zhouguanghui1@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
In sp_remap_kva_to_vma(), if sp_mmap() itself fails, the caller does not need to free the memory area of the current process. But if an operation after a successful sp_mmap() fails, the mapping must be rolled back.
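The rule being applied is the usual undo-on-partial-failure ownership rule: a failed map call leaves nothing to clean up, while any failure after a successful map must unmap. A runnable user-space analogue of the shape (mmap/munmap stand in for sp_mmap()/do_munmap(); populate_step() is an invented stand-in for the remap steps):

  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  static int populate_step(void *p, size_t len)	/* invented: may fail */
  {
  	memset(p, 0, len);
  	return 0;	/* return -1 here to exercise the rollback */
  }

  static void *map_and_populate(size_t len)
  {
  	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
  		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  	if (p == MAP_FAILED)
  		return NULL;		/* nothing mapped: no rollback */

  	if (populate_step(p, len)) {
  		munmap(p, len);		/* we mapped it, we undo it */
  		return NULL;
  	}
  	return p;			/* success: caller owns the mapping */
  }

  int main(void)
  {
  	void *p = map_and_populate(4096);

  	printf("%s\n", p ? "mapped and populated" : "failed, rolled back");
  	return 0;
  }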
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 9487918731b5..40875b13acf5 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -1435,7 +1435,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 	ret_addr = sp_mmap(mm, file, spa, &populate);
 	if (IS_ERR_VALUE(ret_addr)) {
 		pr_err("share pool: k2u mmap failed %lx\n", ret_addr);
-		goto out;
+		goto put_mm;
 	}
 	BUG_ON(ret_addr != spa->va_start);
 
@@ -1446,9 +1446,10 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 	if (is_vm_hugetlb_page(vma)) {
 		ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0);
 		if (ret) {
+			do_munmap(mm, ret_addr, spa_size(spa), NULL);
 			pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret);
 			ret_addr = ret;
-			goto out;
+			goto put_mm;
 		}
 		vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 	} else {
@@ -1459,9 +1460,10 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 			ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE,
 					      __pgprot(vma->vm_page_prot.pgprot));
 			if (ret) {
+				do_munmap(mm, ret_addr, spa_size(spa), NULL);
 				pr_err("share pool: remap_pfn_range failed, ret %d\n", ret);
 				ret_addr = ret;
-				goto out;
+				goto put_mm;
 			}
 			offset += PAGE_SIZE;
 			buf += PAGE_SIZE;
@@ -1469,7 +1471,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 		} while (offset < spa_size(spa));
 	}
 
-out:
+put_mm:
 	up_write(&mm->mmap_sem);
 	mmput(mm);
 put_file:
@@ -1501,7 +1503,6 @@ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa,
 	ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm);
 	if (IS_ERR_VALUE(ret_addr)) {
 		pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr);
-		sp_munmap(tsk->mm, spa->va_start, spa_size(spa));
 		p = ERR_PTR(ret_addr);
 		goto out;
 	}
@@ -1509,10 +1510,12 @@ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa,
 	p = (void *)ret_addr;
 
 	task_lock(tsk);
-	if (tsk->mm == NULL)
+	if (tsk->mm == NULL) {
+		sp_munmap(tsk->mm, spa->va_start, spa_size(spa));
 		p = ERR_PTR(-ESRCH);
-	else
+	} else {
 		spa->mm = tsk->mm;
+	}
 	task_unlock(tsk);
 out:
 	put_task_struct(tsk);
@@ -1532,8 +1535,7 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa,
 	ret_addr = sp_remap_kva_to_vma(kva, spa, mm);
 	if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) {
 		pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr);
-		__sp_free(spg, spa->va_start, spa_size(spa),
-			  list_next_entry(mm, sp_node));
+		__sp_free(spg, spa->va_start, spa_size(spa), mm);
 		p = ERR_PTR(ret_addr);
 		goto out;
 	}
From: Zhou Guanghui <zhouguanghui1@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------

When memory is insufficient or fragmentation is severe, a 2MB hugepage allocation performs direct reclaim and compaction.

Direct reclaim and compaction may take a long time. As a result, the sp mutex is held for so long that it triggers hung-task warnings. To avoid this, set the PF_MEMALLOC flag so that direct reclaim and compaction are not executed.

With direct compaction no longer allowed during hugepage allocation, a 2MB hugepage request may fail to be satisfied.

During sp alloc, if the 2MB hugepage cannot be allocated or the total free memory is less than 1/3 of total memory, a work item is queued to compact memory asynchronously.

During sp free, if the total free memory is less than 1/3 of total memory, a compaction work item is queued as well.
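The "less than 1/3 of total memory" test in the hunk below avoids a division by checking 3*free > total, computed as free + (free << 1). A small self-contained demo of the equivalence (the page counts are arbitrary):

  #include <assert.h>
  #include <stdio.h>

  /* mirrors the kernel check: nonzero when free > total/3,
   * i.e. memory is still ample and no compaction is queued */
  static int free_above_one_third(unsigned long freeram, unsigned long totalram)
  {
  	return (freeram + (freeram << 1)) > totalram;	/* 3*free > total */
  }

  int main(void)
  {
  	unsigned long total = 12UL << 20;	/* arbitrary: 12M pages */

  	assert(free_above_one_third(5UL << 20, total));	 /* 5M > 4M */
  	assert(!free_above_one_third(3UL << 20, total)); /* 3M < 4M */
  	printf("3*free > total  <=>  free > total/3\n");
  	return 0;
  }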
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 40875b13acf5..04c522cf95d4 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/rmap.h>
 #include <linux/hugetlb.h>
+#include <linux/compaction.h>
 
 /* access control mode macros */
 #define AC_NONE			0
@@ -1036,6 +1037,45 @@ void sp_area_drop(struct vm_area_struct *vma)
 	spin_unlock(&sp_area_lock);
 }
 
+static unsigned long last_jiffies;
+static void sp_compact_nodes(struct work_struct *work)
+{
+	sysctl_compaction_handler(NULL, 1, NULL, NULL, NULL);
+
+	kfree(work);
+}
+
+static void sp_add_work_compact(void)
+{
+	struct work_struct *compact_work;
+
+	if (!time_after(jiffies, last_jiffies + 10 * HZ))
+		return;
+
+	compact_work = kzalloc(sizeof(*compact_work), GFP_KERNEL);
+	if (!compact_work)
+		return;
+
+	last_jiffies = jiffies;
+	INIT_WORK(compact_work, sp_compact_nodes);
+	schedule_work(compact_work);
+}
+
+static void sp_try_to_compact(void)
+{
+	unsigned long totalram;
+	unsigned long freeram;
+
+	totalram = totalram_pages;
+	freeram = global_zone_page_state(NR_FREE_PAGES);
+
+	/* free < total / 3 */
+	if ((freeram + (freeram << 1)) > totalram)
+		return;
+
+	sp_add_work_compact();
+}
+
 /* The caller must hold sp_mutex. */
 static void sp_munmap(struct mm_struct *mm, unsigned long addr,
 		      unsigned long size)
@@ -1143,6 +1183,7 @@ int sp_free(unsigned long addr)
 out:
 	mutex_unlock(&sp_mutex);
 
+	sp_try_to_compact();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(sp_free);
@@ -1194,6 +1235,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 	int ret = 0;
 	struct mm_struct *tmp;
 	unsigned long mode, offset;
+	unsigned int noreclaim_flag;
 
 	/* mdc scene hack */
 	if (enable_mdc_default_group)
@@ -1306,6 +1348,21 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		/* clean PTE_RDONLY flags or trigger SMMU event */
 		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
 		up_write(&mm->mmap_sem);
+
+		/*
+		 * The direct reclaim and compact may take a long
+		 * time. As a result, sp mutex will be hold for too
+		 * long time to casue the hung task problem. In this
+		 * case, set the PF_MEMALLOC flag to prevent the
+		 * direct reclaim and compact from being executed.
+		 * Since direct reclaim and compact are not performed
+		 * when the fragmentation is severe or the memory is
+		 * insufficient, 2MB continuous physical pages fail
+		 * to be allocated. This situation is allowed.
+		 */
+		if (spa->is_hugepage)
+			noreclaim_flag = memalloc_noreclaim_save();
+
 		/*
 		 * We are not ignoring errors, so if we fail to allocate
 		 * physical memory we just return failure, so we won't encounter
@@ -1313,6 +1370,11 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		 * depends on this feature (and MAP_LOCKED) to work correctly.
 		 */
 		ret = do_mm_populate(mm, sp_addr, populate, 0);
+		if (spa->is_hugepage) {
+			memalloc_noreclaim_restore(noreclaim_flag);
+			if (ret)
+				sp_add_work_compact();
+		}
 		if (ret) {
 			__sp_free(spg, sp_addr, size_aligned,
 				  list_next_entry(mm, sp_node));
@@ -1363,6 +1425,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		__sp_area_drop(spa);
 
 	sp_dump_stack();
+	sp_try_to_compact();
 	return p;
 }
 EXPORT_SYMBOL_GPL(sp_alloc);
From: Dave Kleikamp <dave.kleikamp@oracle.com>
mainline inclusion
from mainline-v5.11
commit c61b3e4839007668360ed8b87d7da96d2e59fc6c
category: bugfix
bugzilla: NA
CVE: CVE-2020-27815
--------------------------------
Bounds checking tools can flag a bug in dbAdjTree() for an array index out of bounds in dmt_stree. Since dmt_stree can refer to the stree in both structures dmaptree and dmapctl, use the larger array to eliminate the false positive.
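The underlying C pattern: when a macro aliases the smaller of two overlapping union members, indexing past that member trips bounds checkers even though the union's storage is large enough. A self-contained illustration (the types and names are invented; only the shape mirrors dmtree/dmt_stree):

  #include <stdio.h>

  struct small {			/* stands in for struct dmaptree */
  	int height;
  	int stree[8];
  };

  struct big {			/* stands in for struct dmapctl */
  	int height;
  	int stree[32];
  };

  union tree {
  	struct small t1;
  	struct big t2;
  };

  /* #define tree_stree t1.stree  -- index 20 looks out of bounds here */
  #define tree_stree t2.stree	/* larger member: same storage, no flag */

  int main(void)
  {
  	union tree t = { 0 };

  	t.tree_stree[20] = 42;	/* legal: the union is as big as struct big */
  	printf("%d\n", t.tree_stree[20]);
  	return 0;
  }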
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reported-by: butt3rflyh4ck <butterflyhuangxx@gmail.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/jfs/jfs_dmap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 562b9a7e4311..f502a15c6c98 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -196,7 +196,7 @@ typedef union dmtree {
 #define	dmt_leafidx	t1.leafidx
 #define	dmt_height	t1.height
 #define	dmt_budmin	t1.budmin
-#define	dmt_stree	t1.stree
+#define	dmt_stree	t2.stree
 
 /*
  *	on-disk aggregate disk allocation map descriptor.
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
mainline inclusion
from mainline-v5.10-rc1
commit f5810e5c329238b8553ebd98b914bdbefd8e6737
category: bugfix
bugzilla: NA
CVE: NA
-------------------------------------------------
For arches that do not select CONFIG_GENERIC_IOMAP, the current pci_iounmap() function does nothing, causing obvious memory leaks for mapped regions that are backed by MMIO physical space.

In order to detect if a mapped pointer is IO vs MMIO, a check must be made available to the pci_iounmap() function so that it can actually detect whether the pointer has to be unmapped.

In configurations where CONFIG_HAS_IOPORT_MAP && !CONFIG_GENERIC_IOMAP, a mapped port is detected using an ioport_map() stub defined in asm-generic/io.h.

Use the same logic to implement a stub (i.e. __pci_ioport_unmap()) that detects whether the passed-in pointer in pci_iounmap() is IO vs MMIO so that it can iounmap() conditionally, and call it in pci_iounmap(), fixing the issue.
Leave __pci_ioport_unmap() as a NOP for all other config options.
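The detection is a plain address-window test: a cookie inside [PCI_IOBASE, PCI_IOBASE + IO_SPACE_LIMIT) is a remapped I/O port and must not be iounmap()ed; anything else is MMIO. A user-space sketch of the same classification (the window constants are made up for the demo):

  #include <stdint.h>
  #include <stdio.h>

  /* made-up stand-ins for PCI_IOBASE and IO_SPACE_LIMIT */
  #define DEMO_IOBASE		0xffff0000UL
  #define DEMO_IO_SPACE_LIMIT	0x00010000UL

  /* mirrors __pci_ioport_unmap(): only "unmap" pointers outside the window */
  static void demo_unmap(uintptr_t addr)
  {
  	uintptr_t start = DEMO_IOBASE;

  	if (addr >= start && addr < start + DEMO_IO_SPACE_LIMIT) {
  		printf("%#lx: port mapping, skip iounmap\n", (unsigned long)addr);
  		return;
  	}
  	printf("%#lx: MMIO mapping, iounmap\n", (unsigned long)addr);
  }

  int main(void)
  {
  	demo_unmap(DEMO_IOBASE + 0x100);	/* inside the I/O window */
  	demo_unmap(0x1234000UL);		/* outside: real MMIO */
  	return 0;
  }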
Tested-by: George Cherian <george.cherian@marvell.com>
Link: https://lore.kernel.org/lkml/20200905024811.74701-1-yangyingliang@huawei.com
Link: https://lore.kernel.org/lkml/20200824132046.3114383-1-george.cherian@marvell...
Link: https://lore.kernel.org/r/a9daf8d8444d0ebd00bc6d64e336ec49dbb50784.160025414...
Reported-by: George Cherian <george.cherian@marvell.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: George Cherian <george.cherian@marvell.com>
Cc: Will Deacon <will@kernel.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/asm-generic/io.h | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 303871651f8a..cc946f38182d 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -894,18 +894,6 @@ static inline void iowrite64_rep(volatile void __iomem *addr,
 #include <linux/vmalloc.h>
 #define __io_virt(x) ((void __force *)(x))
 
-#ifndef CONFIG_GENERIC_IOMAP
-struct pci_dev;
-extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
-
-#ifndef pci_iounmap
-#define pci_iounmap pci_iounmap
-static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
-{
-}
-#endif
-#endif /* CONFIG_GENERIC_IOMAP */
-
 /*
  * Change virtual addresses to physical addresses and vv.
  * These are pretty trivial
@@ -1029,6 +1017,16 @@ static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
 	port &= IO_SPACE_LIMIT;
 	return (port > MMIO_UPPER_LIMIT) ? NULL : PCI_IOBASE + port;
 }
+#define __pci_ioport_unmap __pci_ioport_unmap
+static inline void __pci_ioport_unmap(void __iomem *p)
+{
+	uintptr_t start = (uintptr_t) PCI_IOBASE;
+	uintptr_t addr = (uintptr_t) p;
+
+	if (addr >= start && addr < start + IO_SPACE_LIMIT)
+		return;
+	iounmap(p);
+}
 #endif
 
 #ifndef ioport_unmap
@@ -1043,6 +1041,23 @@ extern void ioport_unmap(void __iomem *p);
 #endif /* CONFIG_GENERIC_IOMAP */
 #endif /* CONFIG_HAS_IOPORT_MAP */
 
+#ifndef CONFIG_GENERIC_IOMAP
+struct pci_dev;
+extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+
+#ifndef __pci_ioport_unmap
+static inline void __pci_ioport_unmap(void __iomem *p) {}
+#endif
+
+#ifndef pci_iounmap
+#define pci_iounmap pci_iounmap
+static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
+{
+	__pci_ioport_unmap(p);
+}
+#endif
+#endif /* CONFIG_GENERIC_IOMAP */
+
 /*
  * Convert a virtual cached pointer to an uncached pointer
  */
From: Xu Qiang <xuqiang36@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------------------------
Add a workaround binding in the device tree to initialize the ts core GICR.
Signed-off-by: Xu Qiang <xuqiang36@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/irqchip/irq-gic-v3.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 7bf14acdcd28..6bb787ba1764 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -922,12 +922,18 @@ static struct workaround_oem_info gicr_wkrd_info[] = {
 	}
 };
 
-static void gic_check_hisi_workaround(void)
+static void gic_check_hisi_workaround(struct fwnode_handle *handle)
 {
 	struct acpi_table_header *tbl;
 	acpi_status status = AE_OK;
+	struct device_node *node = to_of_node(handle);
 	int i;
 
+	if ((node != NULL) && of_property_read_bool(node, "enable-init-all-gicr")) {
+		its_enable_init_all_gicr();
+		return;
+	}
+
 	status = acpi_get_table(ACPI_SIG_MADT, 0, &tbl);
 	if (ACPI_FAILURE(status) || !tbl)
 		return;
@@ -1088,11 +1094,11 @@ static void gic_cpu_init_others(void)
 	}
 }
 #else
-static inline void gic_check_hisi_workaround(void) {}
+#define gic_check_hisi_workaround(x)
 
-static inline void gic_compute_nr_gicr(void) {}
+#define gic_compute_nr_gicr()
 
-static inline void gic_cpu_init_others(void) {}
+#define gic_cpu_init_others()
 #endif
 
 #ifdef CONFIG_SMP
@@ -1549,7 +1555,7 @@ static int __init gic_init_bases(void __iomem *dist_base,
 	gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
 	gic_data.rdists.has_vlpis = true;
 	gic_data.rdists.has_direct_lpi = true;
-	gic_check_hisi_workaround();
+	gic_check_hisi_workaround(handle);
 	gic_compute_nr_gicr();
 
 	if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
From: Xu Qiang <xuqiang36@huawei.com>
ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------------------------
Add a description of the enable-init-all-gicr property to the binding document.
Signed-off-by: Xu Qiang <xuqiang36@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 .../devicetree/bindings/interrupt-controller/arm,gic-v3.txt | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
index 3ea78c4ef887..9f4fe47d9d54 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.txt
@@ -71,6 +71,10 @@ Optional
   region containing only the {SET,CLR}SPI registers to be used if
   isolation is required, and if supported by the HW.
 
+- enable-init-all-gicr: Boolean property. Identifies kernel initializes
+  message interrupt functionality for other GICR not managed by this
+  operating system.
+
 Sub-nodes:
 
 PPI affinity can be expressed as a single "ppi-partitions" node,
From: Takashi Iwai <tiwai@suse.de>
mainline inclusion
from mainline-v5.7-rc1
commit a900cc5cd846edc6964736834e098e88d7ecfcd6
category: bugfix
bugzilla: 32178
CVE: NA
-------------------------------------------------
Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf().
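A small self-contained demo of the difference; cap_snprintf() is a user-space stand-in for the kernel's scnprintf(), which returns the number of bytes actually stored:

  #include <stdarg.h>
  #include <stdio.h>

  /* stand-in for the kernel's scnprintf(): returns bytes actually written */
  static int cap_snprintf(char *buf, size_t size, const char *fmt, ...)
  {
  	va_list args;
  	int i;

  	va_start(args, fmt);
  	i = vsnprintf(buf, size, fmt, args);
  	va_end(args);
  	if (i >= (int)size)
  		i = size ? (int)size - 1 : 0;
  	return i;
  }

  int main(void)
  {
  	char buf[8];

  	/* snprintf reports the would-be size: 11, past the 8-byte buffer */
  	printf("snprintf:  %d\n", snprintf(buf, sizeof(buf), "hello world"));
  	/* the capped variant reports what fits: 7 (plus the '\0') */
  	printf("scnprintf: %d\n", cap_snprintf(buf, sizeof(buf), "hello world"));
  	return 0;
  }

Accumulating `written` with the capped return value keeps the offset of each succeeding call inside the buffer, which is what the show_ata_dev_id()/show_ata_dev_gscr() loops below rely on.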
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Feng Yubo <fengyubo3@huawei.com>
Reviewed-by: Tao Hou <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/ata/libata-transport.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c
index a0b0b4d986f2..c3f446fc24c4 100644
--- a/drivers/ata/libata-transport.c
+++ b/drivers/ata/libata-transport.c
@@ -208,7 +208,7 @@ show_ata_port_##name(struct device *dev,			\
 {								\
 	struct ata_port *ap = transport_class_to_port(dev);	\
 								\
-	return snprintf(buf, 20, format_string, cast ap->field);	\
+	return scnprintf(buf, 20, format_string, cast ap->field);	\
 }
 
 #define ata_port_simple_attr(field, name, format_string, type)	\
@@ -479,7 +479,7 @@ show_ata_dev_##field(struct device *dev,			\
 {								\
 	struct ata_device *ata_dev = transport_class_to_dev(dev);	\
 								\
-	return snprintf(buf, 20, format_string, cast ata_dev->field);	\
+	return scnprintf(buf, 20, format_string, cast ata_dev->field);	\
 }
 
 #define ata_dev_simple_attr(field, format_string, type)	\
@@ -533,7 +533,7 @@ show_ata_dev_id(struct device *dev,
 	if (ata_dev->class == ATA_DEV_PMP)
 		return 0;
 	for(i=0;i<ATA_ID_WORDS;i++)  {
-		written += snprintf(buf+written, 20, "%04x%c",
+		written += scnprintf(buf+written, 20, "%04x%c",
 				    ata_dev->id[i],
 				    ((i+1) & 7) ? ' ' : '\n');
 	}
@@ -552,7 +552,7 @@ show_ata_dev_gscr(struct device *dev,
 	if (ata_dev->class != ATA_DEV_PMP)
 		return 0;
 	for(i=0;i<SATA_PMP_GSCR_DWORDS;i++)  {
-		written += snprintf(buf+written, 20, "%08x%c",
+		written += scnprintf(buf+written, 20, "%08x%c",
 				    ata_dev->gscr[i],
 				    ((i+1) & 3) ? ' ' : '\n');
 	}
@@ -581,7 +581,7 @@ show_ata_dev_trim(struct device *dev,
 	else
 		mode = "unqueued";
 
-	return snprintf(buf, 20, "%s\n", mode);
+	return scnprintf(buf, 20, "%s\n", mode);
 }
 
 static DEVICE_ATTR(trim, S_IRUGO, show_ata_dev_trim, NULL);