[PATCH 01/14] blk-cgroup: prevent rcu_sched detected stalls warnings in blkg_destroy_all()

Yang Yingliang

28 Nov 2020 28 Nov '20

9:27 a.m.

From: Yu Kuai <yukuai3@huawei.com> hulk inclusion category: bugfix bugzilla: 46357 CVE: NA --------------------------- test procedures: a. create 20000 cgroups, and echo "8:0 10000" to blkio.throttle.write_bps_device b. echo 1 > /sys/blocd/sda/device/delete test result: rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [5/1143] rcu: 0-...0: (0 ticks this GP) idle=0f2/1/0x4000000000000000 softirq=15507/15507 fq rcu: (detected by 6, t=60012 jiffies, g=119977, q=27153) NMI backtrace for cpu 0 CPU: 0 PID: 443 Comm: bash Not tainted 4.19.95-00061-g0bcc83b30eec #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-p4 RIP: 0010:blk_throtl_update_limit_valid.isra.0+0x116/0x2a0 Code: 01 00 00 e8 7c dd 74 ff 48 83 bb 78 01 00 00 00 0f 85 54 01 00 00 48 8d bb 88 01 1 RSP: 0018:ffff8881030bf9f0 EFLAGS: 00000046 RAX: 0000000000000000 RBX: ffff8880b4f37080 RCX: ffffffff95da0afe RDX: dffffc0000000000 RSI: ffff888100373980 RDI: ffff8880b4f37208 RBP: ffff888100deca00 R08: ffffffff9528f951 R09: 0000000000000001 R10: ffffed10159dbf56 R11: ffff8880acedfab3 R12: ffff8880b9fda498 R13: ffff8880b9fda4f4 R14: 0000000000000050 R15: ffffffff98b622c0 FS: 00007feb51c51700(0000) GS:ffff888106200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000561619547080 CR3: 0000000102bc9000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: throtl_pd_offline+0x98/0x100 blkg_destroy+0x133/0x630 ? blkcg_deactivate_policy+0x2c0/0x2c0 ? lock_timer_base+0x65/0x110 blkg_destroy_all+0x7f/0x100 blkcg_exit_queue+0x3f/0xa5 blk_exit_queue+0x69/0xa0 blk_cleanup_queue+0x226/0x360 __scsi_remove_device+0xb4/0x3c0 scsi_remove_device+0x38/0x60 sdev_store_delete+0x74/0x100 ? dev_driver_string+0xb0/0xb0 dev_attr_store+0x41/0x70 sysfs_kf_write+0x89/0xc0 kernfs_fop_write+0x1b6/0x2e0 ? sysfs_kf_bin_read+0x130/0x130 __vfs_write+0xca/0x420 ? kernel_read+0xc0/0xc0 ? __alloc_fd+0x16f/0x2d0 ? __fd_install+0x95/0x1a0 ? handle_mm_fault+0x3e0/0x560 vfs_write+0x11a/0x2f0 ksys_write+0xb9/0x1e0 ? __x64_sys_read+0x60/0x60 ? kasan_check_write+0x20/0x30 ? filp_close+0xb5/0xf0 __x64_sys_write+0x46/0x60 do_syscall_64+0xd9/0x1f0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The usage of so many blkg is very rare, however, such problem do exist in theory. In order to avoid such warnings, release 'q->queue_lock' for a while when a batch of blkg were destroyed. Signed-off-by: Yu Kuai <yukuai3@huawei.com> Reviewed-by: Tao Hou <houtao1@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- block/blk-cgroup.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e592167449aa..c64f0afa27dc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -364,16 +364,31 @@ static void blkg_destroy(struct blkcg_gq *blkg) */ static void blkg_destroy_all(struct request_queue *q) { +#define BLKG_DESTROY_BATCH 4096 struct blkcg_gq *blkg, *n; + int count; lockdep_assert_held(q->queue_lock); +again: + count = BLKG_DESTROY_BATCH; list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); + /* + * If the list is too long, the loop can took a long time, + * thus relese the lock for a while when a batch of blkg + * were destroyed. + */ + if (!--count) { + spin_unlock_irq(q->queue_lock); + cond_resched(); + spin_lock_irq(q->queue_lock); + goto again; + } } q->root_blkg = NULL; -- 2.25.1

Show replies by date

Yang Yingliang

28 Nov 28 Nov

9:27 a.m.

New subject: [PATCH 02/14] RDMA/hns: Disable UD on HIP08

From: Weihang Li <liweihang@huawei.com> driver inclusion category: bugfix bugzilla: NA CVE: NA ---------------------------------- Unreliable Datagram is not fully supported on HIP08, so this feature should be disabled. Reviewed-by: Zhao Weibo <zhaoweibo3@huawei.com> Reviewed-by: Hu Chunzhi <huchunzhi@huawei.com> Signed-off-by: Yang Shunfeng <yangshunfeng2@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Signed-off-by: Weihang Li <liweihang@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/infiniband/hw/hns/hns_roce_ah.c | 23 ++------------------- drivers/infiniband/hw/hns/hns_roce_device.h | 1 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 23 +++++++-------------- drivers/infiniband/hw/hns/hns_roce_main.c | 6 ------ drivers/infiniband/hw/hns/hns_roce_qp.c | 5 ----- include/uapi/rdma/hns-abi.h | 7 ------- 6 files changed, 10 insertions(+), 55 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index da07f5221bd6..82e64d193cd6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -32,7 +32,6 @@ #include "roce_k_compat.h" #include <linux/platform_device.h> -#include <rdma/hns-abi.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "hns_roce_device.h" @@ -45,13 +44,11 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) { - struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device); - struct hns_roce_ib_create_ah_resp resp = {}; - struct device *dev = hr_dev->dev; #ifdef CONFIG_KERNEL_419 const struct ib_gid_attr *gid_attr; - int ret = 0; #else + struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device); + struct device *dev = hr_dev->dev; struct ib_gid_attr gid_attr; union ib_gid sgid; int ret; @@ -62,8 +59,6 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); u8 vlan_en = 0; - rdfx_func_cnt(hr_dev, RDFX_FUNC_CREATE_AH); - ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); @@ -121,8 +116,6 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, ah->av.gid_index = grh->sgid_index; ah->av.vlan = vlan_tag; ah->av.vlan_en = vlan_en; - dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index, - ah->av.vlan); if (rdma_ah_get_static_rate(ah_attr)) ah->av.stat_rate = IB_RATE_10_GBPS; @@ -134,18 +127,6 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, ah->av.hop_limit = grh->hop_limit; } - if (udata) { - memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN); - resp.vlan = vlan_tag; - resp.vlan_en = vlan_en; - ret = ib_copy_to_udata(udata, &resp, - min(udata->outlen, sizeof(resp))); - if (ret) { - kfree(ah); - return ERR_PTR(ret); - } - } - return &ah->ibah; } diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2be69a6c9646..533df3c68d2f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -236,7 +236,6 @@ enum { HNS_ROCE_CAP_FLAG_FRMR = BIT(8), HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL = BIT(9), HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), - HNS_ROCE_CAP_FLAG_UD = BIT(11), }; enum hns_roce_mtt_type { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e7efd37b7734..253263b68cf1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -352,8 +352,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (unlikely(ibqp->qp_type != IB_QPT_RC && ibqp->qp_type != IB_QPT_UC && - ibqp->qp_type != IB_QPT_GSI && - ibqp->qp_type != IB_QPT_UD) && + ibqp->qp_type != IB_QPT_GSI) && (ibqp->qp_type != IB_QPT_XRC_INI) && (ibqp->qp_type != IB_QPT_XRC_TGT)) { dev_err(dev, "Not supported QP type, type-0x%x, qpn-0x%x!\n", @@ -408,7 +407,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, tmp_len = 0; /* Corresponding to the QP type, wqe process separately */ - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) { + if (ibqp->qp_type == IB_QPT_GSI) { ah = to_hr_ah(ud_wr(wr)->ah); ud_sq_wqe = wqe; memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe)); @@ -1877,8 +1876,7 @@ static void set_default_caps(struct hns_roce_dev *hr_dev) if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_B) { caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW | HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR | - HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL | - HNS_ROCE_CAP_FLAG_UD; + HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL; caps->num_qpc_timer = HNS_ROCE_V2_MAX_QPC_TIMER_NUM; caps->qpc_timer_entry_sz = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ; @@ -3896,7 +3894,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, V2_QPC_BYTE_4_TST_S, 0); - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) + if (ibqp->qp_type == IB_QPT_GSI) roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, @@ -4456,8 +4454,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, - (((ibqp->qp_type == IB_QPT_GSI) || - ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_GSI || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? hr_dev->caps.wqe_sge_hop_num : 0); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, @@ -4680,8 +4677,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); - context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI || - ibqp->qp_type == IB_QPT_UD) || + context->sq_cur_sge_blk_addr = (ibqp->qp_type == IB_QPT_GSI || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? (cpu_to_le32(sge_cur_blk >> @@ -4689,8 +4685,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - ((ibqp->qp_type == IB_QPT_GSI || - ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_GSI || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? (sge_cur_blk >> (32 + PAGE_ADDR_SHIFT)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; @@ -5430,9 +5425,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, unsigned long flags; int ret = 0; - if ((hr_qp->ibqp.qp_type == IB_QPT_RC || - hr_qp->ibqp.qp_type == IB_QPT_UD) && - hr_qp->state != IB_QPS_RESET) { + if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state != IB_QPS_RESET) { /* Modify qp to reset before destroying qp */ ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET); diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 74f05e82a526..9ed050f790a4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -992,12 +992,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) (1ULL << IB_USER_VERBS_CMD_CLOSE_XRCD); } - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_UD) { - ib_dev->uverbs_cmd_mask |= - (1ULL << IB_USER_VERBS_CMD_CREATE_AH) | - (1ULL << IB_USER_VERBS_CMD_DESTROY_AH); - } - #ifdef CONFIG_NEW_KERNEL ib_dev->driver_id = RDMA_DRIVER_HNS; #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index eb78c4de0be2..b6017ab10ab6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -465,10 +465,6 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * (hr_qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE)); - if (hr_qp->ibqp.qp_type == IB_QPT_UD) - hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * - hr_qp->sq.max_gs); - if ((hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) && (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A)) { if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { @@ -1181,7 +1177,6 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd, if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_XRC)) return ERR_PTR(-EINVAL); init_attr->recv_cq = init_attr->send_cq; - case IB_QPT_UD: case IB_QPT_UC: case IB_QPT_RC: { hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL); diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h index 55e70168a1ad..ce264f68066e 100644 --- a/include/uapi/rdma/hns-abi.h +++ b/include/uapi/rdma/hns-abi.h @@ -36,13 +36,6 @@ #include <linux/types.h> -struct hns_roce_ib_create_ah_resp { - __u8 dmac[6]; - __u16 vlan; - __u8 vlan_en; - __u8 reserved[7]; -}; - struct hns_roce_ib_create_cq { __aligned_u64 buf_addr; __aligned_u64 db_addr; -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 03/14] scsi: flip the default on use_clustering

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA Most SCSI drivers want to enable "clustering", that is merging of segments so that they might span more than a single page. Remove the ENABLE_CLUSTERING define, and require drivers to explicitly set DISABLE_CLUSTERING to disable this feature. Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_v1_hw.c | 1 - drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 1 - drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index 031091e31774..8266b388e167 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -1779,7 +1779,6 @@ static struct scsi_host_template sht_v1_hw = { .this_id = -1, .sg_tablesize = HISI_SAS_SGE_PAGE_CNT, .max_sectors = SCSI_DEFAULT_MAX_SECTORS, - .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 6568e200e4b7..206a13f53aa5 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -3591,7 +3591,6 @@ static struct scsi_host_template sht_v2_hw = { .this_id = -1, .sg_tablesize = HISI_SAS_SGE_PAGE_CNT, .max_sectors = SCSI_DEFAULT_MAX_SECTORS, - .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 5b808562e068..78fab27b0983 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3258,7 +3258,6 @@ static struct scsi_host_template sht_v3_hw = { .sg_tablesize = HISI_SAS_SGE_PAGE_CNT, .sg_prot_tablesize = HISI_SAS_SGE_PAGE_CNT, .max_sectors = SCSI_DEFAULT_MAX_SECTORS, - .use_clustering = ENABLE_CLUSTERING, .eh_device_reset_handler = sas_eh_device_reset_handler, .eh_target_reset_handler = sas_eh_target_reset_handler, .target_destroy = sas_target_destroy, -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 04/14] scsi: hisi_sas: Reduce HISI_SAS_SGE_PAGE_CNT in size

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA Macro HISI_SAS_SGE_PAGE_CNT is defined to SG_CHUNK_SIZE, which is 128. This means that sizeof(struct hisi_sas_slot_buf_table) is 4192. This is just over a 4K, which can mean inefficient DMA memory usage (for no PI). Reduce the size of HISI_SAS_SGE_PAGE_CNT to 124 to fit in a 4K page. With this change, we experience no performance hit. Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 840a506ebb7e..012445ee5dc5 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -508,12 +508,12 @@ struct hisi_sas_command_table_stp { u8 atapi_cdb[ATAPI_CDB_LEN]; }; -#define HISI_SAS_SGE_PAGE_CNT SG_CHUNK_SIZE +#define HISI_SAS_SGE_PAGE_CNT (124) struct hisi_sas_sge_page { struct hisi_sas_sge sge[HISI_SAS_SGE_PAGE_CNT]; } __aligned(16); -#define HISI_SAS_SGE_DIF_PAGE_CNT SG_CHUNK_SIZE +#define HISI_SAS_SGE_DIF_PAGE_CNT HISI_SAS_SGE_PAGE_CNT struct hisi_sas_sge_dif_page { struct hisi_sas_sge sge[HISI_SAS_SGE_DIF_PAGE_CNT]; } __aligned(16); -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 05/14] scsi: hisi_sas: Make slot buf minimum allocation of PAGE_SIZE

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA For a system with PAGE_SIZE of 16K or 64K, the size every time we want to alloc may be small like 4K, but for function dmam_alloc_coherent(), the least size it allocates is PAGE_SIZE, so it will waste much memory for the situation. To solve the issue, limit the minimum allocation size of slot buf to PAGE_SIZE. Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index ff5330bcbb91..396eb230ffc1 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -2415,7 +2415,7 @@ int hisi_sas_alloc(struct hisi_hba *hisi_hba) else sz_slot_buf_ru = roundup(sizeof( struct hisi_sas_slot_dif_buf_table), 64); - s = lcm(max_command_entries_ru, sz_slot_buf_ru); + s = max(lcm(max_command_entries_ru, sz_slot_buf_ru), PAGE_SIZE); blk_cnt = (max_command_entries_ru * sz_slot_buf_ru) / s; slots_per_blk = s / sz_slot_buf_ru; for (i = 0; i < blk_cnt; i++) { -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 06/14] scsi: hisi_sas: Update all the registers after suspend and resume

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA After suspend and resume, the HW registers will be set back to their initial value. We use init_reg_v3_hw() to set some registers, but some registers are set via firmware in ACPI "_RST" method, so add reset handler before init_reg_v3_hw(). Link: https://lore.kernel.org/r/1567774537-20003-7-git-send-email-john.garry@huawe... Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 78fab27b0983..dcac0b97140d 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3605,15 +3605,21 @@ static int hisi_sas_v3_resume(struct pci_dev *pdev) pci_enable_wake(pdev, PCI_D0, 0); pci_restore_state(pdev); rc = pci_enable_device(pdev); - if (rc) + if (rc) { dev_err(dev, "enable device failed during resume (%d)\n", rc); + return rc; + } pci_set_master(pdev); scsi_unblock_requests(shost); clear_bit(HISI_SAS_REJECT_CMD_BIT, &hisi_hba->flags); sas_prep_resume_ha(sha); - init_reg_v3_hw(hisi_hba); + rc = hw_init_v3_hw(hisi_hba); + if (rc) { + scsi_remove_host(shost); + pci_disable_device(pdev); + } hisi_hba->hw->phys_init(hisi_hba); sas_resume_ha(sha); clear_bit(HISI_SAS_RESET_BIT, &hisi_hba->flags); -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 07/14] scsi: hisi_sas: Fix the conflict between device gone and host reset

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA When device gone, it will check whether it is during reset, if not, it will send internal task abort. Before internal task abort returned, reset begins, and it will check whether SAS_PHY_UNUSED is set, if not, it will call hisi_sas_init_device(), but at that time domain_device may already be freed or part of it is freed, so it may referenece null pointer in hisi_sas_init_device(). It may occur as follows: thread0 thread1 hisi_sas_dev_gone() check whether in RESET(no) internal task abort reset prep soft_reset ... (part of reset_done) internal task abort failed release resource anyway clear_itct device->lldd_dev=NULL hisi_sas_reset_init_all_device check sas_dev->dev_type is SAS_PHY_UNUSED and !device set dev_type SAS_PHY_UNUSED sas_free_device hisi_sas_init_device ... Semaphore hisi_hba.sema is used to sync the processes of device gone and host reset. To solve the issue, expand the scope that semaphore protects and let them never occur together. And also some places will check whether domain_device is NULL to judge whether the device is gone. So when device gone, need to clear sas_dev->sas_device. Link: https://lore.kernel.org/r/1567774537-20003-14-git-send-email-john.garry@huaw... Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 396eb230ffc1..154df004b8a0 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1069,6 +1069,7 @@ static void hisi_sas_dev_gone(struct domain_device *device) dev_info(dev, "dev[%d:%x] is gone\n", sas_dev->device_id, sas_dev->dev_type); + down(&hisi_hba->sem); if (!test_bit(HISI_SAS_RESET_BIT, &hisi_hba->flags)) { rc = hisi_sas_internal_task_abort(hisi_hba, device, HISI_SAS_INT_ABT_DEV, 0); @@ -1079,15 +1080,15 @@ static void hisi_sas_dev_gone(struct domain_device *device) dev_info(dev, "dev gone: release remain resources anyway.\n"); } - down(&hisi_hba->sem); hisi_hba->hw->clear_itct(hisi_hba, sas_dev); - up(&hisi_hba->sem); device->lldd_dev = NULL; } if (hisi_hba->hw->free_device) hisi_hba->hw->free_device(sas_dev); sas_dev->dev_type = SAS_PHY_UNUSED; + sas_dev->sas_device = NULL; + up(&hisi_hba->sem); if (rc == -EIO) { dev_err(dev, "internal abort timeout for dev gone.\n"); queue_work(hisi_hba->wq, &hisi_hba->rst_work); @@ -1586,11 +1587,11 @@ void hisi_sas_controller_reset_done(struct hisi_hba *hisi_hba) msleep(1000); hisi_sas_refresh_port_id(hisi_hba); clear_bit(HISI_SAS_REJECT_CMD_BIT, &hisi_hba->flags); - up(&hisi_hba->sem); if (hisi_hba->reject_stp_links_msk) hisi_sas_terminate_stp_reject(hisi_hba); hisi_sas_reset_init_all_devices(hisi_hba); + up(&hisi_hba->sem); scsi_unblock_requests(shost); clear_bit(HISI_SAS_RESET_BIT, &hisi_hba->flags); -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 08/14] scsi: hisi_sas: use wait_for_completion_timeout() when clearing ITCT

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA When injecting 2bit ecc errors, it will cause confusion inside SAS controller which needs host reset to recover it. If a device is gone at the same times inject 2bit ecc errors, we may not receive the ITCT interrupt so it will wait for completion in clear_itct_v3_hw() all the time. And host reset will also not occur because it can't require hisi_hba->sem, so the system will be suspended. To solve the issue, use wait_for_completion_timeout() instead of wait_for_completion(), and also don't mark the gone device as SAS_PHY_UNUSED when device gone. Link: https://lore.kernel.org/r/1571926105-74636-4-git-send-email-john.garry@huawe... Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas.h | 4 +++- drivers/scsi/hisi_sas/hisi_sas_main.c | 15 ++++++++++----- drivers/scsi/hisi_sas/hisi_sas_v1_hw.c | 4 +++- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 12 ++++++++++-- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 11 +++++++++-- 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index 012445ee5dc5..80cd8870477b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -77,6 +77,8 @@ #define HISI_SAS_SATA_PROTOCOL_FPDMA 0x8 #define HISI_SAS_SATA_PROTOCOL_ATAPI 0x10 +#define CLEAR_ITCT_TIMEOUT 20 + struct hisi_hba; enum { @@ -298,7 +300,7 @@ struct hisi_sas_hw { void (*phy_set_linkrate)(struct hisi_hba *hisi_hba, int phy_no, struct sas_phy_linkrates *linkrates); enum sas_linkrate (*phy_get_max_linkrate)(void); - void (*clear_itct)(struct hisi_hba *hisi_hba, + int (*clear_itct)(struct hisi_hba *hisi_hba, struct hisi_sas_device *dev); void (*free_device)(struct hisi_sas_device *sas_dev); int (*get_wideport_bitmap)(struct hisi_hba *hisi_hba, int port_id); diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 154df004b8a0..3df5d590aab8 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1064,14 +1064,15 @@ static void hisi_sas_dev_gone(struct domain_device *device) struct hisi_sas_device *sas_dev = device->lldd_dev; struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); struct device *dev = hisi_hba->dev; - int rc = 0; + int rc0 = 0; + int rc1 = 0; dev_info(dev, "dev[%d:%x] is gone\n", sas_dev->device_id, sas_dev->dev_type); down(&hisi_hba->sem); if (!test_bit(HISI_SAS_RESET_BIT, &hisi_hba->flags)) { - rc = hisi_sas_internal_task_abort(hisi_hba, device, + rc0 = hisi_sas_internal_task_abort(hisi_hba, device, HISI_SAS_INT_ABT_DEV, 0); hisi_sas_dereg_device(hisi_hba, device); @@ -1080,16 +1081,20 @@ static void hisi_sas_dev_gone(struct domain_device *device) dev_info(dev, "dev gone: release remain resources anyway.\n"); } - hisi_hba->hw->clear_itct(hisi_hba, sas_dev); + rc1 = hisi_hba->hw->clear_itct(hisi_hba, sas_dev); device->lldd_dev = NULL; } if (hisi_hba->hw->free_device) hisi_hba->hw->free_device(sas_dev); - sas_dev->dev_type = SAS_PHY_UNUSED; + + /* Don't mark it as SAS_PHY_UNUSED if failed to clear ITCT */ + if (!rc1) + sas_dev->dev_type = SAS_PHY_UNUSED; sas_dev->sas_device = NULL; up(&hisi_hba->sem); - if (rc == -EIO) { + + if (rc0 == -EIO) { dev_err(dev, "internal abort timeout for dev gone.\n"); queue_work(hisi_hba->wq, &hisi_hba->rst_work); } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index 8266b388e167..d57e70a6968a 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -537,7 +537,7 @@ static void setup_itct_v1_hw(struct hisi_hba *hisi_hba, (0xff00ULL << ITCT_HDR_REJ_OPEN_TL_OFF)); } -static void clear_itct_v1_hw(struct hisi_hba *hisi_hba, +static int clear_itct_v1_hw(struct hisi_hba *hisi_hba, struct hisi_sas_device *sas_dev) { u64 dev_id = sas_dev->device_id; @@ -557,6 +557,8 @@ static void clear_itct_v1_hw(struct hisi_hba *hisi_hba, qw0 = cpu_to_le64(itct->qw0); qw0 &= ~ITCT_HDR_VALID_MSK; itct->qw0 = cpu_to_le64(qw0); + + return 0; } static int reset_hw_v1_hw(struct hisi_hba *hisi_hba) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index 206a13f53aa5..e50fef647a63 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -978,13 +978,14 @@ static void setup_itct_v2_hw(struct hisi_hba *hisi_hba, (0x1ULL << ITCT_HDR_RTOLT_OFF)); } -static void clear_itct_v2_hw(struct hisi_hba *hisi_hba, +static int clear_itct_v2_hw(struct hisi_hba *hisi_hba, struct hisi_sas_device *sas_dev) { DECLARE_COMPLETION_ONSTACK(completion); u64 dev_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[dev_id]; u32 reg_val = hisi_sas_read32(hisi_hba, ENT_INT_SRC3); + struct device *dev = hisi_hba->dev; int i; sas_dev->completion = &completion; @@ -994,13 +995,20 @@ static void clear_itct_v2_hw(struct hisi_hba *hisi_hba, hisi_sas_write32(hisi_hba, ENT_INT_SRC3, ENT_INT_SRC3_ITC_INT_MSK); + /* need to set register twice to clear ITCT for v2 hw */ for (i = 0; i < 2; i++) { reg_val = ITCT_CLR_EN_MSK | (dev_id & ITCT_DEV_MSK); hisi_sas_write32(hisi_hba, ITCT_CLR, reg_val); - wait_for_completion(sas_dev->completion); + if (!wait_for_completion_timeout(sas_dev->completion, + CLEAR_ITCT_TIMEOUT * HZ)) { + dev_warn(dev, "failed to clear ITCT\n"); + return -ETIMEDOUT; + } memset(itct, 0, sizeof(struct hisi_sas_itct)); } + + return 0; } static void free_device_v2_hw(struct hisi_sas_device *sas_dev) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index dcac0b97140d..c3b57d0a6f24 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -856,13 +856,14 @@ static void setup_itct_v3_hw(struct hisi_hba *hisi_hba, (0x1ULL << ITCT_HDR_RTOLT_OFF)); } -static void clear_itct_v3_hw(struct hisi_hba *hisi_hba, +static int clear_itct_v3_hw(struct hisi_hba *hisi_hba, struct hisi_sas_device *sas_dev) { DECLARE_COMPLETION_ONSTACK(completion); u64 dev_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[dev_id]; u32 reg_val = hisi_sas_read32(hisi_hba, ENT_INT_SRC3); + struct device *dev = hisi_hba->dev; sas_dev->completion = &completion; @@ -875,8 +876,14 @@ static void clear_itct_v3_hw(struct hisi_hba *hisi_hba, reg_val = ITCT_CLR_EN_MSK | (dev_id & ITCT_DEV_MSK); hisi_sas_write32(hisi_hba, ITCT_CLR, reg_val); - wait_for_completion(sas_dev->completion); + if (!wait_for_completion_timeout(sas_dev->completion, + CLEAR_ITCT_TIMEOUT * HZ)) { + dev_warn(dev, "failed to clear ITCT\n"); + return -ETIMEDOUT; + } + memset(itct, 0, sizeof(struct hisi_sas_itct)); + return 0; } static void dereg_device_v3_hw(struct hisi_hba *hisi_hba, -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 09/14] scsi: hisi_sas: Directly trigger SCSI error handling for completion errors

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA Abort failed commands in completion path. This avoids having to wait for block layer timeouts and triggering the SCSI error handling thread. Link: https://lore.kernel.org/r/1594627471-235395-2-git-send-email-john.garry@huaw... Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_v1_hw.c | 4 +++- drivers/scsi/hisi_sas/hisi_sas_v2_hw.c | 4 +++- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index d57e70a6968a..746f0aca5e3b 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -1265,8 +1265,10 @@ static int slot_complete_v1_hw(struct hisi_hba *hisi_hba, !(cmplt_hdr_data & CMPLT_HDR_RSPNS_XFRD_MSK)) { slot_err_v1_hw(hisi_hba, task, slot); - if (unlikely(slot->abort)) + if (unlikely(slot->abort)) { + sas_task_abort(task); return ts->stat; + } goto out; } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c index e50fef647a63..703a8a81176a 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c @@ -2414,8 +2414,10 @@ slot_complete_v2_hw(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot) error_info[0], error_info[1], error_info[2], error_info[3]); - if (unlikely(slot->abort)) + if (unlikely(slot->abort)) { + sas_task_abort(task); return ts->stat; + } goto out; } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index c3b57d0a6f24..96856fb5d0fa 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2415,8 +2415,10 @@ slot_complete_v3_hw(struct hisi_hba *hisi_hba, struct hisi_sas_slot *slot) dev_info(dev, "data underflow without sense, rsp_code:0x%x, rc:%d.\n", iu->resp_data[0], rc); } - if (unlikely(slot->abort)) + if (unlikely(slot->abort)) { + sas_task_abort(task); return ts->stat; + } goto out; } -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 10/14] scsi: hisi_sas: mask corresponding RAS interrupts for hilink DFX exception

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA BIOS has already masked correspondig RAS interrupts since hilink changed exception type from NFE to DFX, so do SAS driver. Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 96856fb5d0fa..791194cd120e 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -752,7 +752,7 @@ static void init_reg_v3_hw(struct hisi_hba *hisi_hba) upper_32_bits(hisi_hba->initial_fis_dma)); /* RAS registers init */ - hisi_sas_write32(hisi_hba, SAS_RAS_INTR0_MASK, 0x0); + hisi_sas_write32(hisi_hba, SAS_RAS_INTR0_MASK, 0xf80000); hisi_sas_write32(hisi_hba, SAS_RAS_INTR1_MASK, 0x0); hisi_sas_write32(hisi_hba, SAS_RAS_INTR2_MASK, 0x0); hisi_sas_write32(hisi_hba, CFG_SAS_RAS_INTR_MASK, 0x0); -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 11/14] scsi: hisi_sas: fix logic bug when alloc device with MAX device num == 1

From: Wang Chao <wangchao342@hisilicon.com> driver inclusion category: bugfix bugzilla: NA CVE: NA We find that when set HISI_SAS_MAX_ITCT_ENTRIES as 1, we don't evenallocate one device. This does not comply with the code logic designed by us. The log is as follows: [ 3.565847] hsi_sas_v3_hw 0000:74:02.0: Adding to iommu group 0 [ 3.582037] scsi host0: hisi_sas_v3_hw [ 4.794270] hisi_sas_v3_hw 0000:74:02.0: Enable MSI auo-affinity [ 4.872986] hisi_sas_v3_hw 0000:74:02.0: phyup: phy5 link_rate=11 [ 4.879057] hisi_sas_v3_hw 0000:74:02.0: phyup: phy0 link_rate=11 [ 4.879117] sas: phy-0:5 added to port-0:0, phy_mask:0x20 (500e004aaaaaaa1f) [ 4.885131] hisi_sas_v3_hw 0000:74:02.0: phyup: phy1 link_rate=11 [ 4.885145] sas: DOING DISCOVERY on port 0, pid:910 [ 4.891199] hisi_sas_v3_hw 0000:74:02.0: phyup: phy2 link_rate=11 [ 4.891203] hisi_sas_v3_hw 0000:74:02.0: phyup: phy3 link_rate=11 [ 4.891209] hisi_sas_v3_hw 0000:74:02.0: phyup: phy4 link_rate=11 [ 4.897510] hisi_sas_v3_hw 0000:74:02.0: fail alloc dev: max support 1 devices [ 4.903335] hisi_sas_v3_hw 0000:74:02.0: phyup: phy6 link_rate=11 [ 4.903340] hisi_sas_v3_hw 0000:74:02.0: phyup: phy7 link_rate=11 [ 4.909402] sas: driver on host 0000:74:02.0 cannot handle device 500e004aaaaaaa1f, error:-22 [ 4.937404] sas: DONE DISCOVERY on port 0, pid:910, result:-22 [ 4.937409] sas: broadcast received: 0 [ 4.937410] sas: phy0 matched wide port0 [ 4.937414] sas: phy-0:0 added to port-0:0, phy_mask:0x21 (500e004aaaaaaa1f) [ 4.937431] sas: DOING DISCOVERY on port 0, pid:910 [ 4.937581] hisi_sas_v3_hw 0000:74:02.0: fail alloc dev: max support 1 devices [ 4.944774] sas: driver on host 0000:74:02.0 cannot handle device 500e004aaaaaaa1f, error:-22 [ 4.953395] sas: DONE DISCOVERY on port 0, pid:910, result:-22 We find that this extreme case is not considered when implementing hisi_sas_alloc_dev(). Actually, the device ID is allocated from 1 instead of 0. As a result, if the value of HISI_SAS_MAX_ITCT_ENTRIES is 1, the driver considers that the number of devices has reached the upper limit and return sas_dev with NULL. Signed-off-by: Wang Chao <wangchao342@hisilicon.com> Reviewed-by: Zhu Xiongxiong <zhuxiongxiong@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- drivers/scsi/hisi_sas/hisi_sas_main.c | 19 ++++++++++--------- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 3df5d590aab8..124aa26fa4b8 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -685,30 +685,31 @@ static struct hisi_sas_device *hisi_sas_alloc_dev(struct domain_device *device) struct hisi_hba *hisi_hba = dev_to_hisi_hba(device); struct hisi_sas_device *sas_dev = NULL; unsigned long flags; - int last = hisi_hba->last_dev_id; int first = (hisi_hba->last_dev_id + 1) % HISI_SAS_MAX_DEVICES; + int dev_id; int i; spin_lock_irqsave(&hisi_hba->lock, flags); - for (i = first; i != last; i %= HISI_SAS_MAX_DEVICES) { - if (hisi_hba->devices[i].dev_type == SAS_PHY_UNUSED) { - int queue = i % hisi_hba->queue_count; + for (i = first; i < first + HISI_SAS_MAX_DEVICES; i++) { + dev_id = i % HISI_SAS_MAX_DEVICES; + if (hisi_hba->devices[dev_id].dev_type == SAS_PHY_UNUSED) { + int queue = dev_id % hisi_hba->queue_count; struct hisi_sas_dq *dq = &hisi_hba->dq[queue]; - hisi_hba->devices[i].device_id = i; - sas_dev = &hisi_hba->devices[i]; + hisi_hba->devices[dev_id].device_id = dev_id; + sas_dev = &hisi_hba->devices[dev_id]; sas_dev->dev_status = HISI_SAS_DEV_INIT; sas_dev->dev_type = device->dev_type; sas_dev->hisi_hba = hisi_hba; sas_dev->sas_device = device; sas_dev->dq = dq; spin_lock_init(&sas_dev->lock); - INIT_LIST_HEAD(&hisi_hba->devices[i].list); + INIT_LIST_HEAD(&hisi_hba->devices[dev_id].list); break; } - i++; } - hisi_hba->last_dev_id = i; + if (sas_dev) + hisi_hba->last_dev_id = i; spin_unlock_irqrestore(&hisi_hba->lock, flags); return sas_dev; diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 791194cd120e..3cf0255dccf1 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3399,6 +3399,7 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_out_ha; } + hisi_hba->last_dev_id = -1; phy_nr = port_nr = hisi_hba->n_phy; arr_phy = devm_kcalloc(dev, phy_nr, sizeof(void *), GFP_KERNEL); -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 12/14] blk-throttle: don't check whether or not lower limit is valid if CONFIG_BLK_DEV_THROTTLING_LOW is off

From: Yu Kuai <yukuai3@huawei.com> hulk inclusion category: bugfix bugzilla: 46357 CVE: NA --------------------------- blk_throtl_update_limit_valid() will search for descendants to see if 'LIMIT_LOW' of bps/iops and READ/WRITE is nonzero. However, they're always zero if CONFIG_BLK_DEV_THROTTLING_LOW is not set, furthermore, a lot of time will be wasted to iterate descendants. Thus do nothing in blk_throtl_update_limit_valid() in such situation. Signed-off-by: Yu Kuai <yukuai3@huawei.com> Reviewed-by: Tao Hou <houtao1@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- block/blk-throttle.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index caee658609d7..c7b4f905feb5 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -568,6 +568,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -588,6 +589,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 13/14] powerpc/rtas: Restrict RTAS requests from userspace

From: Andrew Donnellan <ajd@linux.ibm.com> stable inclusion from linux-4.19.155 commit 94e8f0bbc475228c93d28b2e0f7e37303db80ffe CVE: CVE-2020-27777 -------------------------------- commit bd59380c5ba4147dcbaad3e582b55ccfd120b764 upstream. A number of userspace utilities depend on making calls to RTAS to retrieve information and update various things. The existing API through which we expose RTAS to userspace exposes more RTAS functionality than we actually need, through the sys_rtas syscall, which allows root (or anyone with CAP_SYS_ADMIN) to make any RTAS call they want with arbitrary arguments. Many RTAS calls take the address of a buffer as an argument, and it's up to the caller to specify the physical address of the buffer as an argument. We allocate a buffer (the "RMO buffer") in the Real Memory Area that RTAS can access, and then expose the physical address and size of this buffer in /proc/powerpc/rtas/rmo_buffer. Userspace is expected to read this address, poke at the buffer using /dev/mem, and pass an address in the RMO buffer to the RTAS call. However, there's nothing stopping the caller from specifying whatever address they want in the RTAS call, and it's easy to construct a series of RTAS calls that can overwrite arbitrary bytes (even without /dev/mem access). Additionally, there are some RTAS calls that do potentially dangerous things and for which there are no legitimate userspace use cases. In the past, this would not have been a particularly big deal as it was assumed that root could modify all system state freely, but with Secure Boot and lockdown we need to care about this. We can't fundamentally change the ABI at this point, however we can address this by implementing a filter that checks RTAS calls against a list of permitted calls and forces the caller to use addresses within the RMO buffer. The list is based off the list of calls that are used by the librtas userspace library, and has been tested with a number of existing userspace RTAS utilities. For compatibility with any applications we are not aware of that require other calls, the filter can be turned off at build time. Cc: stable@vger.kernel.org Reported-by: Daniel Axtens <dja@axtens.net> Signed-off-by: Andrew Donnellan <ajd@linux.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20200820044512.7543-1-ajd@linux.ibm.com Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Jason Yan <yanaijie@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- arch/powerpc/Kconfig | 13 ++++ arch/powerpc/kernel/rtas.c | 153 +++++++++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 51f62414ce7c..3365514becd8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1009,6 +1009,19 @@ config FSL_RIO source "drivers/rapidio/Kconfig" +config PPC_RTAS_FILTER + bool "Enable filtering of RTAS syscalls" + default y + depends on PPC_RTAS + help + The RTAS syscall API has security issues that could be used to + compromise system integrity. This option enforces restrictions on the + RTAS calls and arguments passed by userspace programs to mitigate + these issues. + + Say Y unless you know what you are doing and the filter is causing + problems for you. + endmenu config NONSTATIC_KERNEL diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 95d1264ba795..7e0722b62cae 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1057,6 +1057,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, return NULL; } +#ifdef CONFIG_PPC_RTAS_FILTER + +/* + * The sys_rtas syscall, as originally designed, allows root to pass + * arbitrary physical addresses to RTAS calls. A number of RTAS calls + * can be abused to write to arbitrary memory and do other things that + * are potentially harmful to system integrity, and thus should only + * be used inside the kernel and not exposed to userspace. + * + * All known legitimate users of the sys_rtas syscall will only ever + * pass addresses that fall within the RMO buffer, and use a known + * subset of RTAS calls. + * + * Accordingly, we filter RTAS requests to check that the call is + * permitted, and that provided pointers fall within the RMO buffer. + * The rtas_filters list contains an entry for each permitted call, + * with the indexes of the parameters which are expected to contain + * addresses and sizes of buffers allocated inside the RMO buffer. + */ +struct rtas_filter { + const char *name; + int token; + /* Indexes into the args buffer, -1 if not used */ + int buf_idx1; + int size_idx1; + int buf_idx2; + int size_idx2; + + int fixed_size; +}; + +static struct rtas_filter rtas_filters[] __ro_after_init = { + { "ibm,activate-firmware", -1, -1, -1, -1, -1 }, + { "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */ + { "display-character", -1, -1, -1, -1, -1 }, + { "ibm,display-message", -1, 0, -1, -1, -1 }, + { "ibm,errinjct", -1, 2, -1, -1, -1, 1024 }, + { "ibm,close-errinjct", -1, -1, -1, -1, -1 }, + { "ibm,open-errinct", -1, -1, -1, -1, -1 }, + { "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 }, + { "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 }, + { "ibm,get-indices", -1, 2, 3, -1, -1 }, + { "get-power-level", -1, -1, -1, -1, -1 }, + { "get-sensor-state", -1, -1, -1, -1, -1 }, + { "ibm,get-system-parameter", -1, 1, 2, -1, -1 }, + { "get-time-of-day", -1, -1, -1, -1, -1 }, + { "ibm,get-vpd", -1, 0, -1, 1, 2 }, + { "ibm,lpar-perftools", -1, 2, 3, -1, -1 }, + { "ibm,platform-dump", -1, 4, 5, -1, -1 }, + { "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 }, + { "ibm,scan-log-dump", -1, 0, 1, -1, -1 }, + { "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 }, + { "ibm,set-eeh-option", -1, -1, -1, -1, -1 }, + { "set-indicator", -1, -1, -1, -1, -1 }, + { "set-power-level", -1, -1, -1, -1, -1 }, + { "set-time-for-power-on", -1, -1, -1, -1, -1 }, + { "ibm,set-system-parameter", -1, 1, -1, -1, -1 }, + { "set-time-of-day", -1, -1, -1, -1, -1 }, + { "ibm,suspend-me", -1, -1, -1, -1, -1 }, + { "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 }, + { "ibm,update-properties", -1, 0, -1, -1, -1, 4096 }, + { "ibm,physical-attestation", -1, 0, 1, -1, -1 }, +}; + +static bool in_rmo_buf(u32 base, u32 end) +{ + return base >= rtas_rmo_buf && + base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) && + base <= end && + end >= rtas_rmo_buf && + end < (rtas_rmo_buf + RTAS_RMOBUF_MAX); +} + +static bool block_rtas_call(int token, int nargs, + struct rtas_args *args) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { + struct rtas_filter *f = &rtas_filters[i]; + u32 base, size, end; + + if (token != f->token) + continue; + + if (f->buf_idx1 != -1) { + base = be32_to_cpu(args->args[f->buf_idx1]); + if (f->size_idx1 != -1) + size = be32_to_cpu(args->args[f->size_idx1]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + + end = base + size - 1; + if (!in_rmo_buf(base, end)) + goto err; + } + + if (f->buf_idx2 != -1) { + base = be32_to_cpu(args->args[f->buf_idx2]); + if (f->size_idx2 != -1) + size = be32_to_cpu(args->args[f->size_idx2]); + else if (f->fixed_size) + size = f->fixed_size; + else + size = 1; + end = base + size - 1; + + /* + * Special case for ibm,configure-connector where the + * address can be 0 + */ + if (!strcmp(f->name, "ibm,configure-connector") && + base == 0) + return false; + + if (!in_rmo_buf(base, end)) + goto err; + } + + return false; + } + +err: + pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n"); + pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n", + token, nargs, current->comm); + return true; +} + +#else + +static bool block_rtas_call(int token, int nargs, + struct rtas_args *args) +{ + return false; +} + +#endif /* CONFIG_PPC_RTAS_FILTER */ + /* We assume to be passed big endian arguments */ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) { @@ -1094,6 +1235,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs) args.rets = &args.args[nargs]; memset(args.rets, 0, nret * sizeof(rtas_arg_t)); + if (block_rtas_call(token, nargs, &args)) + return -EINVAL; + /* Need to handle ibm,suspend_me call specially */ if (token == ibm_suspend_me_token) { @@ -1155,6 +1299,9 @@ void __init rtas_initialize(void) unsigned long rtas_region = RTAS_INSTANTIATE_MAX; u32 base, size, entry; int no_base, no_size, no_entry; +#ifdef CONFIG_PPC_RTAS_FILTER + int i; +#endif /* Get RTAS dev node and fill up our "rtas" structure with infos * about it. @@ -1190,6 +1337,12 @@ void __init rtas_initialize(void) #ifdef CONFIG_RTAS_ERROR_LOGGING rtas_last_error_token = rtas_token("rtas-last-error"); #endif + +#ifdef CONFIG_PPC_RTAS_FILTER + for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) { + rtas_filters[i].token = rtas_token(rtas_filters[i].name); + } +#endif } int __init early_init_dt_scan_rtas(unsigned long node, -- 2.25.1

Yang Yingliang

9:27 a.m.

New subject: [PATCH 14/14] refcount_t: Add ACQUIRE ordering on success for dec(sub)_and_test() variants

From: Elena Reshetova <elena.reshetova@intel.com> mainline inclusion from mainline-v5.1-rc1 commit 47b8f3ab9c49daa824af848f9e02889662d8638f category: bugfix bugzilla: NA CVE: NA ------------------------------------------------- This adds an smp_acquire__after_ctrl_dep() barrier on successful decrease of refcounter value from 1 to 0 for refcount_dec(sub)_and_test variants and therefore gives stronger memory ordering guarantees than prior versions of these functions. Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Andrea Parri <andrea.parri@amarulasolutions.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will.deacon@arm.com> Cc: dvyukov@google.com Cc: keescook@chromium.org Cc: stern@rowland.harvard.edu Link: https://lkml.kernel.org/r/1548847131-27854-2-git-send-email-elena.reshetova@... Signed-off-by: Ingo Molnar <mingo@kernel.org> Conflicts: arch/x86/include/asm/refcount.h [yyl: adjust context] Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Reviewed-by: Hanjun Guo <guohanjun@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- Documentation/core-api/refcount-vs-atomic.rst | 24 ++++++++++++++++--- arch/x86/include/asm/refcount.h | 18 ++++++++++++-- lib/refcount.c | 18 ++++++++++---- 3 files changed, 50 insertions(+), 10 deletions(-) diff --git a/Documentation/core-api/refcount-vs-atomic.rst b/Documentation/core-api/refcount-vs-atomic.rst index 322851bada16..976e85adffe8 100644 --- a/Documentation/core-api/refcount-vs-atomic.rst +++ b/Documentation/core-api/refcount-vs-atomic.rst @@ -54,6 +54,13 @@ must propagate to all other CPUs before the release operation (A-cumulative property). This is implemented using :c:func:`smp_store_release`. +An ACQUIRE memory ordering guarantees that all post loads and +stores (all po-later instructions) on the same CPU are +completed after the acquire operation. It also guarantees that all +po-later stores on the same CPU must propagate to all other CPUs +after the acquire operation executes. This is implemented using +:c:func:`smp_acquire__after_ctrl_dep`. + A control dependency (on success) for refcounters guarantees that if a reference for an object was successfully obtained (reference counter increment or addition happened, function returned true), @@ -119,13 +126,24 @@ Memory ordering guarantees changes: result of obtaining pointer to the object! -case 5) - decrement-based RMW ops that return a value ------------------------------------------------------ +case 5) - generic dec/sub decrement-based RMW ops that return a value +--------------------------------------------------------------------- Function changes: * :c:func:`atomic_dec_and_test` --> :c:func:`refcount_dec_and_test` * :c:func:`atomic_sub_and_test` --> :c:func:`refcount_sub_and_test` + +Memory ordering guarantees changes: + + * fully ordered --> RELEASE ordering + ACQUIRE ordering on success + + +case 6) other decrement-based RMW ops that return a value +--------------------------------------------------------- + +Function changes: + * no atomic counterpart --> :c:func:`refcount_dec_if_one` * ``atomic_add_unless(&var, -1, 1)`` --> ``refcount_dec_not_one(&var)`` @@ -136,7 +154,7 @@ Memory ordering guarantees changes: .. note:: :c:func:`atomic_add_unless` only provides full order on success. -case 6) - lock-based RMW +case 7) - lock-based RMW ------------------------ Function changes: diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h index 19b90521954c..88061df7add6 100644 --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -67,14 +67,28 @@ static __always_inline void refcount_dec(refcount_t *r) static __always_inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r) { - GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, + bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO, r->refs.counter, "er", i, "%0", e, "cx"); + + if (ret) { + smp_acquire__after_ctrl_dep(); + return true; + } + + return false; } static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) { - GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, + bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO, r->refs.counter, "%0", e, "cx"); + + if (ret) { + smp_acquire__after_ctrl_dep(); + return true; + } + + return false; } static __always_inline __must_check diff --git a/lib/refcount.c b/lib/refcount.c index ebcf8cd49e05..6e904af0fb3e 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -33,6 +33,9 @@ * Note that the allocator is responsible for ordering things between free() * and alloc(). * + * The decrements dec_and_test() and sub_and_test() also provide acquire + * ordering on success. + * */ #include <linux/mutex.h> @@ -164,8 +167,8 @@ EXPORT_SYMBOL(refcount_inc_checked); * at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done - * before, and provides a control dependency such that free() must come after. - * See the comment on top. + * before, and provides an acquire ordering on success such that free() + * must come after. * * Use of this function is not recommended for the normal reference counting * use case in which references are taken and released one at a time. In these @@ -190,7 +193,12 @@ bool refcount_sub_and_test_checked(unsigned int i, refcount_t *r) } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); - return !new; + if (!new) { + smp_acquire__after_ctrl_dep(); + return true; + } + return false; + } EXPORT_SYMBOL(refcount_sub_and_test_checked); @@ -202,8 +210,8 @@ EXPORT_SYMBOL(refcount_sub_and_test_checked); * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done - * before, and provides a control dependency such that free() must come after. - * See the comment on top. + * before, and provides an acquire ordering on success such that free() + * must come after. * * Return: true if the resulting refcount is 0, false otherwise */ -- 2.25.1

1690

Age (days ago)

1690

Last active (days ago)

List overview

13 comments

1 participants

participants (1)

Yang Yingliang

[PATCH 01/14] blk-cgroup: prevent rcu_sched detected stalls warnings in blkg_destroy_all()

tags

participants (1)