From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: 186851, https://gitee.com/openeuler/kernel/issues/I5L01G
CVE: NA
--------------------------------
Currently, in virtio_scsi, if 'bd->last' is not set to true while dispatching a request, that IO will stay in the driver's queue, and the driver will wait for the block layer to dispatch more requests. However, if the block layer fails to dispatch more requests, it should trigger commit_rqs() to inform the driver.
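To make that contract concrete, here is a small userspace model of the 'last request'/commit_rqs() handshake (illustration only, not kernel code; all names below are invented for the example):

  #include <stdbool.h>
  #include <stdio.h>

  static int queued_not_kicked;	/* requests parked in the driver queue */

  /* Models the driver's queue_rq(): only "kick" the device on the last rq. */
  static void toy_queue_rq(int rq, bool last)
  {
  	(void)rq;
  	queued_not_kicked++;
  	if (last) {
  		printf("kick: issue %d queued request(s) to disk\n", queued_not_kicked);
  		queued_not_kicked = 0;
  	}
  }

  /* Models mq_ops->commit_rqs(): flush requests queued with last == false. */
  static void toy_commit_rqs(void)
  {
  	if (queued_not_kicked) {
  		printf("commit_rqs: issue %d leftover request(s)\n", queued_not_kicked);
  		queued_not_kicked = 0;
  	}
  }

  int main(void)
  {
  	toy_queue_rq(1, false);	/* block layer promised more requests... */
  	toy_commit_rqs();	/* ...but couldn't dispatch them, so it must commit */
  	return 0;
  }

If the block layer neither dispatches a request with last == true nor calls commit_rqs(), the parked request is never issued.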
There is a problem in blk_mq_try_issue_list_directly() where commit_rqs() won't be called:
// assume that queue_depth is set to 1, list contains two rq
blk_mq_try_issue_list_directly
 blk_mq_request_issue_directly
 // dispatch first rq
 // last is false
  __blk_mq_try_issue_directly
   blk_mq_get_dispatch_budget
   // succeed to get first budget
   __blk_mq_issue_directly
    scsi_queue_rq
     cmd->flags |= SCMD_LAST
      virtscsi_queuecommand
       kick = (sc->flags & SCMD_LAST) != 0
       // kick is false, first rq won't issue to disk
 queued++

 blk_mq_request_issue_directly
 // dispatch second rq
  __blk_mq_try_issue_directly
   blk_mq_get_dispatch_budget
   // failed to get second budget
   ret == BLK_STS_RESOURCE
   blk_mq_request_bypass_insert
 // errors is still 0

 if (!list_empty(list) || errors && ...)
 // won't pass, commit_rqs won't be called
In this situation, the first rq relies on the second rq to be dispatched, while the second rq relies on the first rq to complete; thus they will both hang.
Fix the problem by also treating 'BLK_STS_*RESOURCE' as 'errors', since it means that the request was not queued successfully.
The same problem exists in blk_mq_dispatch_rq_list(), where 'BLK_STS_*RESOURCE' can't simply be treated as 'errors'; fix it there by calling commit_rqs() if queue_rq() returns 'BLK_STS_*RESOURCE'.
Fixes: d666ba98f849 ("blk-mq: add mq_ops->commit_rqs()")
Signed-off-by: Yu Kuai yukuai3@huawei.com
Link: https://lore.kernel.org/all/20220726122224.1790882-1-yukuai1@huaweicloud.com...
Reviewed-by: Ming Lei ming.lei@redhat.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-mq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1941ffc4db85..b9827b3d3f63 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1484,7 +1484,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 	/* If we didn't flush the entire list, we could have told the driver
 	 * there was more coming, but that turned out to be a lie.
 	 */
-	if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued)
+	if ((!list_empty(list) || errors || needs_resource ||
+	     ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued)
 		q->mq_ops->commit_rqs(hctx);
 	/*
 	 * Any items that need requeuing? Stuff them into hctx->dispatch,
@@ -2224,6 +2225,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		list_del_init(&rq->queuelist);
 		ret = blk_mq_request_issue_directly(rq, list_empty(list));
 		if (ret != BLK_STS_OK) {
+			errors++;
 			if (ret == BLK_STS_RESOURCE ||
 					ret == BLK_STS_DEV_RESOURCE) {
 				blk_mq_request_bypass_insert(rq, false,
@@ -2231,7 +2233,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				break;
 			}
 			blk_mq_end_request(rq, ret);
-			errors++;
 		} else
 			queued++;
 	}
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: 187345, https://gitee.com/openeuler/kernel/issues/I5L5ZG
CVE: NA
--------------------------------
Otherwise, a null pointer crash can be triggered when a bio is handled in blk_mq_submit_bio() while the queue is not initialized.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/md/dm.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 588e8b43efab..c047d5fcb325 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2149,12 +2149,16 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
 
 	switch (type) {
 	case DM_TYPE_REQUEST_BASED:
-		md->disk->fops = &dm_rq_blk_dops;
 		r = dm_mq_init_request_queue(md, t);
 		if (r) {
 			DMERR("Cannot initialize queue for request-based dm mapped device");
 			return r;
 		}
+		/*
+		 * Change the fops after queue is initialized, so that bio won't
+		 * issued by rq-based path until that.
+		 */
+		md->disk->fops = &dm_rq_blk_dops;
 		break;
 	case DM_TYPE_BIO_BASED:
 	case DM_TYPE_DAX_BIO_BASED:
From: Yu Kuai yukuai3@huawei.com
hulk inclusion
category: bugfix
bugzilla: 187345, https://gitee.com/openeuler/kernel/issues/I5L5ZG
CVE: NA
--------------------------------
Commit d7c2ddc8456d ("block: fix that part scan is disabled in device_add_disk()") introduced a regression:
Test procedures:
  dmsetup create test --notable
  dmsetup remove test

Test result:
  dmsetup gets stuck forever

Root cause:

before:
1) dmsetup create
   add_disk/add_disk_no_queue_reg()
     scan partitions
     uevent
2) blk_register_queue() -> notable will not call this
3) dmsetup remove
   wait for uevent

after:
1) dmsetup create
   add_disk/add_disk_no_queue_reg()
2) blk_register_queue() -> notable will not call this
     scan_partitions
     uevent
3) dmsetup remove
   wait for uevent -> impossible for notable
Fix the problem by moving scan_partitions and the uevent from blk_register_queue() to the end of add_disk/add_disk_no_queue_reg().
Fixes: d7c2ddc8456d ("block: fix that part scan is disabled in device_add_disk()")
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 block/blk-sysfs.c | 45 ---------------------------------------------
 block/genhd.c     | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 45 deletions(-)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 548d758365c6..b809c0bf7686 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -821,38 +821,6 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, };
-static void disk_scan_partitions(struct gendisk *disk) -{ - struct block_device *bdev; - - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; - - set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); -} - -static void disk_init_partition(struct gendisk *disk) -{ - struct device *ddev = disk_to_dev(disk); - struct disk_part_iter piter; - struct hd_struct *part; - - disk_scan_partitions(disk); - - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); -} - /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -942,22 +910,9 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock);
- - /* - * Set the flag at last, so that block devcie can't be opened - * before it's registration is done. - */ - disk->flags |= GENHD_FL_UP; ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); - /* - * Init partitions after releasing 'sysfs_dir_lock', otherwise lockdep - * will be confused because it will treat 'bd_mutex' from different - * devices as the same lock. - */ - if (!ret) - disk_init_partition(disk);
return ret; } diff --git a/block/genhd.c b/block/genhd.c index 021c9c2d7231..70e24c554b31 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -736,6 +736,38 @@ static void register_disk(struct device *parent, struct gendisk *disk, } }
+static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void disk_init_partition(struct gendisk *disk) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); +} + /** * __device_add_disk - add disk information to kernel list * @parent: parent device for the disk @@ -814,6 +846,13 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk_add_events(disk); blk_integrity_add(disk); + + /* + * Set the flag at last, so that block devcie can't be opened + * before it's registration is done. + */ + disk->flags |= GENHD_FL_UP; + disk_init_partition(disk); }
void device_add_disk(struct device *parent, struct gendisk *disk,
From: Wenchao Hao haowenchao@huawei.com
mainline inclusion
from mainline-v5.18-rc1
commit ad515cada7dac3cdf5e1ad77a0ed696f5f34e0ab
category: bugfix
bugzilla: 187381, https://gitee.com/openeuler/kernel/issues/I5LBBP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
- iscsi_alloc_conn(): Allocate and initialize iscsi_cls_conn
- iscsi_add_conn(): Expose iscsi_cls_conn to userspace via sysfs
- iscsi_remove_conn(): Remove iscsi_cls_conn from sysfs
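A usage sketch of these three helpers, modeled on the libiscsi conversion later in this series (the surrounding driver and 'struct my_conn_ctx' are hypothetical; only the call order is taken from the patches):

  /* Allocate first, initialize dd_data, and only then expose to sysfs. */
  static struct iscsi_cls_conn *my_conn_create(struct iscsi_cls_session *sess,
  					       uint32_t cid)
  {
  	struct iscsi_cls_conn *cls_conn;

  	cls_conn = iscsi_alloc_conn(sess, sizeof(struct my_conn_ctx), cid);
  	if (!cls_conn)
  		return NULL;

  	/* ... initialize cls_conn->dd_data and sysfs-visible state here ... */

  	if (iscsi_add_conn(cls_conn)) {		/* now visible to userspace */
  		iscsi_put_conn(cls_conn);	/* drops the allocation */
  		return NULL;
  	}
  	return cls_conn;
  }

  static void my_conn_destroy(struct iscsi_cls_conn *cls_conn)
  {
  	iscsi_remove_conn(cls_conn);	/* unhook from sysfs first */
  	/* ... free resources referenced by sysfs attributes ... */
  	iscsi_put_conn(cls_conn);	/* final reference */
  }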
Link: https://lore.kernel.org/r/20220310015759.3296841-2-haowenchao@huawei.com
Reviewed-by: Mike Christie michael.christie@oracle.com
Signed-off-by: Wenchao Hao haowenchao@huawei.com
Signed-off-by: Wu Bo wubo40@huawei.com
Signed-off-by: Martin K. Petersen martin.petersen@oracle.com
Conflict:
commit 1a709181c1a0 ("[Huawei] scsi: iscsi: fix kabi broken in struct iscsi_transport") introduces some conflicts.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/scsi/scsi_transport_iscsi.c | 101 ++++++++++++++++++++++++++++
 include/scsi/scsi_transport_iscsi.h |   4 ++
 2 files changed, 105 insertions(+)
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index e23cb62ab216..b5a8d274dec8 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2384,6 +2384,107 @@ void iscsi_free_session(struct iscsi_cls_session *session) } EXPORT_SYMBOL_GPL(iscsi_free_session);
+/** + * iscsi_alloc_conn - alloc iscsi class connection + * @session: iscsi cls session + * @dd_size: private driver data size + * @cid: connection id + */ +struct iscsi_cls_conn * +iscsi_alloc_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) +{ + struct iscsi_transport *transport = session->transport; + struct iscsi_cls_conn *conn; + struct iscsi_cls_conn_wrapper *conn_wrapper; + + conn_wrapper = kzalloc(sizeof(*conn_wrapper) + dd_size, GFP_KERNEL); + if (!conn_wrapper) + return NULL; + + conn = &conn_wrapper->conn; + if (dd_size) + conn->dd_data = &conn_wrapper[1]; + + mutex_init(&conn->ep_mutex); + spin_lock_init(&conn_wrapper->lock); + INIT_LIST_HEAD(&conn->conn_list); + INIT_WORK(&conn_wrapper->cleanup_work, iscsi_cleanup_conn_work_fn); + conn->transport = transport; + conn->cid = cid; + WRITE_ONCE(conn->state, ISCSI_CONN_DOWN); + + /* this is released in the dev's release function */ + if (!get_device(&session->dev)) + goto free_conn; + + dev_set_name(&conn->dev, "connection%d:%u", session->sid, cid); + device_initialize(&conn->dev); + conn->dev.parent = &session->dev; + conn->dev.release = iscsi_conn_release; + + return conn; + +free_conn: + kfree(conn); + return NULL; +} +EXPORT_SYMBOL_GPL(iscsi_alloc_conn); + +/** + * iscsi_add_conn - add iscsi class connection + * @conn: iscsi cls connection + * + * This will expose iscsi_cls_conn to sysfs so make sure the related + * resources for sysfs attributes are initialized before calling this. + */ +int iscsi_add_conn(struct iscsi_cls_conn *conn) +{ + int err; + unsigned long flags; + struct iscsi_cls_session *session = iscsi_dev_to_session(conn->dev.parent); + + err = device_add(&conn->dev); + if (err) { + iscsi_cls_session_printk(KERN_ERR, session, "could not " + "register connection's dev\n"); + return err; + } + err = transport_register_device(&conn->dev); + if (err) { + iscsi_cls_session_printk(KERN_ERR, session, "could not " + "register transport's dev\n"); + device_del(&conn->dev); + return err; + } + + spin_lock_irqsave(&connlock, flags); + list_add(&conn->conn_list, &connlist); + spin_unlock_irqrestore(&connlock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(iscsi_add_conn); + +/** + * iscsi_remove_conn - remove iscsi class connection from sysfs + * @conn: iscsi cls connection + * + * Remove iscsi_cls_conn from sysfs, and wait for previous + * read/write of iscsi_cls_conn's attributes in sysfs to finish. 
+ */ +void iscsi_remove_conn(struct iscsi_cls_conn *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&connlock, flags); + list_del(&conn->conn_list); + spin_unlock_irqrestore(&connlock, flags); + + transport_unregister_device(&conn->dev); + device_del(&conn->dev); +} +EXPORT_SYMBOL_GPL(iscsi_remove_conn); + /** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index f9297176dcb8..1f7574d89822 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -468,6 +468,10 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); +extern struct iscsi_cls_conn *iscsi_alloc_conn(struct iscsi_cls_session *sess, + int dd_size, uint32_t cid); +extern int iscsi_add_conn(struct iscsi_cls_conn *conn); +extern void iscsi_remove_conn(struct iscsi_cls_conn *conn); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern void iscsi_put_conn(struct iscsi_cls_conn *conn);
From: Wenchao Hao haowenchao@huawei.com
mainline inclusion
from mainline-v5.18-rc1
commit 7dae459f5e56a89ab01413ae055595c982713349
category: bugfix
bugzilla: 187381, https://gitee.com/openeuler/kernel/issues/I5LBBP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
iscsi_create_conn() exposed iscsi_cls_conn to sysfs prior to initialization of iscsi_conn's dd_data. When userspace tried to access an attribute such as the connect address, a NULL pointer dereference was observed.
Do not add iscsi_cls_conn to sysfs until it has been initialized. Remove iscsi_create_conn() since it is no longer used.
Link: https://lore.kernel.org/r/20220310015759.3296841-3-haowenchao@huawei.com
Reviewed-by: Mike Christie michael.christie@oracle.com
Signed-off-by: Wenchao Hao haowenchao@huawei.com
Signed-off-by: Wu Bo wubo40@huawei.com
Signed-off-by: Martin K. Petersen martin.petersen@oracle.com
Conflict:
iscsi_create_conn() is not removed.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/scsi/libiscsi.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index e361856509d5..bbf2ca613dae 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3032,8 +3032,9 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size, struct iscsi_conn *conn; struct iscsi_cls_conn *cls_conn; char *data; + int err;
- cls_conn = iscsi_create_conn(cls_session, sizeof(*conn) + dd_size, + cls_conn = iscsi_alloc_conn(cls_session, sizeof(*conn) + dd_size, conn_idx); if (!cls_conn) return NULL; @@ -3073,13 +3074,21 @@ iscsi_conn_setup(struct iscsi_cls_session *cls_session, int dd_size,
init_waitqueue_head(&session->ehwait);
+ err = iscsi_add_conn(cls_conn); + if (err) + goto login_task_add_dev_fail; + return cls_conn;
+login_task_add_dev_fail: + free_pages((unsigned long) conn->data, + get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); + login_task_data_alloc_fail: kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, sizeof(void*)); login_task_alloc_fail: - iscsi_destroy_conn(cls_conn); + iscsi_put_conn(cls_conn); return NULL; } EXPORT_SYMBOL_GPL(iscsi_conn_setup);
From: Wenchao Hao haowenchao@huawei.com
mainline inclusion
from mainline-v5.18-rc1
commit 8709c323091be019f76a49cf783052a5636aca85
category: bugfix
bugzilla: 187381, https://gitee.com/openeuler/kernel/issues/I5LBBP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 1b8d0300a3e9 ("scsi: libiscsi: Fix UAF in iscsi_conn_get_param()/iscsi_conn_teardown()") fixed a UAF in iscsi_conn_get_param() and introduced two tmp_xxx variables.
We can fix this UAF more gracefully with the help of device_del(). Calling iscsi_remove_conn() at the beginning of iscsi_conn_teardown() makes userspace unable to see the iscsi_cls_conn, and this way we can free the memory safely.
Remove iscsi_destroy_conn() since it is no longer used.
Link: https://lore.kernel.org/r/20220310015759.3296841-4-haowenchao@huawei.com
Reviewed-by: Mike Christie michael.christie@oracle.com
Signed-off-by: Wenchao Hao haowenchao@huawei.com
Signed-off-by: Wu Bo wubo40@huawei.com
Signed-off-by: Martin K. Petersen martin.petersen@oracle.com
Conflict:
iscsi_destroy_conn() is not removed.
Signed-off-by: Yu Kuai yukuai3@huawei.com
Reviewed-by: Jason Yan yanaijie@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/scsi/libiscsi.c             | 10 +++++-----
 drivers/scsi/scsi_transport_iscsi.c |  6 +++++-
 2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index bbf2ca613dae..176842a869f1 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -3104,8 +3104,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) { struct iscsi_conn *conn = cls_conn->dd_data; struct iscsi_session *session = conn->session; - char *tmp_persistent_address = conn->persistent_address; - char *tmp_local_ipaddr = conn->local_ipaddr; + + iscsi_remove_conn(cls_conn);
del_timer_sync(&conn->transport_timer);
@@ -3127,6 +3127,8 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) spin_lock_bh(&session->frwd_lock); free_pages((unsigned long) conn->data, get_order(ISCSI_DEF_MAX_RECV_SEG_LEN)); + kfree(conn->persistent_address); + kfree(conn->local_ipaddr); /* regular RX path uses back_lock */ spin_lock_bh(&session->back_lock); kfifo_in(&session->cmdpool.queue, (void*)&conn->login_task, @@ -3137,9 +3139,7 @@ void iscsi_conn_teardown(struct iscsi_cls_conn *cls_conn) spin_unlock_bh(&session->frwd_lock); mutex_unlock(&session->eh_mutex);
- iscsi_destroy_conn(cls_conn); - kfree(tmp_persistent_address); - kfree(tmp_local_ipaddr); + iscsi_put_conn(cls_conn); } EXPORT_SYMBOL_GPL(iscsi_conn_teardown);
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index b5a8d274dec8..a213362524c9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2168,7 +2168,11 @@ static int iscsi_iter_destroy_conn_fn(struct device *dev, void *data) { if (!iscsi_is_conn_dev(dev)) return 0; - return iscsi_destroy_conn(iscsi_dev_to_conn(dev)); + + iscsi_remove_conn(iscsi_dev_to_conn(dev)); + iscsi_put_conn(iscsi_dev_to_conn(dev)); + + return 0; }
void iscsi_remove_session(struct iscsi_cls_session *session)
From: Yixing Liu liuyixing1@huawei.com
mainline inclusion
from mainline-v5.18-rc1
commit 70f92521584f1d1e8268311ee84413307b0fdea8
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5IZO5
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/commit/?id=70f...
----------------------------------------------------------------------
Before destroying the MPT, the reserved loopback QPs send loopback IOs (one write operation per SL). Completion of these loopback IOs indicates that there isn't any outstanding request on the MPT, so it is then safe to destroy the MPT.
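A rough outline of that flow (hedged pseudo-code, not the driver's exact implementation; the 'toy_*' helpers and context struct are invented, only the overall sequence follows the patch below):

  /* Post one loopback RDMA WRITE per SL on the reserved QPs, then poll
   * the shared CQ until every completion is reaped or a timeout expires;
   * only after that is the MPT destroyed.
   */
  static void drain_before_destroy_mpt(struct toy_free_mr *ctx)
  {
  	unsigned long deadline;
  	int pending = 0;
  	int sl;

  	for (sl = 0; sl < ctx->num_sl; sl++)
  		if (toy_post_loopback_write(ctx->qp[sl]) == 0)
  			pending++;		/* one write per service level */

  	deadline = jiffies + msecs_to_jiffies(TOY_FREE_MR_TIMEOUT_MS);
  	while (pending > 0 && time_before(jiffies, deadline))
  		pending -= toy_poll_completions(ctx->cq, pending);

  	/* no outstanding requests remain on the MPT; destroying it is safe */
  }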
Link: https://lore.kernel.org/r/20220310042835.38634-1-liangwenpeng@huawei.com
Signed-off-by: Yixing Liu liuyixing1@huawei.com
Signed-off-by: Wenpeng Liang liangwenpeng@huawei.com
Signed-off-by: Jason Gunthorpe jgg@nvidia.com
Signed-off-by: Zhengfeng Luo luozhengfeng@h-partners.com
Reviewed-by: Yangyang Li liyangyang20@huawei.com
Reviewed-by: Wei Yongjun weiyongjun1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   2 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 311 +++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  20 ++
 drivers/infiniband/hw/hns/hns_roce_mr.c     |   6 +-
 4 files changed, 335 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f600475dd1fc..2d29221055be 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -627,6 +627,7 @@ struct hns_roce_qp { u32 next_sge; enum ib_mtu path_mtu; u32 max_inline_data; + u8 free_mr_en;
/* 0: flush needed, 1: unneeded */ unsigned long flush_flag; @@ -894,6 +895,7 @@ struct hns_roce_hw { enum ib_qp_state new_state); int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); + void (*dereg_mr)(struct hns_roce_dev *hr_dev); int (*init_eq)(struct hns_roce_dev *hr_dev); void (*cleanup_eq)(struct hns_roce_dev *hr_dev); int (*write_srqc)(struct hns_roce_srq *srq, void *mb_buf); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 045dde54974d..7eda0f7a12cd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2616,6 +2616,194 @@ static void free_dip_list(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); }
+static void free_mr_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + if (free_mr->rsv_qp[i]) { + ret = ib_destroy_qp(free_mr->rsv_qp[i]); + if (ret) + ibdev_err(&hr_dev->ib_dev, + "failed to destroy qp in free mr.\n"); + + free_mr->rsv_qp[i] = NULL; + } + } + + if (free_mr->rsv_cq) { + ib_destroy_cq(free_mr->rsv_cq); + free_mr->rsv_cq = NULL; + } + + if (free_mr->rsv_pd) { + ib_dealloc_pd(free_mr->rsv_pd); + free_mr->rsv_pd = NULL; + } +} + +static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_cq_init_attr cq_init_attr = {}; + struct ib_qp_init_attr qp_init_attr = {}; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + int i; + + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ibdev_err(ibdev, "failed to create pd for free mr.\n"); + return PTR_ERR(pd); + } + free_mr->rsv_pd = pd; + + cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_init_attr); + if (IS_ERR(cq)) { + ibdev_err(ibdev, "failed to create cq for free mr.\n"); + ret = PTR_ERR(cq); + goto create_failed; + } + free_mr->rsv_cq = cq; + + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.send_cq = free_mr->rsv_cq; + qp_init_attr.recv_cq = free_mr->rsv_cq; + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + qp_init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; + qp_init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; + qp_init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; + qp_init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; + + qp = ib_create_qp(free_mr->rsv_pd, &qp_init_attr); + if (IS_ERR(qp)) { + ibdev_err(ibdev, "failed to create qp for free mr.\n"); + ret = PTR_ERR(qp); + goto create_failed; + } + + free_mr->rsv_qp[i] = qp; + } + + return 0; + +create_failed: + free_mr_exit(hr_dev); + + return ret; +} + +static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, + struct ib_qp_attr *attr, int sl_num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + int loopback; + int mask; + int ret; + + hr_qp = to_hr_qp(free_mr->rsv_qp[sl_num]); + hr_qp->free_mr_en = 1; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS; + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + if (ret) { + ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", + ret); + return ret; + } + + loopback = hr_dev->loop_idc; + /* Set qpc lbi = 1 incidate loopback IO */ + hr_dev->loop_idc = 1; + + mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | + IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; + attr->qp_state = IB_QPS_RTR; + attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + attr->path_mtu = IB_MTU_256; + attr->dest_qp_num = hr_qp->qpn; + attr->rq_psn = HNS_ROCE_FREE_MR_USED_PSN; + + rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num); + + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + hr_dev->loop_idc = loopback; + if (ret) { + ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", + 
ret); + return ret; + } + + mask = IB_QP_STATE | IB_QP_SQ_PSN | IB_QP_RETRY_CNT | IB_QP_TIMEOUT | + IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; + attr->qp_state = IB_QPS_RTS; + attr->sq_psn = HNS_ROCE_FREE_MR_USED_PSN; + attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; + attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + if (ret) + ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", + ret); + + return ret; +} + +static int free_mr_modify_qp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_qp_attr attr = {}; + int ret; + int i; + + rdma_ah_set_grh(&attr.ah_attr, NULL, 0, 0, 1, 0); + rdma_ah_set_static_rate(&attr.ah_attr, 3); + rdma_ah_set_port_num(&attr.ah_attr, 1); + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + ret = free_mr_modify_rsv_qp(hr_dev, &attr, i); + if (ret) + return ret; + } + + return 0; +} + +static int free_mr_init(struct hns_roce_dev *hr_dev) +{ + int ret; + + ret = free_mr_alloc_res(hr_dev); + if (ret) + return ret; + + ret = free_mr_modify_qp(hr_dev); + if (ret) + goto err_modify_qp; + + return 0; + +err_modify_qp: + free_mr_exit(hr_dev); + + return ret; +} + static int get_hem_table(struct hns_roce_dev *hr_dev) { unsigned int qpc_count; @@ -3160,6 +3348,98 @@ static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) return 0; }
+static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct ib_device *ibdev = &hr_dev->ib_dev; + const struct ib_send_wr *bad_wr; + struct ib_rdma_wr rdma_wr = {}; + struct ib_send_wr *send_wr; + int ret; + + send_wr = &rdma_wr.wr; + send_wr->opcode = IB_WR_RDMA_WRITE; + + ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr); + if (ret) { + ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n", + ret); + return ret; + } + + return 0; +} + +static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); + +static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_wc wc[ARRAY_SIZE(free_mr->rsv_qp)]; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + unsigned long end; + int cqe_cnt = 0; + int npolled; + int ret; + int i; + + /* + * If the device initialization is not complete or in the uninstall + * process, then there is no need to execute free mr. + */ + if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT || + priv->handle->rinfo.instance_state == HNS_ROCE_STATE_INIT || + hr_dev->state == HNS_ROCE_DEVICE_STATE_UNINIT) + return; + + mutex_lock(&free_mr->mutex); + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + hr_qp = to_hr_qp(free_mr->rsv_qp[i]); + + ret = free_mr_post_send_lp_wqe(hr_qp); + if (ret) { + ibdev_err(ibdev, + "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", + hr_qp->qpn, ret); + break; + } + + cqe_cnt++; + } + + end = msecs_to_jiffies(HNS_ROCE_V2_FREE_MR_TIMEOUT) + jiffies; + while (cqe_cnt) { + npolled = hns_roce_v2_poll_cq(free_mr->rsv_cq, cqe_cnt, wc); + if (npolled < 0) { + ibdev_err(ibdev, + "failed to poll cqe for free mr, remain %d cqe.\n", + cqe_cnt); + goto out; + } + + if (time_after(jiffies, end)) { + ibdev_err(ibdev, + "failed to poll cqe for free mr and timeout, remain %d cqe.\n", + cqe_cnt); + goto out; + } + cqe_cnt -= npolled; + } + +out: + mutex_unlock(&free_mr->mutex); +} + +static void hns_roce_v2_dereg_mr(struct hns_roce_dev *hr_dev) +{ + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_send_cmd_to_hw(hr_dev); +} + static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) { return hns_roce_buf_offset(hr_cq->mtr.kmem, n * hr_cq->cqe_size); @@ -4579,6 +4859,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, u8 hr_port; int ret;
+ /* + * If free_mr_en of qp is set, it means that this qp comes from + * free mr. This qp will perform the loopback operation. + * In the loopback scenario, only sl needs to be set. + */ + if (hr_qp->free_mr_en) { + hr_reg_write(context, QPC_SL, rdma_ah_get_sl(&attr->ah_attr)); + hr_reg_clear(qpc_mask, QPC_SL); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + return 0; + } + ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num : hr_qp->port + 1; hr_port = ib_port - 1; is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) && @@ -6251,6 +6543,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, .modify_qp = hns_roce_v2_modify_qp, + .dereg_mr = hns_roce_v2_dereg_mr, .qp_flow_control_init = hns_roce_v2_qp_flow_control_init, .init_eq = hns_roce_v2_init_eq_table, .cleanup_eq = hns_roce_v2_cleanup_eq_table, @@ -6332,14 +6625,25 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) ret = hns_roce_init(hr_dev); if (ret) { dev_err(hr_dev->dev, "RoCE Engine init failed!\n"); - goto error_failed_get_cfg; + goto error_failed_cfg; + } + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + ret = free_mr_init(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "failed to init free mr!\n"); + goto error_failed_roce_init; + } }
handle->priv = hr_dev;
return 0;
-error_failed_get_cfg: +error_failed_roce_init: + hns_roce_exit(hr_dev); + +error_failed_cfg: kfree(hr_dev->priv);
error_failed_kzalloc: @@ -6361,6 +6665,9 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev);
+ if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index e5df163f56e0..339b1b82f906 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -139,6 +139,18 @@ enum { #define CMD_CSQ_DESC_NUM 1024 #define CMD_CRQ_DESC_NUM 1024
+/* Free mr used parameters */ +#define HNS_ROCE_FREE_MR_USED_CQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_QP_NUM 0x8 +#define HNS_ROCE_FREE_MR_USED_PSN 0x0808 +#define HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT 0x7 +#define HNS_ROCE_FREE_MR_USED_QP_TIMEOUT 0x12 +#define HNS_ROCE_FREE_MR_USED_SQWQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_SQSGE_NUM 0x2 +#define HNS_ROCE_FREE_MR_USED_RQWQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_RQSGE_NUM 0x2 +#define HNS_ROCE_V2_FREE_MR_TIMEOUT 4500 + enum { NO_ARMED = 0x0, REG_NXT_CEQE = 0x2, @@ -1317,10 +1329,18 @@ struct hns_roce_link_table { #define HNS_ROCE_EXT_LLM_ENTRY(addr, id) (((id) << (64 - 12)) | ((addr) >> 12)) #define HNS_ROCE_EXT_LLM_MIN_PAGES(que_num) ((que_num) * 4 + 2)
+struct hns_roce_v2_free_mr { + struct ib_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; + struct ib_cq *rsv_cq; + struct ib_pd *rsv_pd; + struct mutex mutex; +}; + struct hns_roce_v2_priv { struct hnae3_handle *handle; struct hns_roce_v2_cmq cmq; struct hns_roce_link_table ext_llm; + struct hns_roce_v2_free_mr free_mr; };
struct hns_roce_dip { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 1e36ac383ea3..d1a9200a312d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -119,8 +119,7 @@ static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr); }
-static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, - struct hns_roce_mr *mr) +static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { struct ib_device *ibdev = &hr_dev->ib_dev; int ret; @@ -338,6 +337,9 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct hns_roce_mr *mr = to_hr_mr(ibmr);
+ if (hr_dev->hw->dereg_mr) + hr_dev->hw->dereg_mr(hr_dev); + hns_roce_mr_free(hr_dev, mr); kfree(mr);
From: zhengfeng luo luozhengfeng@h-partners.com
driver inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5MQLB
----------------------------------------------------------------------
If the gid or mtu is changed after the driver is initialized, errors like the following will occur:
[   29.612240] __ib_cache_gid_add: unable to add gid fe80:0000:0000:0000:4600:4dff:fe22:abb5 error=-28
[61807.380991] hns3 0000:7d:00.0 hns_0: attr path_mtu(1)invalid while modify qp
Fixes: 70f92521584f ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT")
Signed-off-by: Yixing Liu liuyixing1@huawei.com
Signed-off-by: Haoyue Xu xuhaoyue1@hisilicon.com
Signed-off-by: zhengfeng luo luozhengfeng@h-partners.com
Reviewed-by: Yangyang Li liyangyang20@huawei.com
Reviewed-by: Wei Yongjun weiyongjun1@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 193 +++++++++++++++------
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h |  12 +-
 2 files changed, 150 insertions(+), 55 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7eda0f7a12cd..de7873551ffc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2616,16 +2616,112 @@ static void free_dip_list(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); }
+static struct ib_pd *free_mr_init_pd(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_pd *hr_pd; + struct ib_pd *pd; + int ret; + + hr_pd = kzalloc(sizeof(*hr_pd), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_pd)) + return NULL; + + pd = &hr_pd->ibpd; + pd->device = ibdev; + + ret = hns_roce_alloc_pd(pd, NULL); + if (ret) { + ibdev_err(ibdev, "failed to create pd for free mr.\n"); + kfree(hr_pd); + return NULL; + } + + free_mr->rsv_pd = to_hr_pd(pd); + free_mr->rsv_pd->ibpd.device = &hr_dev->ib_dev; + free_mr->rsv_pd->ibpd.uobject = NULL; + free_mr->rsv_pd->ibpd.__internal_mr = NULL; + atomic_set(&free_mr->rsv_pd->ibpd.usecnt, 0); + + return pd; +} + +static struct ib_cq *free_mr_init_cq(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_cq_init_attr cq_init_attr = {}; + struct hns_roce_cq *hr_cq; + struct ib_cq *cq; + int ret; + + cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; + + hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hr_cq)) + return NULL; + + cq = &hr_cq->ib_cq; + cq->device = ibdev; + + ret = hns_roce_create_cq(cq, &cq_init_attr, NULL); + if (ret) { + ibdev_err(ibdev, "failed to create cq for free mr.\n"); + kfree(hr_cq); + return NULL; + } + + free_mr->rsv_cq = to_hr_cq(cq); + free_mr->rsv_cq->ib_cq.device = &hr_dev->ib_dev; + free_mr->rsv_cq->ib_cq.uobject = NULL; + free_mr->rsv_cq->ib_cq.comp_handler = NULL; + free_mr->rsv_cq->ib_cq.event_handler = NULL; + free_mr->rsv_cq->ib_cq.cq_context = NULL; + atomic_set(&free_mr->rsv_cq->ib_cq.usecnt, 0); + + return cq; +} + +static struct hns_roce_qp *create_free_mr_qp(struct hns_roce_dev *hr_dev, + struct ib_pd *pd, struct ib_cq *cq) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_qp_init_attr init_attr = {}; + struct ib_qp *qp; + + init_attr.qp_type = IB_QPT_RC; + init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr.send_cq = cq; + init_attr.recv_cq = cq; + init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; + init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; + init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; + init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; + + qp = hns_roce_create_qp(pd, &init_attr, NULL); + if (IS_ERR_OR_NULL(qp)) { + ibdev_err(ibdev, "failed to create qp for free mr.\n"); + return NULL; + } + + return to_hr_qp(qp); +} + static void free_mr_exit(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct hns_roce_qp *hr_qp; int ret; int i;
for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { if (free_mr->rsv_qp[i]) { - ret = ib_destroy_qp(free_mr->rsv_qp[i]); + hr_qp = to_hr_qp(&free_mr->rsv_qp[i]->ibqp); + ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, NULL); if (ret) ibdev_err(&hr_dev->ib_dev, "failed to destroy qp in free mr.\n"); @@ -2635,13 +2731,14 @@ static void free_mr_exit(struct hns_roce_dev *hr_dev) }
if (free_mr->rsv_cq) { - ib_destroy_cq(free_mr->rsv_cq); - free_mr->rsv_cq = NULL; + hns_roce_destroy_cq(&free_mr->rsv_cq->ib_cq, NULL); + kfree(free_mr->rsv_cq); }
if (free_mr->rsv_pd) { - ib_dealloc_pd(free_mr->rsv_pd); + hns_roce_dealloc_pd(&free_mr->rsv_pd->ibpd, NULL); free_mr->rsv_pd = NULL; + kfree(free_mr->rsv_pd); } }
@@ -2649,55 +2746,40 @@ static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) { struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; - struct ib_device *ibdev = &hr_dev->ib_dev; - struct ib_cq_init_attr cq_init_attr = {}; - struct ib_qp_init_attr qp_init_attr = {}; struct ib_pd *pd; struct ib_cq *cq; - struct ib_qp *qp; int ret; int i;
- pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ibdev_err(ibdev, "failed to create pd for free mr.\n"); - return PTR_ERR(pd); - } - free_mr->rsv_pd = pd; + pd = free_mr_init_pd(hr_dev); + if (!pd) + return -ENOMEM;
- cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_init_attr); - if (IS_ERR(cq)) { - ibdev_err(ibdev, "failed to create cq for free mr.\n"); - ret = PTR_ERR(cq); - goto create_failed; + cq = free_mr_init_cq(hr_dev); + if (!cq) { + ret = -ENOMEM; + goto create_failed_cq; } - free_mr->rsv_cq = cq;
- qp_init_attr.qp_type = IB_QPT_RC; - qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; - qp_init_attr.send_cq = free_mr->rsv_cq; - qp_init_attr.recv_cq = free_mr->rsv_cq; for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { - qp_init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; - qp_init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; - qp_init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; - qp_init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; - - qp = ib_create_qp(free_mr->rsv_pd, &qp_init_attr); - if (IS_ERR(qp)) { - ibdev_err(ibdev, "failed to create qp for free mr.\n"); - ret = PTR_ERR(qp); - goto create_failed; + free_mr->rsv_qp[i] = create_free_mr_qp(hr_dev, pd, cq); + if (!free_mr->rsv_qp[i]) { + ret = -ENOMEM; + goto create_failed_qp; } - - free_mr->rsv_qp[i] = qp; + free_mr->rsv_qp[i]->ibqp.recv_cq = cq; + free_mr->rsv_qp[i]->ibqp.send_cq = cq; }
return 0;
-create_failed: - free_mr_exit(hr_dev); +create_failed_qp: + hns_roce_destroy_cq(cq, NULL); + kfree(cq); + +create_failed_cq: + hns_roce_dealloc_pd(pd, NULL); + kfree(pd);
return ret; } @@ -2709,18 +2791,19 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_qp *hr_qp; - int loopback; - int mask; - int ret; + int loopback, mask, ret;
- hr_qp = to_hr_qp(free_mr->rsv_qp[sl_num]); + hr_qp = to_hr_qp(&free_mr->rsv_qp[sl_num]->ibqp); hr_qp->free_mr_en = 1; + hr_qp->ibqp.device = ibdev; + hr_qp->ibqp.qp_type = IB_QPT_RC;
mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS; attr->qp_state = IB_QPS_INIT; attr->port_num = 1; attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_INIT); if (ret) { ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", ret); @@ -2741,7 +2824,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev,
rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num);
- ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT, + IB_QPS_RTR); hr_dev->loop_idc = loopback; if (ret) { ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", @@ -2755,7 +2839,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, attr->sq_psn = HNS_ROCE_FREE_MR_USED_PSN; attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; - ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_RTR, + IB_QPS_RTS); if (ret) ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", ret); @@ -2786,8 +2871,12 @@ static int free_mr_modify_qp(struct hns_roce_dev *hr_dev)
static int free_mr_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; int ret;
+ mutex_init(&free_mr->mutex); + ret = free_mr_alloc_res(hr_dev); if (ret) return ret; @@ -3398,7 +3487,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) mutex_lock(&free_mr->mutex);
for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { - hr_qp = to_hr_qp(free_mr->rsv_qp[i]); + hr_qp = free_mr->rsv_qp[i];
ret = free_mr_post_send_lp_wqe(hr_qp); if (ret) { @@ -3413,7 +3502,7 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
end = msecs_to_jiffies(HNS_ROCE_V2_FREE_MR_TIMEOUT) + jiffies; while (cqe_cnt) { - npolled = hns_roce_v2_poll_cq(free_mr->rsv_cq, cqe_cnt, wc); + npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc); if (npolled < 0) { ibdev_err(ibdev, "failed to poll cqe for free mr, remain %d cqe.\n", @@ -5388,9 +5477,9 @@ static inline int modify_qp_is_ok(struct hns_roce_qp *hr_qp) hr_qp->state != IB_QPS_RESET); }
-static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct ib_udata *udata) +int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct ib_udata *udata) { struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_cq *send_cq, *recv_cq; @@ -5432,7 +5521,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, return ret; }
-static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 339b1b82f906..8aa56c24e677 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1330,9 +1330,9 @@ struct hns_roce_link_table { #define HNS_ROCE_EXT_LLM_MIN_PAGES(que_num) ((que_num) * 4 + 2)
struct hns_roce_v2_free_mr { - struct ib_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; - struct ib_cq *rsv_cq; - struct ib_pd *rsv_pd; + struct hns_roce_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; + struct hns_roce_cq *rsv_cq; + struct hns_roce_pd *rsv_pd; struct mutex mutex; };
@@ -1458,6 +1458,12 @@ struct hns_roce_sccc_clr_done { int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn, int *buffer);
+int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct ib_udata *udata); + static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) {
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QH0X
CVE: NA
--------------------------------
Following process:

Init: v2_read_file_info: <3> dqi_free_blk 0 dqi_free_entry 5 dqi_blks 6

Step 1. chown bin f_a -> dquot_acquire -> v2_write_dquot:
 qtree_write_dquot
  do_insert_tree
   find_free_dqentry
    get_free_dqblk
     write_blk(info->dqi_blocks) // info->dqi_blocks = 6, failure. The content
                                 // in physical block (corresponding to blk 6)
                                 // is random.

Step 2. chown root f_a -> dquot_transfer -> dqput_all -> dqput ->
        ext4_release_dquot -> v2_release_dquot -> qtree_delete_dquot:
 dquot_release
  remove_tree
   free_dqentry
    put_free_dqblk(6)
     info->dqi_free_blk = blk // info->dqi_free_blk = 6

Step 3. drop cache (buffer head for block 6 is released)

Step 4. chown bin f_b -> dquot_acquire -> commit_dqblk -> v2_write_dquot:
 qtree_write_dquot
  do_insert_tree
   find_free_dqentry
    get_free_dqblk
     dh = (struct qt_disk_dqdbheader *)buf
     blk = info->dqi_free_blk // 6
     ret = read_blk(info, blk, buf) // The content of buf is random
     info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free) // random blk

Step 5. chown bin f_c -> notify_change -> ext4_setattr -> dquot_transfer:
 dquot = dqget -> acquire_dquot -> ext4_acquire_dquot -> dquot_acquire ->
         commit_dqblk -> v2_write_dquot -> dq_insert_tree:
  do_insert_tree
   find_free_dqentry
    get_free_dqblk
     blk = info->dqi_free_blk // If blk < 0 and blk is not an error code,
                              // it will be returned as dquot

 transfer_to[USRQUOTA] = dquot // A random negative value
 __dquot_transfer(transfer_to)
  dquot_add_inodes(transfer_to[cnt])
   spin_lock(&dquot->dq_dqb_lock) // page fault
This will lead to a kernel page fault:

 Quota error (device sda): qtree_write_dquot: Error -8000 occurred while creating quota
 BUG: unable to handle page fault for address: ffffffffffffe120
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 Oops: 0002 [#1] PREEMPT SMP
 CPU: 0 PID: 5974 Comm: chown Not tainted 6.0.0-rc1-00004
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
 RIP: 0010:_raw_spin_lock+0x3a/0x90
 Call Trace:
  dquot_add_inodes+0x28/0x270
  __dquot_transfer+0x377/0x840
  dquot_transfer+0xde/0x540
  ext4_setattr+0x405/0x14d0
  notify_change+0x68e/0x9f0
  chown_common+0x300/0x430
  __x64_sys_fchownat+0x29/0x40
In order to avoid accessing an invalid quota memory address, this patch adds block number checking of the next/prev free block read from the quota file.
Fetch a reproducer in [Link].
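For readers who want the gist without the full diff below, here is a standalone model of the added check (illustration only, not the kernel's quota_tree.c; the values are taken from the steps above):

  #include <stdint.h>
  #include <stdio.h>

  /* A block number read back from the quota file is untrusted and must
   * lie inside [0, dqi_blocks) before it may become the free-list head.
   */
  static int toy_check_range(uint32_t val, uint32_t max_val)
  {
  	if (val >= max_val) {
  		fprintf(stderr, "quota: block %u out of range (max %u)\n", val, max_val);
  		return -1;
  	}
  	return 0;
  }

  int main(void)
  {
  	uint32_t dqi_blocks = 6;		/* from the Init step above */
  	uint32_t dqdh_next_free = 0xdeadbeef;	/* garbage read back in Step 4 */

  	/* Without the check, this garbage would become info->dqi_free_blk. */
  	if (toy_check_range(dqdh_next_free, dqi_blocks))
  		printf("corrupted free-block pointer rejected\n");
  	return 0;
  }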
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216372
Fixes: 1da177e4c3f4152 ("Linux-2.6.12-rc2")
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com
Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/quota/quota_tree.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 1a188fbdf34e..15b8e1a39015 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -80,6 +80,35 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) return ret; }
+static inline int do_check_range(struct super_block *sb, uint val, uint max_val) +{ + if (val >= max_val) { + quota_error(sb, "Getting block too big (%u >= %u)", + val, max_val); + return -EUCLEAN; + } + + return 0; +} + +static int check_free_block(struct qtree_mem_dqinfo *info, + struct qt_disk_dqdbheader *dh) +{ + int err = 0; + uint nextblk, prevblk; + + nextblk = le32_to_cpu(dh->dqdh_next_free); + err = do_check_range(info->dqi_sb, nextblk, info->dqi_blocks); + if (err) + return err; + prevblk = le32_to_cpu(dh->dqdh_prev_free); + err = do_check_range(info->dqi_sb, prevblk, info->dqi_blocks); + if (err) + return err; + + return err; +} + /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { @@ -94,6 +123,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info) ret = read_blk(info, blk, buf); if (ret < 0) goto out_buf; + ret = check_free_block(info, dh); + if (ret) + goto out_buf; info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { @@ -241,6 +273,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, *err = read_blk(info, blk, buf); if (*err < 0) goto out_buf; + *err = check_free_block(info, dh); + if (*err) + goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { @@ -433,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; + ret = check_free_block(info, dh); + if (ret) + goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ ret = remove_free_dqentry(info, buf, blk);
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QH0X
CVE: NA
--------------------------------
Clean up all block-checking places by replacing them with the helper function do_check_range().
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com
Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/quota/quota_tree.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 15b8e1a39015..c200161554db 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -80,11 +80,12 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) return ret; }
-static inline int do_check_range(struct super_block *sb, uint val, uint max_val) +static inline int do_check_range(struct super_block *sb, uint val, + uint min_val, uint max_val) { - if (val >= max_val) { - quota_error(sb, "Getting block too big (%u >= %u)", - val, max_val); + if (val < min_val || val >= max_val) { + quota_error(sb, "Getting block %u out of range %u-%u", + val, min_val, max_val); return -EUCLEAN; }
@@ -98,11 +99,11 @@ static int check_free_block(struct qtree_mem_dqinfo *info, uint nextblk, prevblk;
nextblk = le32_to_cpu(dh->dqdh_next_free); - err = do_check_range(info->dqi_sb, nextblk, info->dqi_blocks); + err = do_check_range(info->dqi_sb, nextblk, 0, info->dqi_blocks); if (err) return err; prevblk = le32_to_cpu(dh->dqdh_prev_free); - err = do_check_range(info->dqi_sb, prevblk, info->dqi_blocks); + err = do_check_range(info->dqi_sb, prevblk, 0, info->dqi_blocks); if (err) return err;
@@ -527,12 +528,10 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); - if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - newblk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, newblk, QT_TREEOFF, + info->dqi_blocks); + if (ret) goto out_buf; - }
if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); @@ -633,12 +632,9 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; - if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { - quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", - blk, info->dqi_blocks); - ret = -EUCLEAN; + ret = do_check_range(dquot->dq_sb, blk, QT_TREEOFF, info->dqi_blocks); + if (ret) goto out_buf; - }
if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1);
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I5QH0X
CVE: NA
--------------------------------
It would be better to do more sanity checking (e.g. dqdh_entries, block number) on the content read from the quota file, which can prevent corrupting the quota file.
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com
Signed-off-by: Li Lingfeng lilingfeng3@huawei.com
Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com
Reviewed-by: Zhang Yi yi.zhang@huawei.com
Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com
---
 fs/quota/quota_tree.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c200161554db..06e3ec528b2c 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -80,12 +80,12 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) return ret; }
-static inline int do_check_range(struct super_block *sb, uint val, - uint min_val, uint max_val) +static inline int do_check_range(struct super_block *sb, const char *val_name, + uint val, uint min_val, uint max_val) { if (val < min_val || val >= max_val) { - quota_error(sb, "Getting block %u out of range %u-%u", - val, min_val, max_val); + quota_error(sb, "Getting %s %u out of range %u-%u", + val_name, val, min_val, max_val); return -EUCLEAN; }
@@ -99,11 +99,13 @@ static int check_free_block(struct qtree_mem_dqinfo *info, uint nextblk, prevblk;
nextblk = le32_to_cpu(dh->dqdh_next_free); - err = do_check_range(info->dqi_sb, nextblk, 0, info->dqi_blocks); + err = do_check_range(info->dqi_sb, "dqdh_next_free", nextblk, 0, + info->dqi_blocks); if (err) return err; prevblk = le32_to_cpu(dh->dqdh_prev_free); - err = do_check_range(info->dqi_sb, prevblk, 0, info->dqi_blocks); + err = do_check_range(info->dqi_sb, "dqdh_prev_free", prevblk, 0, + info->dqi_blocks); if (err) return err;
@@ -277,6 +279,11 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, *err = check_free_block(info, dh); if (*err) goto out_buf; + *err = do_check_range(info->dqi_sb, "dqdh_entries", + le16_to_cpu(dh->dqdh_entries), 0, + qtree_dqstr_in_blk(info)); + if (*err) + goto out_buf; } else { blk = get_free_dqblk(info); if ((int)blk < 0) { @@ -358,6 +365,10 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } ref = (__le32 *)buf; newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + ret = do_check_range(dquot->dq_sb, "block", newblk, 0, + info->dqi_blocks); + if (ret) + goto out_buf; if (!newblk) newson = 1; if (depth == info->dqi_qtree_depth - 1) { @@ -470,6 +481,11 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, } dh = (struct qt_disk_dqdbheader *)buf; ret = check_free_block(info, dh); + if (ret) + goto out_buf; + ret = do_check_range(info->dqi_sb, "dqdh_entries", + le16_to_cpu(dh->dqdh_entries), 1, + qtree_dqstr_in_blk(info) + 1); if (ret) goto out_buf; le16_add_cpu(&dh->dqdh_entries, -1); @@ -528,7 +544,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); - ret = do_check_range(dquot->dq_sb, newblk, QT_TREEOFF, + ret = do_check_range(dquot->dq_sb, "block", newblk, QT_TREEOFF, info->dqi_blocks); if (ret) goto out_buf; @@ -632,7 +648,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; - ret = do_check_range(dquot->dq_sb, blk, QT_TREEOFF, info->dqi_blocks); + ret = do_check_range(dquot->dq_sb, "block", blk, QT_TREEOFF, + info->dqi_blocks); if (ret) goto out_buf;
@@ -748,7 +765,13 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, goto out_buf; } for (i = __get_index(info, *id, depth); i < epb; i++) { - if (ref[i] == cpu_to_le32(0)) { + uint blk_no = le32_to_cpu(ref[i]); + + ret = do_check_range(info->dqi_sb, "block", blk_no, 0, + info->dqi_blocks); + if (ret) + goto out_buf; + if (blk_no == 0) { *id += level_inc; continue; } @@ -756,7 +779,7 @@ static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, ret = 0; goto out_buf; } - ret = find_next_id(info, id, le32_to_cpu(ref[i]), depth + 1); + ret = find_next_id(info, id, blk_no, depth + 1); if (ret != -ENOENT) break; }
From: Daniel Sneddon daniel.sneddon@linux.intel.com
stable inclusion from stable-v5.10.136 commit 509c2c9fe75ea7493eebbb6bb2f711f37530ae19 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5N1SO CVE: CVE-2022-26373
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 2b1299322016731d56807aa49254a5ea3080b6b3 upstream.
tl;dr: The Enhanced IBRS mitigation for Spectre v2 does not work as documented for RET instructions after VM exits. Mitigate it with a new one-entry RSB stuffing mechanism and a new LFENCE.
== Background ==
Indirect Branch Restricted Speculation (IBRS) was designed to help mitigate Branch Target Injection and Speculative Store Bypass, i.e. Spectre, attacks. IBRS prevents software run in less privileged modes from affecting branch prediction in more privileged modes. IBRS requires the MSR to be written on every privilege level change.
To overcome some of the performance issues of IBRS, Enhanced IBRS was introduced. eIBRS is an "always on" IBRS, in other words, just turn it on once instead of writing the MSR on every privilege level change. When eIBRS is enabled, more privileged modes should be protected from less privileged modes, including protecting VMMs from guests.
== Problem ==
Here's a simplification of how guests are run on Linux' KVM:
void run_kvm_guest(void)
{
	// Prepare to run guest
	VMRESUME();
	// Clean up after guest runs
}
The execution flow for that would look something like this to the processor:
1. Host-side: call run_kvm_guest()
2. Host-side: VMRESUME
3. Guest runs, does "CALL guest_function"
4. VM exit, host runs again
5. Host might make some "cleanup" function calls
6. Host-side: RET from run_kvm_guest()
Now, when back on the host, there are a couple of possible scenarios of post-guest activity the host needs to do before executing host code:
* on pre-eIBRS hardware (legacy IBRS, or nothing at all), the RSB is not touched and Linux has to do a 32-entry stuffing.
* on eIBRS hardware, VM exit with IBRS enabled, or restoring the host IBRS=1 shortly after VM exit, has a documented side effect of flushing the RSB except in this PBRSB situation where the software needs to stuff the last RSB entry "by hand".
IOW, with eIBRS supported, host RET instructions should no longer be influenced by guest behavior after the host retires a single CALL instruction.
However, if the RET instructions are "unbalanced" with CALLs after a VM exit as is the RET in #6, it might speculatively use the address for the instruction after the CALL in #3 as an RSB prediction. This is a problem since the (untrusted) guest controls this address.
Balanced CALL/RET instruction pairs such as in step #5 are not affected.
== Solution ==
The PBRSB issue affects a wide variety of Intel processors which support eIBRS. But not all of them need mitigation. Today, X86_FEATURE_RSB_VMEXIT triggers an RSB filling sequence that mitigates PBRSB. Systems setting RSB_VMEXIT need no further mitigation - i.e., eIBRS systems which enable legacy IBRS explicitly.
However, such systems (X86_FEATURE_IBRS_ENHANCED) do not set RSB_VMEXIT and most of them need a new mitigation.
Therefore, introduce a new feature flag X86_FEATURE_RSB_VMEXIT_LITE which triggers a lighter-weight PBRSB mitigation versus RSB_VMEXIT.
The lighter-weight mitigation performs a CALL instruction which is immediately followed by a speculative execution barrier (INT3). This steers speculative execution to the barrier -- just like a retpoline -- which ensures that speculation can never reach an unbalanced RET. Then, ensure this CALL is retired before continuing execution with an LFENCE.
In other words, the window of exposure is opened at VM exit where RET behavior is troublesome. While the window is open, force RSB predictions sampling for RET targets to a dead end at the INT3. Close the window with the LFENCE.
There is a subset of eIBRS systems which are not vulnerable to PBRSB. Add these systems to the cpu_vuln_whitelist[] as NO_EIBRS_PBRSB. Future systems that aren't vulnerable will set ARCH_CAP_PBRSB_NO.
[ bp: Massage, incorporate review comments from Andy Cooper. ]
Signed-off-by: Daniel Sneddon daniel.sneddon@linux.intel.com Co-developed-by: Pawan Gupta pawan.kumar.gupta@linux.intel.com Signed-off-by: Pawan Gupta pawan.kumar.gupta@linux.intel.com Signed-off-by: Borislav Petkov bp@suse.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
conflict: arch/x86/include/asm/cpufeatures.h
Signed-off-by: Chen Jiahao chenjiahao16@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Reviewed-by: Liao Chang liaochang1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/hw-vuln/spectre.rst | 8 ++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/msr-index.h | 4 + arch/x86/include/asm/nospec-branch.h | 17 +++- arch/x86/kernel/cpu/bugs.c | 86 ++++++++++++++----- arch/x86/kernel/cpu/common.c | 12 ++- arch/x86/kvm/vmx/vmenter.S | 8 +- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/include/asm/msr-index.h | 4 + 9 files changed, 113 insertions(+), 29 deletions(-)
diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 6bd97cd50d62..7e061ed449aa 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -422,6 +422,14 @@ The possible values in this file are: 'RSB filling' Protection of RSB on context switch enabled ============= ===========================================
+ - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status: + + =========================== ======================================================= + 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled + 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable + 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB + =========================== ======================================================= + Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d943aafbf27a..1e2c1b5fa7a4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -299,6 +299,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -434,5 +435,6 @@ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index eff86bbfdf55..03622e5e1dec 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -148,6 +148,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */
#define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 32e25cac72b2..3c615488b740 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -118,13 +118,28 @@ #endif .endm
+.macro ISSUE_UNBALANCED_RET_GUARD + ANNOTATE_INTRA_FUNCTION_CALL + call .Lunbalanced_ret_guard_@ + int3 +.Lunbalanced_ret_guard_@: + add $(BITS_PER_LONG/8), %_ASM_SP + lfence +.endm + /* * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 +.ifb \ftr2 ALTERNATIVE "jmp .Lskip_rsb_@", "", \ftr +.else + ALTERNATIVE_2 "jmp .Lskip_rsb_@", "", \ftr, "jmp .Lunbalanced_@", \ftr2 +.endif __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) +.Lunbalanced_@: + ISSUE_UNBALANCED_RET_GUARD .Lskip_rsb_@: .endm
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bc6382f5ec27..d4cb9ff639aa 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1290,6 +1290,53 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) } }
+static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1438,28 +1485,7 @@ static void __init spectre_v2_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
- /* - * Similar to context switches, there are two types of RSB attacks - * after vmexit: - * - * 1) RSB underflow - * - * 2) Poisoned RSB entry - * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. - * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. - * - * eIBRS, on the other hand, has RSB-poisoning protections, so it - * doesn't need RSB clearing after vmexit. - */ - if (boot_cpu_has(X86_FEATURE_RETPOLINE) || - boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
/* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -2202,6 +2228,19 @@ static char *ibpb_state(void) return ""; }
+static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2214,12 +2253,13 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sprintf(buf, "%s%s%s%s%s%s\n", + return sprintf(buf, "%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), spectre_v2_module_string()); }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2aff9eab7ca6..1ee24f9541a5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1024,6 +1024,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) +#define NO_EIBRS_PBRSB BIT(9)
#define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1064,7 +1065,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1074,7 +1075,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), @@ -1252,6 +1255,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); }
+ if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 857fa0fc49fa..982138bebb70 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -197,11 +197,13 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) * entries and (in some cases) RSB underflow. * * eIBRS has its own protection against poisoned RSB, so it doesn't - * need the RSB filling sequence. But it does need to be enabled - * before the first unbalanced RET. + * need the RSB filling sequence. But it does need to be enabled, and a + * single call to retire, before the first unbalanced RET. */
- FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ + X86_FEATURE_RSB_VMEXIT_LITE +
pop %_ASM_ARG2 /* @flags */ pop %_ASM_ARG1 /* @vmx */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 54ba20492ad1..ec53f52a06a5 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -296,6 +296,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 53373ca3b487..b8954262d767 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -148,6 +148,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */
#define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /*
From: Pawan Gupta pawan.kumar.gupta@linux.intel.com
stable inclusion from stable-v5.10.136 commit 1bea03b44ea2267988cce064f5887b01d421b28c category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5N1SO CVE: CVE-2022-26373
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit ba6e31af2be96c4d0536f2152ed6f7b6c11bca47 upstream.
The RSB fill sequence does not have any protection against misprediction of the conditional branch at the end of the sequence. The CPU can speculatively execute code immediately after the sequence, while RSB filling hasn't actually completed yet.
  #define __FILL_RETURN_BUFFER(reg, nr, sp)	\
	mov	$(nr/2), reg;			\
  771:						\
	ANNOTATE_INTRA_FUNCTION_CALL;		\
	call	772f;				\
  773:	/* speculation trap */			\
	UNWIND_HINT_EMPTY;			\
	pause;					\
	lfence;					\
	jmp	773b;				\
  772:						\
	ANNOTATE_INTRA_FUNCTION_CALL;		\
	call	774f;				\
  775:	/* speculation trap */			\
	UNWIND_HINT_EMPTY;			\
	pause;					\
	lfence;					\
	jmp	775b;				\
  774:						\
	add	$(BITS_PER_LONG/8) * 2, sp;	\
	dec	reg;				\
	jnz	771b;	<----- CPU can miss-predict here.
Before RSB is filled, RETs that come in program order after this macro can be executed speculatively, making them vulnerable to RSB-based attacks.
Mitigate it by adding an LFENCE after the conditional branch to prevent speculation while RSB is being filled.
Suggested-by: Andrew Cooper andrew.cooper3@citrix.com Signed-off-by: Pawan Gupta pawan.kumar.gupta@linux.intel.com Signed-off-by: Borislav Petkov bp@suse.de Signed-off-by: Daniel Sneddon daniel.sneddon@linux.intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jiahao chenjiahao16@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Reviewed-by: Liao Chang liaochang1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/nospec-branch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 3c615488b740..92771640706c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -60,7 +60,9 @@ 774: \ add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; + jnz 771b; \ + /* barrier for jnz misprediction */ \ + lfence;
#ifdef __ASSEMBLY__
From: Hyunwoo Kim imv4bel@gmail.com
mainline inclusion from mainline-v6.0-rc5 commit 9cb636b5f6a8cc6d1b50809ec8f8d33ae0c84c95 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5QI0W CVE: CVE-2022-40307
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/fs...
---------------------------
A race condition may occur if the user calls close() on another thread during a write() operation on the device node of the efi capsule.
This is a race condition between the efi_capsule_write() and efi_capsule_flush() functions of efi_capsule_fops, and it ultimately results in a use-after-free (UAF).
So, the page freeing process is modified to be done in efi_capsule_release() instead of efi_capsule_flush().
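Condensed from the diff below, the resulting release path looks roughly like this (a sketch, not the full file):

  static int efi_capsule_release(struct inode *inode, struct file *file)
  {
          struct capsule_info *cap_info = file->private_data;

          /* Incomplete upload: free the staged pages here, not in flush(). */
          if (cap_info->index > 0 &&
              (cap_info->header.headersize == 0 ||
               cap_info->count < cap_info->total_size)) {
                  pr_err("capsule upload not complete\n");
                  efi_free_all_buff_pages(cap_info);
          }

          kfree(cap_info->pages);
          kfree(cap_info->phys);
          kfree(file->private_data);
          return 0;
  }

Since the cleanup now happens only at release time, a concurrent write() can no longer observe pages freed by another thread's flush.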
Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Hyunwoo Kim imv4bel@gmail.com Link: https://lore.kernel.org/all/20220907102920.GA88602@ubuntu/ Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Xia Longlong xialonglong1@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/firmware/efi/capsule-loader.c | 31 ++++++--------------------- 1 file changed, 7 insertions(+), 24 deletions(-)
diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c index 4dde8edd53b6..3e8d4b51a814 100644 --- a/drivers/firmware/efi/capsule-loader.c +++ b/drivers/firmware/efi/capsule-loader.c @@ -242,29 +242,6 @@ static ssize_t efi_capsule_write(struct file *file, const char __user *buff, return ret; }
-/** - * efi_capsule_flush - called by file close or file flush - * @file: file pointer - * @id: not used - * - * If a capsule is being partially uploaded then calling this function - * will be treated as upload termination and will free those completed - * buffer pages and -ECANCELED will be returned. - **/ -static int efi_capsule_flush(struct file *file, fl_owner_t id) -{ - int ret = 0; - struct capsule_info *cap_info = file->private_data; - - if (cap_info->index > 0) { - pr_err("capsule upload not complete\n"); - efi_free_all_buff_pages(cap_info); - ret = -ECANCELED; - } - - return ret; -} - /** * efi_capsule_release - called by file close * @inode: not used @@ -277,6 +254,13 @@ static int efi_capsule_release(struct inode *inode, struct file *file) { struct capsule_info *cap_info = file->private_data;
+ if (cap_info->index > 0 && + (cap_info->header.headersize == 0 || + cap_info->count < cap_info->total_size)) { + pr_err("capsule upload not complete\n"); + efi_free_all_buff_pages(cap_info); + } + kfree(cap_info->pages); kfree(cap_info->phys); kfree(file->private_data); @@ -324,7 +308,6 @@ static const struct file_operations efi_capsule_fops = { .owner = THIS_MODULE, .open = efi_capsule_open, .write = efi_capsule_write, - .flush = efi_capsule_flush, .release = efi_capsule_release, .llseek = no_llseek, };
From: Like Xu likexu@tencent.com
mainline inclusion from mainline-v5.14 commit e79f49c37ccf273c8aba733f803b3774ebfbe581 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5RD6Y CVE: NA
-------------
Based on our observations, after any VM exit associated with the vPMU, two or more perf interfaces have to be called for guest counter emulation, such as perf_event_{pause, read_value, period}(), and each one will {lock, unlock} the same perf_event_ctx. The frequency of these calls becomes more severe when the guest uses counters in a multiplexed manner.
Holding a lock once and completing the KVM request operations in the perf context would introduce a set of impractical new interfaces. So we can further optimize the vPMU implementation by avoiding repeated calls to these interfaces in the KVM context for at least one pattern:
After we call perf_event_pause() once, the event will be disabled and its internal count will be reset to 0. So there is no need to pause it again or read its value. Once the event is paused, its period will not be updated until the next time it is resumed or reprogrammed. And there is also no need to call perf_event_period() twice for a non-running counter, considering the perf_event for a running counter is never paused.
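Condensed from the hunks below, the guard pattern looks roughly like this (a sketch, not the complete implementation):

  /* Pause only once; fold the event count into pmc->counter exactly once. */
  static void pmc_pause_counter(struct kvm_pmc *pmc)
  {
          u64 counter = pmc->counter;

          if (!pmc->perf_event || pmc->is_paused)
                  return;

          counter += perf_event_pause(pmc->perf_event, true);
          pmc->counter = counter & pmc_bitmask(pmc);
          pmc->is_paused = true;
  }

  /* A paused event's count is already folded in, so skip the perf read. */
  static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
  {
          u64 counter, enabled, running;

          counter = pmc->counter;
          if (pmc->perf_event && !pmc->is_paused)
                  counter += perf_event_read_value(pmc->perf_event,
                                                   &enabled, &running);
          return counter & pmc_bitmask(pmc);
  }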
Based on this implementation, for the following common usage of sampling 4 events using perf on a 4u8g guest:
  echo 0 > /proc/sys/kernel/watchdog
  echo 25 > /proc/sys/kernel/perf_cpu_time_max_percent
  echo 10000 > /proc/sys/kernel/perf_event_max_sample_rate
  echo 0 > /proc/sys/kernel/perf_cpu_time_max_percent
  for i in `seq 1 1 10`
  do
          taskset -c 0 perf record \
          -e cpu-cycles -e instructions -e branch-instructions -e cache-misses \
          /root/br_instr a
  done
the average latency of the guest NMI handler is reduced from 37646.7 ns to 32929.3 ns (~1.14x speed up) on the Intel ICX server. Also, in addition to collecting more samples, no loss of sampling accuracy was observed compared to before the optimization.
Signed-off-by: Like Xu likexu@tencent.com Message-Id: 20210728120705.6855-1-likexu@tencent.com Signed-off-by: Paolo Bonzini pbonzini@redhat.com Acked-by: Peter Zijlstra peterz@infradead.org Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/pmu.c | 5 ++++- arch/x86/kvm/pmu.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- 4 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5103638cc911..c54e81d2a409 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -427,6 +427,7 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; + bool is_paused; };
struct kvm_pmu { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f2c3869475d9..4b7cdd549b4b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; }
static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter;
- if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return;
/* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; }
static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
/* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false;
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index cd35624595bf..e8620ec6014d 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -54,7 +54,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running;
counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 44cd13790810..6427d95de01c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -438,13 +438,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0;
From: Like Xu likexu@tencent.com
mainline inclusion from mainline-v5.18 commit 75189d1de1b377e580ebd2d2c55914631eac9c64 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5RD6Y CVE: NA
-------------
The NMI watchdog is one of kernel developers' favorite features, but it does not work in an AMD guest even with vPMU enabled and, worse, the system misrepresents this capability via /proc.
This is a PMC emulation error. KVM does not pass the latest valid value to the perf_event in time when the guest NMI watchdog is running, thus the perf_event corresponding to the watchdog counter will enter the old state at some point after the first guest NMI injection, forcing the hardware register PMC0 to be constantly written with 0x800000000001.
Meanwhile, the running counter should accurately reflect its new value based on the latest coordinated pmc->counter (from vPMC's point of view) rather than the value written directly by the guest.
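Condensed from the diff below, the fix funnels every counter MSR write through one helper that derives the sample period from the already-updated pmc->counter (sketch):

  static inline void pmc_update_sample_period(struct kvm_pmc *pmc)
  {
          if (!pmc->perf_event || pmc->is_paused)
                  return;

          perf_event_period(pmc->perf_event,
                            get_sample_period(pmc, pmc->counter));
  }

  /* At each counter MSR write site (Intel and AMD alike): */
  pmc->counter += data - pmc_read_counter(pmc);
  pmc_update_sample_period(pmc);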
Fixes: 168d918f2643 ("KVM: x86: Adjust counter sample period after a wrmsr") Reported-by: Dongli Cao caodongli@kingsoft.com Signed-off-by: Like Xu likexu@tencent.com Reviewed-by: Yanan Wang wangyanan55@huawei.com Tested-by: Yanan Wang wangyanan55@huawei.com Reviewed-by: Jim Mattson jmattson@google.com Message-Id: 20220409015226.38619-1-likexu@tencent.com Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini pbonzini@redhat.com Reviewed-by: Keqian Zhu zhukeqian1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/pmu.h | 9 +++++++++ arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 8 ++------ 3 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index e8620ec6014d..005f580ca5e7 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -141,6 +141,15 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) return sample_period; }
+static inline void pmc_update_sample_period(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event || pmc->is_paused) + return; + + perf_event_period(pmc->perf_event, + get_sample_period(pmc, pmc->counter)); +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 49e5be735f14..663d943f85db 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -270,6 +270,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { pmc->counter += data - pmc_read_counter(pmc); + pmc_update_sample_period(pmc); return 0; } /* MSR_EVNTSELn */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 6427d95de01c..a38df824262c 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -438,15 +438,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event && !pmc->is_paused) - perf_event_period(pmc->perf_event, - get_sample_period(pmc, data)); + pmc_update_sample_period(pmc); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event && !pmc->is_paused) - perf_event_period(pmc->perf_event, - get_sample_period(pmc, data)); + pmc_update_sample_period(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel)