[PATCH OLK-6.6 0/4] migration/hisilicon: some bugfix

From: JiangShui Yang <yangjiangshui@h-partners.com> Longfang Liu (3): migration: bugfix live migration function without VF device driver migration: resolve duplicate migration states hisi_acc_vfio_pci: update device driver status Weili Qian (1): migration: fix VF reset timeout issue .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 62 +++++++++++++++---- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 2 + 2 files changed, 51 insertions(+), 13 deletions(-) -- 2.43.0

driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX1ZY CVE: NA ---------------------------------------------------------------------- The current live migration function causes RAS errors when a VM is migrated directly without the accelerator driver loaded in the VM. The cause is that the load state of the driver in the VM is read in the stop_copy stage rather than in the pre_copy stage. As a result, the destination end cannot obtain this state value and still performs the live migration recovery operation, even though the source side skipped migration and no data was migrated over. Therefore, this state value needs to be read in the pre_copy stage. Fixes: ee3a5b2359e0 ("hisi_acc_vfio_pci: add new vfio_pci driver for HiSilicon ACC devices") Signed-off-by: Longfang Liu <liulongfang@huawei.com> Signed-off-by: JiangShui Yang <yangjiangshui@h-partners.com> --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0030ab31ef15..bee6c9d46362 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -439,6 +439,7 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct acc_vf_data *vf_data) { struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; struct device *dev = &pf_qm->pdev->dev; int vf_id = hisi_acc_vdev->vf_id; int ret; @@ -465,6 +466,13 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, return ret; } + /* Get VF driver insmod state */ + ret = qm_read_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); + if (ret) { + dev_err(dev, "failed to read QM_VF_STATE!\n"); + return ret; + } + return 0; } @@ -748,6 +756,9 @@ static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device 
*hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf = hisi_acc_vdev->resuming_migf; int ret; + if (hisi_acc_vdev->vf_qm_state != QM_READY) + return 0; + /* Recover data to VF */ ret = vf_qm_load_data(hisi_acc_vdev, migf); if (ret) { -- 2.43.0

From: Weili Qian <qianweili@huawei.com> driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX1ZY CVE: NA ---------------------------------------------------------------------- If a device error occurs during live migration, QEMU will reset the VF. At this time, the VF reset and the device reset are performed simultaneously, and the VF reset times out. Therefore, the QM_RESETTING flag is used to ensure that the VF reset and the device reset are performed serially. Fixes: 4406f46c9bcd ("hisi_acc_vfio_pci: Use its own PCI reset_done error handler") Signed-off-by: Weili Qian <qianweili@huawei.com> Signed-off-by: JiangShui Yang <yangjiangshui@h-partners.com> --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 24 +++++++++++++++++++ .../vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 2 ++ 2 files changed, 26 insertions(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index bee6c9d46362..30b55e84f81e 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1190,9 +1190,32 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, return 0; } +static void hisi_acc_vf_pci_reset_prepare(struct pci_dev *pdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); + struct hisi_qm *qm = hisi_acc_vdev->pf_qm; + struct device *dev = &qm->pdev->dev; + u32 delay = 0; + + /* All reset requests need to be queued for processing */ + while (test_and_set_bit(QM_RESETTING, &qm->misc_ctl)) { + msleep(++delay); + if (delay > QM_RESET_WAIT_TIMEOUT) { + dev_err(dev, "reset prepare failed\n"); + return; + } + } + + hisi_acc_vdev->set_reset_flag = true; +} + static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) { struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); + struct hisi_qm *qm = hisi_acc_vdev->pf_qm; + + if (hisi_acc_vdev->set_reset_flag) + clear_bit(QM_RESETTING, &qm->misc_ctl); if 
(!hisi_acc_vdev->core_device.vdev.mig_ops) return; @@ -1740,6 +1763,7 @@ static const struct pci_device_id hisi_acc_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, hisi_acc_vfio_pci_table); static const struct pci_error_handlers hisi_acc_vf_err_handlers = { + .reset_prepare = hisi_acc_vf_pci_reset_prepare, .reset_done = hisi_acc_vf_pci_aer_reset_done, .error_detected = vfio_pci_core_aer_err_detected, }; diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 01a4cd7e8d9f..65c2a454665a 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -66,6 +66,7 @@ #define QM_EQC_PF_DW0 0x1c00 #define QM_AEQC_PF_DW0 0x1c20 +#define QM_RESET_WAIT_TIMEOUT 400 struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -117,6 +118,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 match_done; + bool set_reset_flag; /* * io_base is only valid when dev_opened is true, * which is protected by open_mutex. -- 2.43.0

driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX1ZY CVE: NA ---------------------------------------------------------------------- In special scenarios involving duplicate migrations, after the first migration is completed, if the original VF device is used again and then migrated to another destination, the state indicating data migration completion for the VF device is not reset. This results in the second migration to the destination being skipped without performing data migration. This change resets that state when the device is opened, ensuring that each subsequent migration performs a complete data migration. Fixes: ee3a5b2359e0 ("hisi_acc_vfio_pci: add new vfio_pci driver for HiSilicon ACC devices") Signed-off-by: Longfang Liu <liulongfang@huawei.com> Signed-off-by: JiangShui Yang <yangjiangshui@h-partners.com> --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 30b55e84f81e..24655ebf2b76 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1584,6 +1584,7 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) } hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; hisi_acc_vdev->dev_opened = true; + hisi_acc_vdev->match_done = 0; mutex_unlock(&hisi_acc_vdev->open_mutex); } -- 2.43.0

driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICX1ZY CVE: NA ---------------------------------------------------------------------- In the previous code, the assignment of the vf_qm_state status was moved from "vf_qm_check_match" in the pre_copy phase to "vf_qm_load_data". As a result, the reset operation performed when the VM is started at the destination end sets the device's vf_qm_state to its default value, QM_NOT_READY. This status is not modified during resume at the destination end, nor corrected during pre_copy, and remains QM_NOT_READY at the final stop_copy stage. This leads to the recovery operation "hisi_acc_vf_load_state" skipping the data write operation, resulting in the data address of the destination end device after migration being a random value. This random address causes the device to access an abnormal address when restoring qp, thereby triggering a RAS error. Fixes: ee3a5b2359e0 ("hisi_acc_vfio_pci: add new vfio_pci driver for HiSilicon ACC devices") Signed-off-by: Longfang Liu <liulongfang@huawei.com> Signed-off-by: JiangShui Yang <yangjiangshui@h-partners.com> --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 24655ebf2b76..1f548d963360 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -389,7 +389,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct device *dev = &vf_qm->pdev->dev; u32 que_iso_state; - int ret; + int qp_num, ret; if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) return 0; @@ -406,18 +406,18 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, } /* VF qp num check */ - ret = qm_get_vft(vf_qm, &vf_qm->qp_base); - if (ret <= 0) { + qp_num 
= qm_get_vft(vf_qm, &vf_qm->qp_base); + if (qp_num <= 0) { dev_err(dev, "failed to get vft qp nums\n"); - return ret; + return -EINVAL; } - if (ret != vf_data->qp_num) { + if (qp_num != vf_data->qp_num) { dev_err(dev, "failed to match VF qp num\n"); return -EINVAL; } - vf_qm->qp_num = ret; + vf_qm->qp_num = qp_num; /* VF isolation state check */ ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); @@ -431,6 +431,13 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; } + ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_STATE\n"); + return ret; + } + + hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; hisi_acc_vdev->match_done = true; return 0; } @@ -511,13 +518,6 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } - ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); - if (ret) { - dev_err(dev, "failed to write QM_VF_STATE\n"); - return ret; - } - hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; - qm->eqe_dma = vf_data->eqe_dma; qm->aeqe_dma = vf_data->aeqe_dma; qm->sqc_dma = vf_data->sqc_dma; -- 2.43.0

反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/17947 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/T7K... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/17947 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/T7K...
participants (2)
-
Longfang Liu
-
patchwork bot