From: Rong Wang w_angrong@163.com
kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I5CO9A
CVE: NA
---------------------------------
With pass-through devices, the hypervisor can't control the status of the device and can't track memory dirtied by device DMA during migration. The goal of this framework is to cooperate with hardware to accomplish the tasks above.
qemu     | status control and dirty-memory reporting
vfio     | forwards ops to hardware
hardware |
Signed-off-by: Rong Wang w_angrong@163.com Signed-off-by: HuHua Li 18245010845@163.com Signed-off-by: Ripeng Qiu 965412048@qq.com --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/vfio_pci.c | 54 +++ drivers/vfio/pci/vfio_pci_migration.c | 755 ++++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 14 +- drivers/vfio/vfio.c | 411 +++++++++++++++++- include/linux/vfio_pci_migration.h | 136 ++++++ 6 files changed, 1367 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_migration.c create mode 100644 include/linux/vfio_pci_migration.h
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec0..80a777d 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,5 @@
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio_pci_migration.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 51b791c..59d8280 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -30,6 +30,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/vfio_pci_migration.h>
#include "vfio_pci_private.h"
@@ -296,6 +297,14 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
vfio_pci_probe_mmaps(vdev);
+ if (vfio_dev_migration_is_supported(pdev)) { + ret = vfio_pci_migration_init(vdev); + if (ret) { + dev_warn(&vdev->pdev->dev, "Failed to init vfio_pci_migration\n"); + vfio_pci_disable(vdev); + return ret; + } + } return 0; }
@@ -392,6 +401,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) out: pci_disable_device(pdev);
+ vfio_pci_migration_exit(vdev); vfio_pci_try_bus_reset(vdev);
if (!disable_idle_d3) @@ -642,6 +652,41 @@ struct vfio_devices { int max_index; };
+static long vfio_pci_handle_log_buf_ctl(struct vfio_pci_device *vdev, + const unsigned long arg) +{ + struct vfio_log_buf_ctl *log_buf_ctl = NULL; + struct vfio_log_buf_info *log_buf_info = NULL; + struct vf_migration_log_info migration_log_info; + long ret = 0; + + log_buf_ctl = (struct vfio_log_buf_ctl *)arg; + log_buf_info = (struct vfio_log_buf_info *)log_buf_ctl->data; + + switch (log_buf_ctl->flags) { + case VFIO_DEVICE_LOG_BUF_FLAG_START: + migration_log_info.dom_uuid = log_buf_info->uuid; + migration_log_info.buffer_size = + log_buf_info->buffer_size; + migration_log_info.sge_num = log_buf_info->addrs_size; + migration_log_info.sge_len = log_buf_info->frag_size; + migration_log_info.sgevec = log_buf_info->sgevec; + ret = vfio_pci_device_log_start(vdev, + &migration_log_info); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STOP: + ret = vfio_pci_device_log_stop(vdev, + log_buf_info->uuid); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY: + ret = vfio_pci_device_log_status_query(vdev); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -1142,6 +1187,8 @@ static long vfio_pci_ioctl(void *device_data,
return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd); + } else if (cmd == VFIO_DEVICE_LOG_BUF_CTL) { + return vfio_pci_handle_log_buf_ctl(vdev, arg); }
return -ENOTTY; @@ -1566,6 +1613,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_power_state(pdev, PCI_D3hot); }
+ if (vfio_dev_migration_is_supported(pdev)) + ret = vfio_pci_device_init(pdev); + return ret; }
@@ -1591,6 +1641,10 @@ static void vfio_pci_remove(struct pci_dev *pdev)
if (!disable_idle_d3) pci_set_power_state(pdev, PCI_D0); + + if (vfio_dev_migration_is_supported(pdev)) { + vfio_pci_device_uninit(pdev); + } }
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/vfio_pci_migration.c b/drivers/vfio/pci/vfio_pci_migration.c new file mode 100644 index 0000000..f69cd13 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_migration.c @@ -0,0 +1,755 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_migration.h> + +#include "vfio_pci_private.h" + +static LIST_HEAD(vfio_pci_mig_drivers_list); +static DEFINE_MUTEX(vfio_pci_mig_drivers_mutex); + +static void vfio_pci_add_mig_drv(struct vfio_pci_vendor_mig_driver *mig_drv) +{ + mutex_lock(&vfio_pci_mig_drivers_mutex); + atomic_set(&mig_drv->count, 1); + list_add_tail(&mig_drv->list, &vfio_pci_mig_drivers_list); + mutex_unlock(&vfio_pci_mig_drivers_mutex); +} + +static void vfio_pci_remove_mig_drv(struct vfio_pci_vendor_mig_driver *mig_drv) +{ + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_del(&mig_drv->list); + mutex_unlock(&vfio_pci_mig_drivers_mutex); +} + +static struct vfio_pci_vendor_mig_driver * + vfio_pci_find_mig_drv(struct pci_dev *pdev, struct module *module) +{ + struct vfio_pci_vendor_mig_driver *mig_drv = NULL; + + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) { + if (mig_drv->owner == module) { + if (mig_drv->bus_num == pdev->bus->number) + goto out; + } + } + mig_drv = NULL; +out: + mutex_unlock(&vfio_pci_mig_drivers_mutex); + return mig_drv; +} + +static struct vfio_pci_vendor_mig_driver * + vfio_pci_get_mig_driver(struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_drv = NULL; + struct pci_dev *pf_dev = pci_physfn(pdev); + + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) { + if (mig_drv->bus_num == 
pf_dev->bus->number) + goto out; + } + mig_drv = NULL; +out: + mutex_unlock(&vfio_pci_mig_drivers_mutex); + return mig_drv; +} + +bool vfio_dev_migration_is_supported(struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + mig_driver = vfio_pci_get_mig_driver(pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_warn(&pdev->dev, "unable to find a mig_drv module\n"); + return false; + } + + return true; +} + +int vfio_pci_device_log_start(struct vfio_pci_device *vdev, + struct vf_migration_log_info *log_info) +{ + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->log_start || + (mig_driver->dev_mig_ops->log_start(vdev->pdev, + log_info) != 0)) { + dev_err(&vdev->pdev->dev, "failed to set log start\n"); + return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_log_stop(struct vfio_pci_device *vdev, uint32_t uuid) +{ + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->log_stop || + (mig_driver->dev_mig_ops->log_stop(vdev->pdev, uuid) != 0)) { + dev_err(&vdev->pdev->dev, "failed to set log stop\n"); + return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_log_status_query(struct vfio_pci_device *vdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->get_log_status || + (mig_driver->dev_mig_ops->get_log_status(vdev->pdev) != 0)) { + dev_err(&vdev->pdev->dev, "failed 
to get log status\n"); + return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_init(struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_drv; + + mig_drv = vfio_pci_get_mig_driver(pdev); + if (!mig_drv || !mig_drv->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (mig_drv->dev_mig_ops->init) + return mig_drv->dev_mig_ops->init(pdev); + + return -EFAULT; +} + +void vfio_pci_device_uninit(struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_drv; + + mig_drv = vfio_pci_get_mig_driver(pdev); + if (!mig_drv || !mig_drv->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_drv module\n"); + return; + } + + if (mig_drv->dev_mig_ops->uninit) + mig_drv->dev_mig_ops->uninit(pdev); +} + +static void vfio_pci_device_release(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + if (mig_drv->dev_mig_ops->release) + mig_drv->dev_mig_ops->release(pdev); +} + +static int vfio_pci_device_get_info(struct pci_dev *pdev, + struct vfio_device_migration_info *mig_info, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + if (mig_drv->dev_mig_ops->get_info) + return mig_drv->dev_mig_ops->get_info(pdev, mig_info); + return -EFAULT; +} + +static int vfio_pci_device_enable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + if (!mig_drv->dev_mig_ops->enable || + (mig_drv->dev_mig_ops->enable(pdev) != 0)) { + return -EINVAL; + } + + return 0; +} + +static int vfio_pci_device_disable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + if (!mig_drv->dev_mig_ops->disable || + (mig_drv->dev_mig_ops->disable(pdev) != 0)) + return -EINVAL; + + return 0; +} + +static int vfio_pci_device_pre_enable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + if (!mig_drv->dev_mig_ops->pre_enable || + (mig_drv->dev_mig_ops->pre_enable(pdev) != 0)) + return -EINVAL; + + return 0; +} + +static int vfio_pci_device_state_save(struct pci_dev 
*pdev, + struct vfio_pci_migration_data *data) +{ + struct vfio_device_migration_info *mig_info = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + void *base = (void *)mig_info; + int ret = 0; + + if ((mig_info->device_state & VFIO_DEVICE_STATE_RUNNING) != 0) { + ret = vfio_pci_device_disable(pdev, mig_drv); + if (ret) { + dev_err(&pdev->dev, "failed to stop VF function!\n"); + return ret; + } + mig_info->device_state &= ~VFIO_DEVICE_STATE_RUNNING; + } + + if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->save) { + ret = mig_drv->dev_mig_ops->save(pdev, base, + mig_info->data_offset, data->state_size); + if (ret) { + dev_err(&pdev->dev, "failed to save device state!\n"); + return -EINVAL; + } + } else { + return -EFAULT; + } + + mig_info->data_size = data->state_size; + mig_info->pending_bytes = mig_info->data_size; + return ret; +} + +static int vfio_pci_device_state_restore(struct vfio_pci_migration_data *data) +{ + struct vfio_device_migration_info *mig_info = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + void *base = (void *)mig_info; + int ret; + + if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->restore) { + ret = mig_drv->dev_mig_ops->restore(pdev, base, + mig_info->data_offset, mig_info->data_size); + if (ret) { + dev_err(&pdev->dev, "failed to restore device state!\n"); + return -EINVAL; + } + return 0; + } + + return -EFAULT; +} + +static int vfio_pci_set_device_state(struct vfio_pci_migration_data *data, + u32 state) +{ + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + int ret = 0; + + if (state == mig_ctl->device_state) + return 0; + + if (!mig_drv->dev_mig_ops) + return -EINVAL; + + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: + if (!(mig_ctl->device_state & + VFIO_DEVICE_STATE_RUNNING)) + ret = vfio_pci_device_enable(pdev, 
mig_drv); + break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + /* + * (pre-copy) - device should start logging data. + */ + ret = 0; + break; + case VFIO_DEVICE_STATE_SAVING: + /* stop the vf function, save state */ + ret = vfio_pci_device_state_save(pdev, data); + break; + case VFIO_DEVICE_STATE_STOP: + if (mig_ctl->device_state & VFIO_DEVICE_STATE_RUNNING) + ret = vfio_pci_device_disable(pdev, mig_drv); + break; + case VFIO_DEVICE_STATE_RESUMING: + ret = vfio_pci_device_pre_enable(pdev, mig_drv); + break; + default: + ret = -EFAULT; + break; + } + + if (ret) + return ret; + + mig_ctl->device_state = state; + return 0; +} + +static ssize_t vfio_pci_handle_mig_dev_state( + struct vfio_pci_migration_data *data, + char __user *buf, size_t count, bool iswrite) +{ + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + u32 device_state; + int ret; + + if (count != sizeof(device_state)) + return -EINVAL; + + if (iswrite) { + if (copy_from_user(&device_state, buf, count)) + return -EFAULT; + + ret = vfio_pci_set_device_state(data, device_state); + if (ret) + return ret; + } else { + if (copy_to_user(buf, &mig_ctl->device_state, count)) + return -EFAULT; + } + + return count; +} + +static ssize_t vfio_pci_handle_mig_pending_bytes( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) +{ + u64 pending_bytes; + + if (count != sizeof(pending_bytes) || iswrite) + return -EINVAL; + + if (mig_info->device_state == + (VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING)) { + /* In pre-copy state we have no data to return for now, + * return 0 pending bytes + */ + pending_bytes = 0; + } else { + pending_bytes = mig_info->pending_bytes; + } + + if (copy_to_user(buf, &pending_bytes, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_offset( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) +{ + u64 data_offset = 
mig_info->data_offset; + + if (count != sizeof(data_offset) || iswrite) + return -EINVAL; + + if (copy_to_user(buf, &data_offset, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_size( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) +{ + u64 data_size; + + if (count != sizeof(data_size)) + return -EINVAL; + + if (iswrite) { + /* data_size is writable only during resuming state */ + if (mig_info->device_state != VFIO_DEVICE_STATE_RESUMING) + return -EINVAL; + + if (copy_from_user(&data_size, buf, sizeof(data_size))) + return -EFAULT; + + mig_info->data_size = data_size; + } else { + if (mig_info->device_state != VFIO_DEVICE_STATE_SAVING) + return -EINVAL; + + if (copy_to_user(buf, &mig_info->data_size, + sizeof(data_size))) + return -EFAULT; + } + + return count; +} + +static ssize_t vfio_pci_handle_mig_dev_cmd(struct vfio_pci_migration_data *data, + char __user *buf, size_t count, bool iswrite) +{ + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + u32 device_cmd; + int ret = -EFAULT; + + if (count != sizeof(device_cmd) || !iswrite || !mig_drv->dev_mig_ops) + return -EINVAL; + + if (copy_from_user(&device_cmd, buf, count)) + return -EFAULT; + + switch (device_cmd) { + case VFIO_DEVICE_MIGRATION_CANCEL: + if (mig_drv->dev_mig_ops->cancel) + ret = mig_drv->dev_mig_ops->cancel(pdev); + break; + default: + dev_err(&pdev->dev, "cmd is invaild\n"); + return -EINVAL; + } + + if (ret != 0) + return ret; + + return count; +} + +static ssize_t vfio_pci_handle_mig_drv_version( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) +{ + u32 version_id = mig_info->version_id; + + if (count != sizeof(version_id) || iswrite) + return -EINVAL; + + if (copy_to_user(buf, &version_id, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_rw( + struct 
vfio_pci_migration_data *data, + char __user *buf, size_t count, u64 pos, bool iswrite) +{ + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + void *data_addr = data->vf_data; + + if (count == 0) { + dev_err(&data->vf_dev->dev, "qemu operation data size error!\n"); + return -EINVAL; + } + + data_addr += pos - mig_ctl->data_offset; + if (iswrite) { + if (copy_from_user(data_addr, buf, count)) + return -EFAULT; + + mig_ctl->pending_bytes += count; + if (mig_ctl->pending_bytes > data->state_size) + return -EINVAL; + } else { + if (copy_to_user(buf, data_addr, count)) + return -EFAULT; + + if (mig_ctl->pending_bytes < count) + return -EINVAL; + + mig_ctl->pending_bytes -= count; + } + + return count; +} + +static ssize_t vfio_pci_dev_migrn_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int index = + VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_migration_data *data = + (struct vfio_pci_migration_data *)vdev->region[index].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + int ret; + + if (pos >= vdev->region[index].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[index].size - pos)); + if (pos >= VFIO_MIGRATION_REGION_DATA_OFFSET) + return vfio_pci_handle_mig_data_rw(data, + buf, count, pos, iswrite); + + switch (pos) { + case VFIO_DEVICE_MIGRATION_OFFSET(device_state): + ret = vfio_pci_handle_mig_dev_state(data, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(pending_bytes): + ret = vfio_pci_handle_mig_pending_bytes(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_offset): + ret = vfio_pci_handle_mig_data_offset(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_size): + ret = vfio_pci_handle_mig_data_size(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(device_cmd): + 
ret = vfio_pci_handle_mig_dev_cmd(data, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(version_id): + ret = vfio_pci_handle_mig_drv_version(mig_ctl, + buf, count, iswrite); + break; + default: + dev_err(&vdev->pdev->dev, "invalid pos offset\n"); + ret = -EFAULT; + break; + } + + if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING && + mig_ctl->pending_bytes == data->state_size && + mig_ctl->data_size == data->state_size) { + if (vfio_pci_device_state_restore(data) != 0) { + dev_err(&vdev->pdev->dev, "Failed to restore device state!\n"); + return -EFAULT; + } + mig_ctl->pending_bytes = 0; + mig_ctl->data_size = 0; + } + + return ret; +} + +static void vfio_pci_dev_migrn_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_migration_data *data = region->data; + + if (data) { + kfree(data->mig_ctl); + kfree(data); + } +} + +static const struct vfio_pci_regops vfio_pci_migration_regops = { + .rw = vfio_pci_dev_migrn_rw, + .release = vfio_pci_dev_migrn_release, +}; + +static int vfio_pci_migration_info_init(struct pci_dev *pdev, + struct vfio_device_migration_info *mig_info, + struct vfio_pci_vendor_mig_driver *mig_drv) +{ + int ret; + + ret = vfio_pci_device_get_info(pdev, mig_info, mig_drv); + if (ret) { + dev_err(&pdev->dev, "failed to get device info\n"); + return ret; + } + + if (mig_info->data_size > VFIO_MIGRATION_BUFFER_MAX_SIZE) { + dev_err(&pdev->dev, "mig_info->data_size %llu is invalid\n", + mig_info->data_size); + return -EINVAL; + } + + mig_info->data_offset = VFIO_MIGRATION_REGION_DATA_OFFSET; + return ret; +} + +static int vfio_device_mig_data_init(struct vfio_pci_device *vdev, + struct vfio_pci_migration_data *data) +{ + struct vfio_device_migration_info *mig_ctl; + u64 mig_offset; + int ret; + + mig_ctl = kzalloc(sizeof(*mig_ctl), GFP_KERNEL); + if (!mig_ctl) + return -ENOMEM; + + ret = vfio_pci_migration_info_init(vdev->pdev, mig_ctl, + data->mig_driver); + if (ret) { + 
dev_err(&vdev->pdev->dev, "get device info error!\n"); + goto err; + } + + mig_offset = sizeof(struct vfio_device_migration_info); + data->state_size = mig_ctl->data_size; + data->mig_ctl = krealloc(mig_ctl, mig_offset + data->state_size, + GFP_KERNEL); + if (!data->mig_ctl) { + ret = -ENOMEM; + goto err; + } + + data->vf_data = (void *)((char *)data->mig_ctl + mig_offset); + memset(data->vf_data, 0, data->state_size); + data->mig_ctl->data_size = 0; + + ret = vfio_pci_register_dev_region(vdev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &vfio_pci_migration_regops, mig_offset + data->state_size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, data); + if (ret) { + kfree(data->mig_ctl); + return ret; + } + + return 0; +err: + kfree(mig_ctl); + return ret; +} + +int vfio_pci_migration_init(struct vfio_pci_device *vdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + struct vfio_pci_migration_data *data = NULL; + struct pci_dev *pdev = vdev->pdev; + int ret; + + mig_driver = vfio_pci_get_mig_driver(pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_driver module\n"); + return -EINVAL; + } + + if (!try_module_get(mig_driver->owner)) { + pr_err("module %s is not live\n", mig_driver->owner->name); + return -ENODEV; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + module_put(mig_driver->owner); + return -ENOMEM; + } + + data->mig_driver = mig_driver; + data->vf_dev = pdev; + + ret = vfio_device_mig_data_init(vdev, data); + if (ret) { + dev_err(&pdev->dev, "failed to init vfio device migration data!\n"); + goto err; + } + + return ret; +err: + kfree(data); + module_put(mig_driver->owner); + return ret; +} + +void vfio_pci_migration_exit(struct vfio_pci_device *vdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_warn(&vdev->pdev->dev, 
"mig_driver is not found\n"); + return; + } + + if (module_refcount(mig_driver->owner) > 0) { + vfio_pci_device_release(vdev->pdev, mig_driver); + module_put(mig_driver->owner); + } +} + +int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops, + struct module *mod, struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + if (!ops || !mod || !pdev) + return -EINVAL; + + mig_driver = vfio_pci_find_mig_drv(pdev, mod); + if (mig_driver) { + pr_info("%s migration ops has already been registered\n", + mod->name); + atomic_add(1, &mig_driver->count); + return 0; + } + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mig_driver = kzalloc(sizeof(*mig_driver), GFP_KERNEL); + if (!mig_driver) { + module_put(THIS_MODULE); + return -ENOMEM; + } + + mig_driver->pdev = pdev; + mig_driver->bus_num = pdev->bus->number; + mig_driver->owner = mod; + mig_driver->dev_mig_ops = ops; + + vfio_pci_add_mig_drv(mig_driver); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_register_migration_ops); + +void vfio_pci_unregister_migration_ops(struct module *mod, struct pci_dev *pdev) +{ + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + if (!mod || !pdev) + return; + + mig_driver = vfio_pci_find_mig_drv(pdev, mod); + if (!mig_driver) { + pr_err("mig_driver is not found\n"); + return; + } + + if (atomic_sub_and_test(1, &mig_driver->count)) { + vfio_pci_remove_mig_drv(mig_driver); + kfree(mig_driver); + module_put(THIS_MODULE); + pr_info("%s succeed to unregister migration ops\n", + THIS_MODULE->name); + } +} +EXPORT_SYMBOL_GPL(vfio_pci_unregister_migration_ops); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 17d2bae..03af269 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -15,6 +15,7 @@ #include <linux/pci.h> #include <linux/irqbypass.h> #include <linux/types.h> +#include <linux/vfio_pci_migration.h>
#ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -55,7 +56,7 @@ struct vfio_pci_irq_ctx { struct vfio_pci_region;
struct vfio_pci_regops { - size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); @@ -173,4 +174,15 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif + +extern bool vfio_dev_migration_is_supported(struct pci_dev *pdev); +extern int vfio_pci_migration_init(struct vfio_pci_device *vdev); +extern void vfio_pci_migration_exit(struct vfio_pci_device *vdev); +extern int vfio_pci_device_log_start(struct vfio_pci_device *vdev, + struct vf_migration_log_info *log_info); +extern int vfio_pci_device_log_stop(struct vfio_pci_device *vdev, + uint32_t uuid); +extern int vfio_pci_device_log_status_query(struct vfio_pci_device *vdev); +extern int vfio_pci_device_init(struct pci_dev *pdev); +extern void vfio_pci_device_uninit(struct pci_dev *pdev); #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 7a386fb..35f2a29 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/vfio_pci_migration.h> #include <linux/wait.h> #include <linux/sched/signal.h>
@@ -40,6 +41,9 @@ #define DRIVER_AUTHOR "Alex Williamson alex.williamson@redhat.com" #define DRIVER_DESC "VFIO - User Level meta-driver"
+#define LOG_BUF_FRAG_SIZE (2 * 1024 * 1024) // fix to 2M +#define LOG_BUF_MAX_ADDRS_SIZE 128 // max vm ram size is 1T + static struct vfio { struct class *class; struct list_head iommu_drivers_list; @@ -57,6 +61,14 @@ struct vfio_iommu_driver { struct list_head vfio_next; };
+struct vfio_log_buf { + struct vfio_log_buf_info info; + int fd; + int buffer_state; + int device_state; + unsigned long *cpu_addrs; +}; + struct vfio_container { struct kref kref; struct list_head group_list; @@ -64,6 +76,7 @@ struct vfio_container { struct vfio_iommu_driver *iommu_driver; void *iommu_data; bool noiommu; + struct vfio_log_buf log_buf; };
struct vfio_unbound_dev { @@ -1158,8 +1171,398 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, return ret; }
+static long vfio_dispatch_cmd_to_devices(const struct vfio_container *container, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = NULL; + struct vfio_device *device = NULL; + long ret = -ENXIO; + + list_for_each_entry(group, &container->group_list, container_next) { + list_for_each_entry(device, &group->device_list, group_next) { + ret = device->ops->ioctl(device->device_data, cmd, arg); + if (ret) { + pr_err("dispatch cmd to devices failed\n"); + return ret; + } + } + } + return ret; +} + +static long vfio_log_buf_start(struct vfio_container *container) +{ + struct vfio_log_buf_ctl log_buf_ctl; + long ret; + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_START; + log_buf_ctl.data = (void *)&container->log_buf.info; + ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL, + (unsigned long)&log_buf_ctl); + if (ret) + return ret; + + container->log_buf.device_state = 1; + return 0; +} + +static long vfio_log_buf_stop(struct vfio_container *container) +{ + struct vfio_log_buf_ctl log_buf_ctl; + long ret; + + if (container->log_buf.device_state == 0) { + pr_warn("device already stopped\n"); + return 0; + } + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STOP; + log_buf_ctl.data = (void *)&container->log_buf.info; + ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL, + (unsigned long)&log_buf_ctl); + if (ret) + return ret; + + container->log_buf.device_state = 0; + return 0; +} + +static long vfio_log_buf_query(struct vfio_container *container) +{ + struct vfio_log_buf_ctl log_buf_ctl; + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY; + log_buf_ctl.data = (void *)&container->log_buf.info; + + return vfio_dispatch_cmd_to_devices(container, + VFIO_DEVICE_LOG_BUF_CTL, (unsigned long)&log_buf_ctl); +} + +static int 
vfio_log_buf_fops_mmap(struct file *filep, + struct vm_area_struct *vma) +{ + struct vfio_container *container = filep->private_data; + struct vfio_log_buf *log_buf = &container->log_buf; + unsigned long frag_pg_size; + unsigned long frag_offset; + phys_addr_t pa; + int ret = -EINVAL; + + if (!log_buf->cpu_addrs) { + pr_err("mmap before setup, please setup log buf first\n"); + return ret; + } + + if (log_buf->info.frag_size < PAGE_SIZE) { + pr_err("mmap frag size should not less than page size!\n"); + return ret; + } + + frag_pg_size = log_buf->info.frag_size / PAGE_SIZE; + frag_offset = vma->vm_pgoff / frag_pg_size; + + if (frag_offset >= log_buf->info.addrs_size) { + pr_err("mmap offset out of range!\n"); + return ret; + } + + if (vma->vm_end - vma->vm_start != log_buf->info.frag_size) { + pr_err("mmap size error, should be aligned with frag size!\n"); + return ret; + } + + pa = virt_to_phys((void *)log_buf->cpu_addrs[frag_offset]); + ret = remap_pfn_range(vma, vma->vm_start, + pa >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + if (ret) + pr_err("remap_pfn_range error!\n"); + return ret; +} + +static struct device *vfio_get_dev(struct vfio_container *container) +{ + struct vfio_group *group = NULL; + struct vfio_device *device = NULL; + + list_for_each_entry(group, &container->group_list, container_next) { + list_for_each_entry(device, &group->device_list, group_next) { + return device->dev; + } + } + return NULL; +} + +static void vfio_log_buf_release_dma(struct device *dev, + struct vfio_log_buf *log_buf) +{ + int i; + + for (i = 0; i < log_buf->info.addrs_size; i++) { + if ((log_buf->cpu_addrs && log_buf->cpu_addrs[i] != 0) && + (log_buf->info.sgevec && + log_buf->info.sgevec[i].addr != 0)) { + dma_free_coherent(dev, log_buf->info.frag_size, + (void *)log_buf->cpu_addrs[i], + log_buf->info.sgevec[i].addr); + log_buf->cpu_addrs[i] = 0; + log_buf->info.sgevec[i].addr = 0; + } + } +} + +static long vfio_log_buf_alloc_dma(struct 
vfio_log_buf_info *info, + struct vfio_log_buf *log_buf, struct device *dev) +{ + int i; + + for (i = 0; i < info->addrs_size; i++) { + log_buf->cpu_addrs[i] = (unsigned long)dma_alloc_coherent(dev, + info->frag_size, &log_buf->info.sgevec[i].addr, + GFP_KERNEL); + log_buf->info.sgevec[i].len = info->frag_size; + if (log_buf->cpu_addrs[i] == 0 || + log_buf->info.sgevec[i].addr == 0) { + return -ENOMEM; + } + } + return 0; +} + +static long vfio_log_buf_alloc_addrs(struct vfio_log_buf_info *info, + struct vfio_log_buf *log_buf) +{ + log_buf->info.sgevec = kcalloc(info->addrs_size, + sizeof(struct vfio_log_buf_sge), GFP_KERNEL); + if (!log_buf->info.sgevec) + return -ENOMEM; + + log_buf->cpu_addrs = kcalloc(info->addrs_size, + sizeof(unsigned long), GFP_KERNEL); + if (!log_buf->cpu_addrs) { + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; + return -ENOMEM; + } + + return 0; +} + +static long vfio_log_buf_info_valid(struct vfio_log_buf_info *info) +{ + if (info->addrs_size > LOG_BUF_MAX_ADDRS_SIZE || + info->addrs_size == 0) { + pr_err("can`t support vm ram size larger than 1T or equal to 0\n"); + return -EINVAL; + } + if (info->frag_size != LOG_BUF_FRAG_SIZE) { + pr_err("only support %d frag size\n", LOG_BUF_FRAG_SIZE); + return -EINVAL; + } + return 0; +} + +static long vfio_log_buf_setup(struct vfio_container *container, + unsigned long data) +{ + struct vfio_log_buf_info info; + struct vfio_log_buf *log_buf = &container->log_buf; + struct device *dev = NULL; + long ret; + + if (log_buf->info.sgevec) { + pr_warn("log buf already setup\n"); + return 0; + } + + if (copy_from_user(&info, (void __user *)data, + sizeof(struct vfio_log_buf_info))) + return -EFAULT; + + ret = vfio_log_buf_info_valid(&info); + if (ret) + return ret; + + ret = vfio_log_buf_alloc_addrs(&info, log_buf); + if (ret) + goto err_out; + + dev = vfio_get_dev(container); + if (!dev) { + pr_err("can`t get dev\n"); + goto err_free_addrs; + } + + ret = vfio_log_buf_alloc_dma(&info, 
log_buf, dev); + if (ret) + goto err_free_dma_array; + + log_buf->info.uuid = info.uuid; + log_buf->info.buffer_size = info.buffer_size; + log_buf->info.frag_size = info.frag_size; + log_buf->info.addrs_size = info.addrs_size; + log_buf->buffer_state = 1; + return 0; + +err_free_dma_array: + vfio_log_buf_release_dma(dev, log_buf); +err_free_addrs: + kfree(log_buf->cpu_addrs); + log_buf->cpu_addrs = NULL; + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; +err_out: + return -ENOMEM; +} + +static long vfio_log_buf_release_buffer(struct vfio_container *container) +{ + struct vfio_log_buf *log_buf = &container->log_buf; + struct device *dev = NULL; + + if (log_buf->buffer_state == 0) { + pr_warn("buffer already released\n"); + return 0; + } + + dev = vfio_get_dev(container); + if (!dev) { + pr_err("can`t get dev\n"); + return -EFAULT; + } + + vfio_log_buf_release_dma(dev, log_buf); + + kfree(log_buf->cpu_addrs); + log_buf->cpu_addrs = NULL; + + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; + + log_buf->buffer_state = 0; + return 0; +} + +static int vfio_log_buf_release(struct inode *inode, struct file *filep) +{ + struct vfio_container *container = filep->private_data; + + vfio_log_buf_stop(container); + vfio_log_buf_release_buffer(container); + memset(&container->log_buf, 0, sizeof(struct vfio_log_buf)); + return 0; +} + +static long vfio_ioctl_handle_log_buf_ctl(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_log_buf_ctl log_buf_ctl; + long ret = 0; + + if (copy_from_user(&log_buf_ctl, (void __user *)arg, + sizeof(struct vfio_log_buf_ctl))) + return -EFAULT; + + switch (log_buf_ctl.flags) { + case VFIO_DEVICE_LOG_BUF_FLAG_SETUP: + ret = vfio_log_buf_setup(container, + (unsigned long)log_buf_ctl.data); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_RELEASE: + ret = vfio_log_buf_release_buffer(container); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_START: + ret = vfio_log_buf_start(container); + break; + case 
VFIO_DEVICE_LOG_BUF_FLAG_STOP: + ret = vfio_log_buf_stop(container); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY: + ret = vfio_log_buf_query(container); + break; + default: + pr_err("log buf control flag incorrect\n"); + ret = -EINVAL; + break; + } + return ret; +} + +static long vfio_log_buf_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_container *container = filep->private_data; + long ret = -EINVAL; + + switch (cmd) { + case VFIO_LOG_BUF_CTL: + ret = vfio_ioctl_handle_log_buf_ctl(container, arg); + break; + default: + pr_err("log buf control cmd incorrect\n"); + break; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long vfio_log_buf_fops_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_log_buf_fops_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct file_operations vfio_log_buf_fops = { + .owner = THIS_MODULE, + .mmap = vfio_log_buf_fops_mmap, + .unlocked_ioctl = vfio_log_buf_fops_unl_ioctl, + .release = vfio_log_buf_release, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_log_buf_fops_compat_ioctl, +#endif +}; + +static int vfio_get_log_buf_fd(struct vfio_container *container, + unsigned long arg) +{ + struct file *filep = NULL; + int ret; + + if (container->log_buf.fd > 0) + return container->log_buf.fd; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) { + pr_err("get_unused_fd_flags get fd failed\n"); + return ret; + } + + filep = anon_inode_getfile("[vfio-log-buf]", &vfio_log_buf_fops, + container, O_RDWR); + if (IS_ERR(filep)) { + pr_err("anon_inode_getfile failed\n"); + put_unused_fd(ret); + ret = PTR_ERR(filep); + return ret; + } + + filep->f_mode |= (FMODE_READ | FMODE_WRITE | FMODE_LSEEK); + + fd_install(ret, filep); + + container->log_buf.fd = ret; + return ret; +} + static long vfio_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) + unsigned int cmd, 
unsigned long arg) { struct vfio_container *container = filep->private_data; struct vfio_iommu_driver *driver; @@ -1179,6 +1582,9 @@ static long vfio_fops_unl_ioctl(struct file *filep, case VFIO_SET_IOMMU: ret = vfio_ioctl_set_iommu(container, arg); break; + case VFIO_GET_LOG_BUF_FD: + ret = vfio_get_log_buf_fd(container, arg); + break; default: driver = container->iommu_driver; data = container->iommu_data; @@ -1210,6 +1616,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) INIT_LIST_HEAD(&container->group_list); init_rwsem(&container->group_lock); kref_init(&container->kref); + memset(&container->log_buf, 0, sizeof(struct vfio_log_buf));
filep->private_data = container;
@@ -1219,9 +1626,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; - filep->private_data = NULL; - vfio_container_put(container);
return 0; diff --git a/include/linux/vfio_pci_migration.h b/include/linux/vfio_pci_migration.h new file mode 100644 index 0000000..464ffb4 --- /dev/null +++ b/include/linux/vfio_pci_migration.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#ifndef VFIO_PCI_MIGRATION_H +#define VFIO_PCI_MIGRATION_H + +#include <linux/types.h> +#include <linux/pci.h> + +#define VFIO_REGION_TYPE_MIGRATION (3) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1) + +#define VFIO_MIGRATION_BUFFER_MAX_SIZE SZ_256K +#define VFIO_MIGRATION_REGION_DATA_OFFSET \ + (sizeof(struct vfio_device_migration_info)) +#define VFIO_DEVICE_MIGRATION_OFFSET(x) \ + offsetof(struct vfio_device_migration_info, x) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ +#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING) + __u32 reserved; + + __u32 device_cmd; + __u32 version_id; + + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +enum { + VFIO_DEVICE_STOP = 0xffff0001, + VFIO_DEVICE_CONTINUE, + VFIO_DEVICE_MIGRATION_CANCEL, +}; + +struct vfio_log_buf_sge { + __u64 len; + __u64 addr; +}; + +struct vfio_log_buf_info { + __u32 uuid; + __u64 buffer_size; + __u64 addrs_size; + __u64 frag_size; + struct vfio_log_buf_sge *sgevec; +}; + +struct vfio_log_buf_ctl { + __u32 argsz; + __u32 flags; + #define VFIO_DEVICE_LOG_BUF_FLAG_SETUP (1 << 0) + #define VFIO_DEVICE_LOG_BUF_FLAG_RELEASE (1 << 1) + #define VFIO_DEVICE_LOG_BUF_FLAG_START (1 << 2) + #define VFIO_DEVICE_LOG_BUF_FLAG_STOP (1 << 3) + #define VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY (1 << 4) + void *data; +}; +#define VFIO_LOG_BUF_CTL _IO(VFIO_TYPE, 
VFIO_BASE + 21) +#define VFIO_GET_LOG_BUF_FD _IO(VFIO_TYPE, VFIO_BASE + 22) +#define VFIO_DEVICE_LOG_BUF_CTL _IO(VFIO_TYPE, VFIO_BASE + 23) + +struct vf_migration_log_info { + __u32 dom_uuid; + __u64 buffer_size; + __u64 sge_len; + __u64 sge_num; + struct vfio_log_buf_sge *sgevec; +}; + +struct vfio_device_migration_ops { + /* Get device information */ + int (*get_info)(struct pci_dev *pdev, + struct vfio_device_migration_info *info); + /* Enable a vf device */ + int (*enable)(struct pci_dev *pdev); + /* Disable a vf device */ + int (*disable)(struct pci_dev *pdev); + /* Save a vf device */ + int (*save)(struct pci_dev *pdev, void *base, + uint64_t off, uint64_t count); + /* Resuming a vf device */ + int (*restore)(struct pci_dev *pdev, void *base, + uint64_t off, uint64_t count); + /* Log start a vf device */ + int (*log_start)(struct pci_dev *pdev, + struct vf_migration_log_info *log_info); + /* Log stop a vf device */ + int (*log_stop)(struct pci_dev *pdev, uint32_t uuid); + /* Get vf device log status */ + int (*get_log_status)(struct pci_dev *pdev); + /* Pre enable a vf device(load_setup, before restore a vf) */ + int (*pre_enable)(struct pci_dev *pdev); + /* Cancel a vf device when live migration failed (rollback) */ + int (*cancel)(struct pci_dev *pdev); + /* Init a vf device */ + int (*init)(struct pci_dev *pdev); + /* Uninit a vf device */ + void (*uninit)(struct pci_dev *pdev); + /* Release a vf device */ + void (*release)(struct pci_dev *pdev); +}; + +struct vfio_pci_vendor_mig_driver { + struct pci_dev *pdev; + unsigned char bus_num; + struct vfio_device_migration_ops *dev_mig_ops; + struct module *owner; + atomic_t count; + struct list_head list; +}; + +struct vfio_pci_migration_data { + u64 state_size; + struct pci_dev *vf_dev; + struct vfio_pci_vendor_mig_driver *mig_driver; + struct vfio_device_migration_info *mig_ctl; + void *vf_data; +}; + +int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops, + struct module *mod, struct 
pci_dev *pdev); +void vfio_pci_unregister_migration_ops(struct module *mod, + struct pci_dev *pdev); + +#endif /* VFIO_PCI_MIGRATION_H */
Looks good to me.
Reviewed-by: Bao Yonglei baoyonglei@huawei.com
-----邮件原件----- 发件人: RongWang [mailto:w_angrong@163.com] 发送时间: 2022年6月24日 11:13 收件人: kernel@openeuler.org 抄送: wangrong (T) wangrong68@huawei.com; qiulaibin qiulaibin@huawei.com; Baoyonglei baoyonglei@huawei.com 主题: [PATCH] vfio: framework supporting vfio device hot migration
From: Rong Wang w_angrong@163.com
kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5CO9A CVE: NA
---------------------------------
For pass-through devices, the hypervisor cannot control the device's status, nor track memory dirtied by device DMA, during migration. The goal of this framework is to cooperate with the hardware to accomplish the tasks above.
qemu
  | status control and dirty-memory report
vfio
  | ops to hardware
hardware
Signed-off-by: Rong Wang w_angrong@163.com Signed-off-by: HuHua Li 18245010845@163.com Signed-off-by: Ripeng Qiu 965412048@qq.com --- drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/vfio_pci.c | 54 +++ drivers/vfio/pci/vfio_pci_migration.c | 755 ++++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 14 +- drivers/vfio/vfio.c | 411 +++++++++++++++++- include/linux/vfio_pci_migration.h | 136 ++++++ 6 files changed, 1367 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_migration.c create mode 100644 include/linux/vfio_pci_migration.h
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec0..80a777d 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,5 @@
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o +vfio_pci_config.o vfio_pci_migration.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 51b791c..59d8280 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -30,6 +30,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/vfio_pci_migration.h>
#include "vfio_pci_private.h"
@@ -296,6 +297,14 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
vfio_pci_probe_mmaps(vdev);
+ if (vfio_dev_migration_is_supported(pdev)) { + ret = vfio_pci_migration_init(vdev); + if (ret) { + dev_warn(&vdev->pdev->dev, "Failed to init vfio_pci_migration\n"); + vfio_pci_disable(vdev); + return ret; + } + } return 0; }
@@ -392,6 +401,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) out: pci_disable_device(pdev);
+ vfio_pci_migration_exit(vdev); vfio_pci_try_bus_reset(vdev);
if (!disable_idle_d3) @@ -642,6 +652,41 @@ struct vfio_devices { int max_index; };
+static long vfio_pci_handle_log_buf_ctl(struct vfio_pci_device *vdev, + const unsigned long arg) +{ + struct vfio_log_buf_ctl *log_buf_ctl = NULL; + struct vfio_log_buf_info *log_buf_info = NULL; + struct vf_migration_log_info migration_log_info; + long ret = 0; + + log_buf_ctl = (struct vfio_log_buf_ctl *)arg; + log_buf_info = (struct vfio_log_buf_info *)log_buf_ctl->data; + + switch (log_buf_ctl->flags) { + case VFIO_DEVICE_LOG_BUF_FLAG_START: + migration_log_info.dom_uuid = log_buf_info->uuid; + migration_log_info.buffer_size = + log_buf_info->buffer_size; + migration_log_info.sge_num = log_buf_info->addrs_size; + migration_log_info.sge_len = log_buf_info->frag_size; + migration_log_info.sgevec = log_buf_info->sgevec; + ret = vfio_pci_device_log_start(vdev, + &migration_log_info); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STOP: + ret = vfio_pci_device_log_stop(vdev, + log_buf_info->uuid); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY: + ret = vfio_pci_device_log_status_query(vdev); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -1142,6 +1187,8 @@ static long vfio_pci_ioctl(void *device_data,
return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd); + } else if (cmd == VFIO_DEVICE_LOG_BUF_CTL) { + return vfio_pci_handle_log_buf_ctl(vdev, arg); }
return -ENOTTY; @@ -1566,6 +1613,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_power_state(pdev, PCI_D3hot); }
+ if (vfio_dev_migration_is_supported(pdev)) + ret = vfio_pci_device_init(pdev); + return ret; }
@@ -1591,6 +1641,10 @@ static void vfio_pci_remove(struct pci_dev *pdev)
if (!disable_idle_d3) pci_set_power_state(pdev, PCI_D0); + + if (vfio_dev_migration_is_supported(pdev)) { + vfio_pci_device_uninit(pdev); + } }
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/vfio_pci_migration.c b/drivers/vfio/pci/vfio_pci_migration.c new file mode 100644 index 0000000..f69cd13 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_migration.c @@ -0,0 +1,755 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_migration.h> + +#include "vfio_pci_private.h" + +static LIST_HEAD(vfio_pci_mig_drivers_list); +static DEFINE_MUTEX(vfio_pci_mig_drivers_mutex); + +static void vfio_pci_add_mig_drv(struct vfio_pci_vendor_mig_driver +*mig_drv) { + mutex_lock(&vfio_pci_mig_drivers_mutex); + atomic_set(&mig_drv->count, 1); + list_add_tail(&mig_drv->list, &vfio_pci_mig_drivers_list); + mutex_unlock(&vfio_pci_mig_drivers_mutex); +} + +static void vfio_pci_remove_mig_drv(struct vfio_pci_vendor_mig_driver +*mig_drv) { + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_del(&mig_drv->list); + mutex_unlock(&vfio_pci_mig_drivers_mutex); +} + +static struct vfio_pci_vendor_mig_driver * + vfio_pci_find_mig_drv(struct pci_dev *pdev, struct module *module) { + struct vfio_pci_vendor_mig_driver *mig_drv = NULL; + + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) { + if (mig_drv->owner == module) { + if (mig_drv->bus_num == pdev->bus->number) + goto out; + } + } + mig_drv = NULL; +out: + mutex_unlock(&vfio_pci_mig_drivers_mutex); + return mig_drv; +} + +static struct vfio_pci_vendor_mig_driver * + vfio_pci_get_mig_driver(struct pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_drv = NULL; + struct pci_dev *pf_dev = pci_physfn(pdev); + + mutex_lock(&vfio_pci_mig_drivers_mutex); + list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) { + if (mig_drv->bus_num == pf_dev->bus->number) 
+ goto out; + } + mig_drv = NULL; +out: + mutex_unlock(&vfio_pci_mig_drivers_mutex); + return mig_drv; +} + +bool vfio_dev_migration_is_supported(struct pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + mig_driver = vfio_pci_get_mig_driver(pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_warn(&pdev->dev, "unable to find a mig_drv module\n"); + return false; + } + + return true; +} + +int vfio_pci_device_log_start(struct vfio_pci_device *vdev, + struct vf_migration_log_info *log_info) { + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->log_start || + (mig_driver->dev_mig_ops->log_start(vdev->pdev, + log_info) != 0)) { + dev_err(&vdev->pdev->dev, "failed to set log start\n"); + return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_log_stop(struct vfio_pci_device *vdev, uint32_t +uuid) { + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->log_stop || + (mig_driver->dev_mig_ops->log_stop(vdev->pdev, uuid) != 0)) { + dev_err(&vdev->pdev->dev, "failed to set log stop\n"); + return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_log_status_query(struct vfio_pci_device *vdev) { + struct vfio_pci_vendor_mig_driver *mig_driver; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (!mig_driver->dev_mig_ops->get_log_status || + (mig_driver->dev_mig_ops->get_log_status(vdev->pdev) != 0)) { + dev_err(&vdev->pdev->dev, "failed to get log status\n"); 
+ return -EFAULT; + } + + return 0; +} + +int vfio_pci_device_init(struct pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_drv; + + mig_drv = vfio_pci_get_mig_driver(pdev); + if (!mig_drv || !mig_drv->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_drv module\n"); + return -EFAULT; + } + + if (mig_drv->dev_mig_ops->init) + return mig_drv->dev_mig_ops->init(pdev); + + return -EFAULT; +} + +void vfio_pci_device_uninit(struct pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_drv; + + mig_drv = vfio_pci_get_mig_driver(pdev); + if (!mig_drv || !mig_drv->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_drv module\n"); + return; + } + + if (mig_drv->dev_mig_ops->uninit) + mig_drv->dev_mig_ops->uninit(pdev); +} + +static void vfio_pci_device_release(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) { + if (mig_drv->dev_mig_ops->release) + mig_drv->dev_mig_ops->release(pdev); +} + +static int vfio_pci_device_get_info(struct pci_dev *pdev, + struct vfio_device_migration_info *mig_info, + struct vfio_pci_vendor_mig_driver *mig_drv) { + if (mig_drv->dev_mig_ops->get_info) + return mig_drv->dev_mig_ops->get_info(pdev, mig_info); + return -EFAULT; +} + +static int vfio_pci_device_enable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) { + if (!mig_drv->dev_mig_ops->enable || + (mig_drv->dev_mig_ops->enable(pdev) != 0)) { + return -EINVAL; + } + + return 0; +} + +static int vfio_pci_device_disable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) { + if (!mig_drv->dev_mig_ops->disable || + (mig_drv->dev_mig_ops->disable(pdev) != 0)) + return -EINVAL; + + return 0; +} + +static int vfio_pci_device_pre_enable(struct pci_dev *pdev, + struct vfio_pci_vendor_mig_driver *mig_drv) { + if (!mig_drv->dev_mig_ops->pre_enable || + (mig_drv->dev_mig_ops->pre_enable(pdev) != 0)) + return -EINVAL; + + return 0; +} + +static int vfio_pci_device_state_save(struct pci_dev *pdev, + struct 
vfio_pci_migration_data *data) +{ + struct vfio_device_migration_info *mig_info = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + void *base = (void *)mig_info; + int ret = 0; + + if ((mig_info->device_state & VFIO_DEVICE_STATE_RUNNING) != 0) { + ret = vfio_pci_device_disable(pdev, mig_drv); + if (ret) { + dev_err(&pdev->dev, "failed to stop VF function!\n"); + return ret; + } + mig_info->device_state &= ~VFIO_DEVICE_STATE_RUNNING; + } + + if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->save) { + ret = mig_drv->dev_mig_ops->save(pdev, base, + mig_info->data_offset, data->state_size); + if (ret) { + dev_err(&pdev->dev, "failed to save device state!\n"); + return -EINVAL; + } + } else { + return -EFAULT; + } + + mig_info->data_size = data->state_size; + mig_info->pending_bytes = mig_info->data_size; + return ret; +} + +static int vfio_pci_device_state_restore(struct vfio_pci_migration_data +*data) { + struct vfio_device_migration_info *mig_info = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + void *base = (void *)mig_info; + int ret; + + if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->restore) { + ret = mig_drv->dev_mig_ops->restore(pdev, base, + mig_info->data_offset, mig_info->data_size); + if (ret) { + dev_err(&pdev->dev, "failed to restore device state!\n"); + return -EINVAL; + } + return 0; + } + + return -EFAULT; +} + +static int vfio_pci_set_device_state(struct vfio_pci_migration_data *data, + u32 state) +{ + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + int ret = 0; + + if (state == mig_ctl->device_state) + return 0; + + if (!mig_drv->dev_mig_ops) + return -EINVAL; + + switch (state) { + case VFIO_DEVICE_STATE_RUNNING: + if (!(mig_ctl->device_state & + VFIO_DEVICE_STATE_RUNNING)) + ret = vfio_pci_device_enable(pdev, mig_drv); + 
break; + case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING: + /* + * (pre-copy) - device should start logging data. + */ + ret = 0; + break; + case VFIO_DEVICE_STATE_SAVING: + /* stop the vf function, save state */ + ret = vfio_pci_device_state_save(pdev, data); + break; + case VFIO_DEVICE_STATE_STOP: + if (mig_ctl->device_state & VFIO_DEVICE_STATE_RUNNING) + ret = vfio_pci_device_disable(pdev, mig_drv); + break; + case VFIO_DEVICE_STATE_RESUMING: + ret = vfio_pci_device_pre_enable(pdev, mig_drv); + break; + default: + ret = -EFAULT; + break; + } + + if (ret) + return ret; + + mig_ctl->device_state = state; + return 0; +} + +static ssize_t vfio_pci_handle_mig_dev_state( + struct vfio_pci_migration_data *data, + char __user *buf, size_t count, bool iswrite) { + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + u32 device_state; + int ret; + + if (count != sizeof(device_state)) + return -EINVAL; + + if (iswrite) { + if (copy_from_user(&device_state, buf, count)) + return -EFAULT; + + ret = vfio_pci_set_device_state(data, device_state); + if (ret) + return ret; + } else { + if (copy_to_user(buf, &mig_ctl->device_state, count)) + return -EFAULT; + } + + return count; +} + +static ssize_t vfio_pci_handle_mig_pending_bytes( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) { + u64 pending_bytes; + + if (count != sizeof(pending_bytes) || iswrite) + return -EINVAL; + + if (mig_info->device_state == + (VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING)) { + /* In pre-copy state we have no data to return for now, + * return 0 pending bytes + */ + pending_bytes = 0; + } else { + pending_bytes = mig_info->pending_bytes; + } + + if (copy_to_user(buf, &pending_bytes, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_offset( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) { + u64 data_offset = mig_info->data_offset; + + if 
(count != sizeof(data_offset) || iswrite) + return -EINVAL; + + if (copy_to_user(buf, &data_offset, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_size( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) { + u64 data_size; + + if (count != sizeof(data_size)) + return -EINVAL; + + if (iswrite) { + /* data_size is writable only during resuming state */ + if (mig_info->device_state != VFIO_DEVICE_STATE_RESUMING) + return -EINVAL; + + if (copy_from_user(&data_size, buf, sizeof(data_size))) + return -EFAULT; + + mig_info->data_size = data_size; + } else { + if (mig_info->device_state != VFIO_DEVICE_STATE_SAVING) + return -EINVAL; + + if (copy_to_user(buf, &mig_info->data_size, + sizeof(data_size))) + return -EFAULT; + } + + return count; +} + +static ssize_t vfio_pci_handle_mig_dev_cmd(struct vfio_pci_migration_data *data, + char __user *buf, size_t count, bool iswrite) { + struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver; + struct pci_dev *pdev = data->vf_dev; + u32 device_cmd; + int ret = -EFAULT; + + if (count != sizeof(device_cmd) || !iswrite || !mig_drv->dev_mig_ops) + return -EINVAL; + + if (copy_from_user(&device_cmd, buf, count)) + return -EFAULT; + + switch (device_cmd) { + case VFIO_DEVICE_MIGRATION_CANCEL: + if (mig_drv->dev_mig_ops->cancel) + ret = mig_drv->dev_mig_ops->cancel(pdev); + break; + default: + dev_err(&pdev->dev, "cmd is invaild\n"); + return -EINVAL; + } + + if (ret != 0) + return ret; + + return count; +} + +static ssize_t vfio_pci_handle_mig_drv_version( + struct vfio_device_migration_info *mig_info, + char __user *buf, size_t count, bool iswrite) { + u32 version_id = mig_info->version_id; + + if (count != sizeof(version_id) || iswrite) + return -EINVAL; + + if (copy_to_user(buf, &version_id, count)) + return -EFAULT; + + return count; +} + +static ssize_t vfio_pci_handle_mig_data_rw( + struct vfio_pci_migration_data *data, + char __user *buf, 
size_t count, u64 pos, bool iswrite) { + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + void *data_addr = data->vf_data; + + if (count == 0) { + dev_err(&data->vf_dev->dev, "qemu operation data size error!\n"); + return -EINVAL; + } + + data_addr += pos - mig_ctl->data_offset; + if (iswrite) { + if (copy_from_user(data_addr, buf, count)) + return -EFAULT; + + mig_ctl->pending_bytes += count; + if (mig_ctl->pending_bytes > data->state_size) + return -EINVAL; + } else { + if (copy_to_user(buf, data_addr, count)) + return -EFAULT; + + if (mig_ctl->pending_bytes < count) + return -EINVAL; + + mig_ctl->pending_bytes -= count; + } + + return count; +} + +static ssize_t vfio_pci_dev_migrn_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, loff_t *ppos, bool iswrite) { + unsigned int index = + VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; + struct vfio_pci_migration_data *data = + (struct vfio_pci_migration_data *)vdev->region[index].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct vfio_device_migration_info *mig_ctl = data->mig_ctl; + int ret; + + if (pos >= vdev->region[index].size) + return -EINVAL; + + count = min(count, (size_t)(vdev->region[index].size - pos)); + if (pos >= VFIO_MIGRATION_REGION_DATA_OFFSET) + return vfio_pci_handle_mig_data_rw(data, + buf, count, pos, iswrite); + + switch (pos) { + case VFIO_DEVICE_MIGRATION_OFFSET(device_state): + ret = vfio_pci_handle_mig_dev_state(data, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(pending_bytes): + ret = vfio_pci_handle_mig_pending_bytes(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_offset): + ret = vfio_pci_handle_mig_data_offset(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(data_size): + ret = vfio_pci_handle_mig_data_size(mig_ctl, + buf, count, iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(device_cmd): + ret = vfio_pci_handle_mig_dev_cmd(data, + buf, count, 
iswrite); + break; + case VFIO_DEVICE_MIGRATION_OFFSET(version_id): + ret = vfio_pci_handle_mig_drv_version(mig_ctl, + buf, count, iswrite); + break; + default: + dev_err(&vdev->pdev->dev, "invalid pos offset\n"); + ret = -EFAULT; + break; + } + + if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING && + mig_ctl->pending_bytes == data->state_size && + mig_ctl->data_size == data->state_size) { + if (vfio_pci_device_state_restore(data) != 0) { + dev_err(&vdev->pdev->dev, "Failed to restore device state!\n"); + return -EFAULT; + } + mig_ctl->pending_bytes = 0; + mig_ctl->data_size = 0; + } + + return ret; +} + +static void vfio_pci_dev_migrn_release(struct vfio_pci_device *vdev, + struct vfio_pci_region *region) +{ + struct vfio_pci_migration_data *data = region->data; + + if (data) { + kfree(data->mig_ctl); + kfree(data); + } +} + +static const struct vfio_pci_regops vfio_pci_migration_regops = { + .rw = vfio_pci_dev_migrn_rw, + .release = vfio_pci_dev_migrn_release, }; + +static int vfio_pci_migration_info_init(struct pci_dev *pdev, + struct vfio_device_migration_info *mig_info, + struct vfio_pci_vendor_mig_driver *mig_drv) { + int ret; + + ret = vfio_pci_device_get_info(pdev, mig_info, mig_drv); + if (ret) { + dev_err(&pdev->dev, "failed to get device info\n"); + return ret; + } + + if (mig_info->data_size > VFIO_MIGRATION_BUFFER_MAX_SIZE) { + dev_err(&pdev->dev, "mig_info->data_size %llu is invalid\n", + mig_info->data_size); + return -EINVAL; + } + + mig_info->data_offset = VFIO_MIGRATION_REGION_DATA_OFFSET; + return ret; +} + +static int vfio_device_mig_data_init(struct vfio_pci_device *vdev, + struct vfio_pci_migration_data *data) +{ + struct vfio_device_migration_info *mig_ctl; + u64 mig_offset; + int ret; + + mig_ctl = kzalloc(sizeof(*mig_ctl), GFP_KERNEL); + if (!mig_ctl) + return -ENOMEM; + + ret = vfio_pci_migration_info_init(vdev->pdev, mig_ctl, + data->mig_driver); + if (ret) { + dev_err(&vdev->pdev->dev, "get device info error!\n"); + goto err; + } + 
+ mig_offset = sizeof(struct vfio_device_migration_info); + data->state_size = mig_ctl->data_size; + data->mig_ctl = krealloc(mig_ctl, mig_offset + data->state_size, + GFP_KERNEL); + if (!data->mig_ctl) { + ret = -ENOMEM; + goto err; + } + + data->vf_data = (void *)((char *)data->mig_ctl + mig_offset); + memset(data->vf_data, 0, data->state_size); + data->mig_ctl->data_size = 0; + + ret = vfio_pci_register_dev_region(vdev, VFIO_REGION_TYPE_MIGRATION, + VFIO_REGION_SUBTYPE_MIGRATION, + &vfio_pci_migration_regops, mig_offset + data->state_size, + VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, data); + if (ret) { + kfree(data->mig_ctl); + return ret; + } + + return 0; +err: + kfree(mig_ctl); + return ret; +} + +int vfio_pci_migration_init(struct vfio_pci_device *vdev) { + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + struct vfio_pci_migration_data *data = NULL; + struct pci_dev *pdev = vdev->pdev; + int ret; + + mig_driver = vfio_pci_get_mig_driver(pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_err(&pdev->dev, "unable to find a mig_driver module\n"); + return -EINVAL; + } + + if (!try_module_get(mig_driver->owner)) { + pr_err("module %s is not live\n", mig_driver->owner->name); + return -ENODEV; + } + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + module_put(mig_driver->owner); + return -ENOMEM; + } + + data->mig_driver = mig_driver; + data->vf_dev = pdev; + + ret = vfio_device_mig_data_init(vdev, data); + if (ret) { + dev_err(&pdev->dev, "failed to init vfio device migration data!\n"); + goto err; + } + + return ret; +err: + kfree(data); + module_put(mig_driver->owner); + return ret; +} + +void vfio_pci_migration_exit(struct vfio_pci_device *vdev) { + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + mig_driver = vfio_pci_get_mig_driver(vdev->pdev); + if (!mig_driver || !mig_driver->dev_mig_ops) { + dev_warn(&vdev->pdev->dev, "mig_driver is not found\n"); + return; + } + + if 
(module_refcount(mig_driver->owner) > 0) { + vfio_pci_device_release(vdev->pdev, mig_driver); + module_put(mig_driver->owner); + } +} + +int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops, + struct module *mod, struct pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + if (!ops || !mod || !pdev) + return -EINVAL; + + mig_driver = vfio_pci_find_mig_drv(pdev, mod); + if (mig_driver) { + pr_info("%s migration ops has already been registered\n", + mod->name); + atomic_add(1, &mig_driver->count); + return 0; + } + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mig_driver = kzalloc(sizeof(*mig_driver), GFP_KERNEL); + if (!mig_driver) { + module_put(THIS_MODULE); + return -ENOMEM; + } + + mig_driver->pdev = pdev; + mig_driver->bus_num = pdev->bus->number; + mig_driver->owner = mod; + mig_driver->dev_mig_ops = ops; + + vfio_pci_add_mig_drv(mig_driver); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_register_migration_ops); + +void vfio_pci_unregister_migration_ops(struct module *mod, struct +pci_dev *pdev) { + struct vfio_pci_vendor_mig_driver *mig_driver = NULL; + + if (!mod || !pdev) + return; + + mig_driver = vfio_pci_find_mig_drv(pdev, mod); + if (!mig_driver) { + pr_err("mig_driver is not found\n"); + return; + } + + if (atomic_sub_and_test(1, &mig_driver->count)) { + vfio_pci_remove_mig_drv(mig_driver); + kfree(mig_driver); + module_put(THIS_MODULE); + pr_info("%s succeed to unregister migration ops\n", + THIS_MODULE->name); + } +} +EXPORT_SYMBOL_GPL(vfio_pci_unregister_migration_ops); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 17d2bae..03af269 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -15,6 +15,7 @@ #include <linux/pci.h> #include <linux/irqbypass.h> #include <linux/types.h> +#include <linux/vfio_pci_migration.h>
#ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -55,7 +56,7 @@ struct vfio_pci_irq_ctx { struct vfio_pci_region;
struct vfio_pci_regops { - size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, + ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region); @@ -173,4 +174,15 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif + +extern bool vfio_dev_migration_is_supported(struct pci_dev *pdev); +extern int vfio_pci_migration_init(struct vfio_pci_device *vdev); +extern void vfio_pci_migration_exit(struct vfio_pci_device *vdev); +extern int vfio_pci_device_log_start(struct vfio_pci_device *vdev, + struct vf_migration_log_info *log_info); extern int +vfio_pci_device_log_stop(struct vfio_pci_device *vdev, + uint32_t uuid); +extern int vfio_pci_device_log_status_query(struct vfio_pci_device +*vdev); extern int vfio_pci_device_init(struct pci_dev *pdev); extern +void vfio_pci_device_uninit(struct pci_dev *pdev); #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 7a386fb..35f2a29 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/vfio_pci_migration.h> #include <linux/wait.h> #include <linux/sched/signal.h>
@@ -40,6 +41,9 @@ #define DRIVER_AUTHOR "Alex Williamson alex.williamson@redhat.com" #define DRIVER_DESC "VFIO - User Level meta-driver"
+#define LOG_BUF_FRAG_SIZE (2 * 1024 * 1024) // fix to 2M #define +LOG_BUF_MAX_ADDRS_SIZE 128 // max vm ram size is 1T + static struct vfio { struct class *class; struct list_head iommu_drivers_list; @@ -57,6 +61,14 @@ struct vfio_iommu_driver { struct list_head vfio_next; };
+struct vfio_log_buf { + struct vfio_log_buf_info info; + int fd; + int buffer_state; + int device_state; + unsigned long *cpu_addrs; +}; + struct vfio_container { struct kref kref; struct list_head group_list; @@ -64,6 +76,7 @@ struct vfio_container { struct vfio_iommu_driver *iommu_driver; void *iommu_data; bool noiommu; + struct vfio_log_buf log_buf; };
struct vfio_unbound_dev { @@ -1158,8 +1171,398 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, return ret; }
+static long vfio_dispatch_cmd_to_devices(const struct vfio_container *container, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = NULL; + struct vfio_device *device = NULL; + long ret = -ENXIO; + + list_for_each_entry(group, &container->group_list, container_next) { + list_for_each_entry(device, &group->device_list, group_next) { + ret = device->ops->ioctl(device->device_data, cmd, arg); + if (ret) { + pr_err("dispatch cmd to devices failed\n"); + return ret; + } + } + } + return ret; +} + +static long vfio_log_buf_start(struct vfio_container *container) { + struct vfio_log_buf_ctl log_buf_ctl; + long ret; + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_START; + log_buf_ctl.data = (void *)&container->log_buf.info; + ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL, + (unsigned long)&log_buf_ctl); + if (ret) + return ret; + + container->log_buf.device_state = 1; + return 0; +} + +static long vfio_log_buf_stop(struct vfio_container *container) { + struct vfio_log_buf_ctl log_buf_ctl; + long ret; + + if (container->log_buf.device_state == 0) { + pr_warn("device already stopped\n"); + return 0; + } + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STOP; + log_buf_ctl.data = (void *)&container->log_buf.info; + ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL, + (unsigned long)&log_buf_ctl); + if (ret) + return ret; + + container->log_buf.device_state = 0; + return 0; +} + +static long vfio_log_buf_query(struct vfio_container *container) { + struct vfio_log_buf_ctl log_buf_ctl; + + log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info); + log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY; + log_buf_ctl.data = (void *)&container->log_buf.info; + + return vfio_dispatch_cmd_to_devices(container, + VFIO_DEVICE_LOG_BUF_CTL, (unsigned long)&log_buf_ctl); } + +static int 
vfio_log_buf_fops_mmap(struct file *filep, + struct vm_area_struct *vma) +{ + struct vfio_container *container = filep->private_data; + struct vfio_log_buf *log_buf = &container->log_buf; + unsigned long frag_pg_size; + unsigned long frag_offset; + phys_addr_t pa; + int ret = -EINVAL; + + if (!log_buf->cpu_addrs) { + pr_err("mmap before setup, please setup log buf first\n"); + return ret; + } + + if (log_buf->info.frag_size < PAGE_SIZE) { + pr_err("mmap frag size should not less than page size!\n"); + return ret; + } + + frag_pg_size = log_buf->info.frag_size / PAGE_SIZE; + frag_offset = vma->vm_pgoff / frag_pg_size; + + if (frag_offset >= log_buf->info.addrs_size) { + pr_err("mmap offset out of range!\n"); + return ret; + } + + if (vma->vm_end - vma->vm_start != log_buf->info.frag_size) { + pr_err("mmap size error, should be aligned with frag size!\n"); + return ret; + } + + pa = virt_to_phys((void *)log_buf->cpu_addrs[frag_offset]); + ret = remap_pfn_range(vma, vma->vm_start, + pa >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + if (ret) + pr_err("remap_pfn_range error!\n"); + return ret; +} + +static struct device *vfio_get_dev(struct vfio_container *container) { + struct vfio_group *group = NULL; + struct vfio_device *device = NULL; + + list_for_each_entry(group, &container->group_list, container_next) { + list_for_each_entry(device, &group->device_list, group_next) { + return device->dev; + } + } + return NULL; +} + +static void vfio_log_buf_release_dma(struct device *dev, + struct vfio_log_buf *log_buf) +{ + int i; + + for (i = 0; i < log_buf->info.addrs_size; i++) { + if ((log_buf->cpu_addrs && log_buf->cpu_addrs[i] != 0) && + (log_buf->info.sgevec && + log_buf->info.sgevec[i].addr != 0)) { + dma_free_coherent(dev, log_buf->info.frag_size, + (void *)log_buf->cpu_addrs[i], + log_buf->info.sgevec[i].addr); + log_buf->cpu_addrs[i] = 0; + log_buf->info.sgevec[i].addr = 0; + } + } +} + +static long vfio_log_buf_alloc_dma(struct 
vfio_log_buf_info *info, + struct vfio_log_buf *log_buf, struct device *dev) { + int i; + + for (i = 0; i < info->addrs_size; i++) { + log_buf->cpu_addrs[i] = (unsigned long)dma_alloc_coherent(dev, + info->frag_size, &log_buf->info.sgevec[i].addr, + GFP_KERNEL); + log_buf->info.sgevec[i].len = info->frag_size; + if (log_buf->cpu_addrs[i] == 0 || + log_buf->info.sgevec[i].addr == 0) { + return -ENOMEM; + } + } + return 0; +} + +static long vfio_log_buf_alloc_addrs(struct vfio_log_buf_info *info, + struct vfio_log_buf *log_buf) +{ + log_buf->info.sgevec = kcalloc(info->addrs_size, + sizeof(struct vfio_log_buf_sge), GFP_KERNEL); + if (!log_buf->info.sgevec) + return -ENOMEM; + + log_buf->cpu_addrs = kcalloc(info->addrs_size, + sizeof(unsigned long), GFP_KERNEL); + if (!log_buf->cpu_addrs) { + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; + return -ENOMEM; + } + + return 0; +} + +static long vfio_log_buf_info_valid(struct vfio_log_buf_info *info) { + if (info->addrs_size > LOG_BUF_MAX_ADDRS_SIZE || + info->addrs_size == 0) { + pr_err("can`t support vm ram size larger than 1T or equal to 0\n"); + return -EINVAL; + } + if (info->frag_size != LOG_BUF_FRAG_SIZE) { + pr_err("only support %d frag size\n", LOG_BUF_FRAG_SIZE); + return -EINVAL; + } + return 0; +} + +static long vfio_log_buf_setup(struct vfio_container *container, + unsigned long data) +{ + struct vfio_log_buf_info info; + struct vfio_log_buf *log_buf = &container->log_buf; + struct device *dev = NULL; + long ret; + + if (log_buf->info.sgevec) { + pr_warn("log buf already setup\n"); + return 0; + } + + if (copy_from_user(&info, (void __user *)data, + sizeof(struct vfio_log_buf_info))) + return -EFAULT; + + ret = vfio_log_buf_info_valid(&info); + if (ret) + return ret; + + ret = vfio_log_buf_alloc_addrs(&info, log_buf); + if (ret) + goto err_out; + + dev = vfio_get_dev(container); + if (!dev) { + pr_err("can`t get dev\n"); + goto err_free_addrs; + } + + ret = vfio_log_buf_alloc_dma(&info, log_buf, 
dev); + if (ret) + goto err_free_dma_array; + + log_buf->info.uuid = info.uuid; + log_buf->info.buffer_size = info.buffer_size; + log_buf->info.frag_size = info.frag_size; + log_buf->info.addrs_size = info.addrs_size; + log_buf->buffer_state = 1; + return 0; + +err_free_dma_array: + vfio_log_buf_release_dma(dev, log_buf); +err_free_addrs: + kfree(log_buf->cpu_addrs); + log_buf->cpu_addrs = NULL; + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; +err_out: + return -ENOMEM; +} + +static long vfio_log_buf_release_buffer(struct vfio_container +*container) { + struct vfio_log_buf *log_buf = &container->log_buf; + struct device *dev = NULL; + + if (log_buf->buffer_state == 0) { + pr_warn("buffer already released\n"); + return 0; + } + + dev = vfio_get_dev(container); + if (!dev) { + pr_err("can`t get dev\n"); + return -EFAULT; + } + + vfio_log_buf_release_dma(dev, log_buf); + + kfree(log_buf->cpu_addrs); + log_buf->cpu_addrs = NULL; + + kfree(log_buf->info.sgevec); + log_buf->info.sgevec = NULL; + + log_buf->buffer_state = 0; + return 0; +} + +static int vfio_log_buf_release(struct inode *inode, struct file +*filep) { + struct vfio_container *container = filep->private_data; + + vfio_log_buf_stop(container); + vfio_log_buf_release_buffer(container); + memset(&container->log_buf, 0, sizeof(struct vfio_log_buf)); + return 0; +} + +static long vfio_ioctl_handle_log_buf_ctl(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_log_buf_ctl log_buf_ctl; + long ret = 0; + + if (copy_from_user(&log_buf_ctl, (void __user *)arg, + sizeof(struct vfio_log_buf_ctl))) + return -EFAULT; + + switch (log_buf_ctl.flags) { + case VFIO_DEVICE_LOG_BUF_FLAG_SETUP: + ret = vfio_log_buf_setup(container, + (unsigned long)log_buf_ctl.data); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_RELEASE: + ret = vfio_log_buf_release_buffer(container); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_START: + ret = vfio_log_buf_start(container); + break; + case 
VFIO_DEVICE_LOG_BUF_FLAG_STOP: + ret = vfio_log_buf_stop(container); + break; + case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY: + ret = vfio_log_buf_query(container); + break; + default: + pr_err("log buf control flag incorrect\n"); + ret = -EINVAL; + break; + } + return ret; +} + +static long vfio_log_buf_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_container *container = filep->private_data; + long ret = -EINVAL; + + switch (cmd) { + case VFIO_LOG_BUF_CTL: + ret = vfio_ioctl_handle_log_buf_ctl(container, arg); + break; + default: + pr_err("log buf control cmd incorrect\n"); + break; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long vfio_log_buf_fops_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_log_buf_fops_unl_ioctl(filep, cmd, arg); } #endif /* +CONFIG_COMPAT */ + +static const struct file_operations vfio_log_buf_fops = { + .owner = THIS_MODULE, + .mmap = vfio_log_buf_fops_mmap, + .unlocked_ioctl = vfio_log_buf_fops_unl_ioctl, + .release = vfio_log_buf_release, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_log_buf_fops_compat_ioctl, +#endif +}; + +static int vfio_get_log_buf_fd(struct vfio_container *container, + unsigned long arg) +{ + struct file *filep = NULL; + int ret; + + if (container->log_buf.fd > 0) + return container->log_buf.fd; + + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) { + pr_err("get_unused_fd_flags get fd failed\n"); + return ret; + } + + filep = anon_inode_getfile("[vfio-log-buf]", &vfio_log_buf_fops, + container, O_RDWR); + if (IS_ERR(filep)) { + pr_err("anon_inode_getfile failed\n"); + put_unused_fd(ret); + ret = PTR_ERR(filep); + return ret; + } + + filep->f_mode |= (FMODE_READ | FMODE_WRITE | FMODE_LSEEK); + + fd_install(ret, filep); + + container->log_buf.fd = ret; + return ret; +} + static long vfio_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) + unsigned int cmd, 
unsigned long arg) { struct vfio_container *container = filep->private_data; struct vfio_iommu_driver *driver; @@ -1179,6 +1582,9 @@ static long vfio_fops_unl_ioctl(struct file *filep, case VFIO_SET_IOMMU: ret = vfio_ioctl_set_iommu(container, arg); break; + case VFIO_GET_LOG_BUF_FD: + ret = vfio_get_log_buf_fd(container, arg); + break; default: driver = container->iommu_driver; data = container->iommu_data; @@ -1210,6 +1616,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) INIT_LIST_HEAD(&container->group_list); init_rwsem(&container->group_lock); kref_init(&container->kref); + memset(&container->log_buf, 0, sizeof(struct vfio_log_buf));
filep->private_data = container;
@@ -1219,9 +1626,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; - filep->private_data = NULL; - vfio_container_put(container);
return 0; diff --git a/include/linux/vfio_pci_migration.h b/include/linux/vfio_pci_migration.h new file mode 100644 index 0000000..464ffb4 --- /dev/null +++ b/include/linux/vfio_pci_migration.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved. + */ + +#ifndef VFIO_PCI_MIGRATION_H +#define VFIO_PCI_MIGRATION_H + +#include <linux/types.h> +#include <linux/pci.h> + +#define VFIO_REGION_TYPE_MIGRATION (3) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ #define +VFIO_REGION_SUBTYPE_MIGRATION (1) + +#define VFIO_MIGRATION_BUFFER_MAX_SIZE SZ_256K #define +VFIO_MIGRATION_REGION_DATA_OFFSET \ + (sizeof(struct vfio_device_migration_info)) #define +VFIO_DEVICE_MIGRATION_OFFSET(x) \ + offsetof(struct vfio_device_migration_info, x) + +struct vfio_device_migration_info { + __u32 device_state; /* VFIO device state */ #define +VFIO_DEVICE_STATE_STOP (0) #define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) #define +VFIO_DEVICE_STATE_RESUMING (1 << 2) #define VFIO_DEVICE_STATE_MASK +(VFIO_DEVICE_STATE_RUNNING | \ + VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING) + __u32 reserved; + + __u32 device_cmd; + __u32 version_id; + + __u64 pending_bytes; + __u64 data_offset; + __u64 data_size; +}; + +enum { + VFIO_DEVICE_STOP = 0xffff0001, + VFIO_DEVICE_CONTINUE, + VFIO_DEVICE_MIGRATION_CANCEL, +}; + +struct vfio_log_buf_sge { + __u64 len; + __u64 addr; +}; + +struct vfio_log_buf_info { + __u32 uuid; + __u64 buffer_size; + __u64 addrs_size; + __u64 frag_size; + struct vfio_log_buf_sge *sgevec; +}; + +struct vfio_log_buf_ctl { + __u32 argsz; + __u32 flags; + #define VFIO_DEVICE_LOG_BUF_FLAG_SETUP (1 << 0) + #define VFIO_DEVICE_LOG_BUF_FLAG_RELEASE (1 << 1) + #define VFIO_DEVICE_LOG_BUF_FLAG_START (1 << 2) + #define VFIO_DEVICE_LOG_BUF_FLAG_STOP (1 << 3) + #define VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY (1 << 4) + void *data; +}; +#define VFIO_LOG_BUF_CTL _IO(VFIO_TYPE, 
VFIO_BASE + 21) #define +VFIO_GET_LOG_BUF_FD _IO(VFIO_TYPE, VFIO_BASE + 22) #define +VFIO_DEVICE_LOG_BUF_CTL _IO(VFIO_TYPE, VFIO_BASE + 23) + +struct vf_migration_log_info { + __u32 dom_uuid; + __u64 buffer_size; + __u64 sge_len; + __u64 sge_num; + struct vfio_log_buf_sge *sgevec; +}; + +struct vfio_device_migration_ops { + /* Get device information */ + int (*get_info)(struct pci_dev *pdev, + struct vfio_device_migration_info *info); + /* Enable a vf device */ + int (*enable)(struct pci_dev *pdev); + /* Disable a vf device */ + int (*disable)(struct pci_dev *pdev); + /* Save a vf device */ + int (*save)(struct pci_dev *pdev, void *base, + uint64_t off, uint64_t count); + /* Resuming a vf device */ + int (*restore)(struct pci_dev *pdev, void *base, + uint64_t off, uint64_t count); + /* Log start a vf device */ + int (*log_start)(struct pci_dev *pdev, + struct vf_migration_log_info *log_info); + /* Log stop a vf device */ + int (*log_stop)(struct pci_dev *pdev, uint32_t uuid); + /* Get vf device log status */ + int (*get_log_status)(struct pci_dev *pdev); + /* Pre enable a vf device(load_setup, before restore a vf) */ + int (*pre_enable)(struct pci_dev *pdev); + /* Cancel a vf device when live migration failed (rollback) */ + int (*cancel)(struct pci_dev *pdev); + /* Init a vf device */ + int (*init)(struct pci_dev *pdev); + /* Uninit a vf device */ + void (*uninit)(struct pci_dev *pdev); + /* Release a vf device */ + void (*release)(struct pci_dev *pdev); }; + +struct vfio_pci_vendor_mig_driver { + struct pci_dev *pdev; + unsigned char bus_num; + struct vfio_device_migration_ops *dev_mig_ops; + struct module *owner; + atomic_t count; + struct list_head list; +}; + +struct vfio_pci_migration_data { + u64 state_size; + struct pci_dev *vf_dev; + struct vfio_pci_vendor_mig_driver *mig_driver; + struct vfio_device_migration_info *mig_ctl; + void *vf_data; +}; + +int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops, + struct module *mod, struct 
pci_dev *pdev); void +vfio_pci_unregister_migration_ops(struct module *mod, + struct pci_dev *pdev); + +#endif /* VFIO_PCI_MIGRATION_H */ -- 1.8.3.1
on 2022/6/24 11:13, RongWang wrote:
From: Rong Wang w_angrong@163.com
kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5CO9A CVE: NA
With pass-through devices, the hypervisor can't control the device's status and can't track dirty memory DMA'd from the device during migration. The goal of this framework is to work together with hardware to accomplish the tasks above.
qemu
  | status control and dirty memory report
vfio
  | ops to hardware
hardware
Signed-off-by: Rong Wang w_angrong@163.com Signed-off-by: HuHua Li 18245010845@163.com Signed-off-by: Ripeng Qiu 965412048@qq.com
drivers/vfio/pci/Makefile | 2 +- drivers/vfio/pci/vfio_pci.c | 54 +++ drivers/vfio/pci/vfio_pci_migration.c | 755 ++++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_private.h | 14 +- drivers/vfio/vfio.c | 411 +++++++++++++++++- include/linux/vfio_pci_migration.h | 136 ++++++ 6 files changed, 1367 insertions(+), 5 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_migration.c create mode 100644 include/linux/vfio_pci_migration.h
Please name the driver file after the model of your network card. Do not use the generic "pci" name, to prevent it from being confused with the vfio-pci framework itself.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 76d8ec0..80a777d 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -1,5 +1,5 @@
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o +vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio_pci_migration.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 51b791c..59d8280 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -30,6 +30,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/vfio_pci_migration.h>
Do not extend the private data types of your own network card devices into the public framework file vfio_pci.c. Please keep the driver independent and do not pollute the framework.
#include "vfio_pci_private.h"
@@ -296,6 +297,14 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
vfio_pci_probe_mmaps(vdev);
- if (vfio_dev_migration_is_supported(pdev)) {
ret = vfio_pci_migration_init(vdev);
if (ret) {
dev_warn(&vdev->pdev->dev, "Failed to init vfio_pci_migration\n");
vfio_pci_disable(vdev);
return ret;
}
- } return 0;
}
@@ -392,6 +401,7 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev) out: pci_disable_device(pdev);
vfio_pci_migration_exit(vdev); vfio_pci_try_bus_reset(vdev);
if (!disable_idle_d3)
@@ -642,6 +652,41 @@ struct vfio_devices { int max_index; };
+static long vfio_pci_handle_log_buf_ctl(struct vfio_pci_device *vdev,
- const unsigned long arg)
+{
Don't put the private logging capabilities of your own NIC devices into the public framework.
- struct vfio_log_buf_ctl *log_buf_ctl = NULL;
- struct vfio_log_buf_info *log_buf_info = NULL;
- struct vf_migration_log_info migration_log_info;
- long ret = 0;
- log_buf_ctl = (struct vfio_log_buf_ctl *)arg;
- log_buf_info = (struct vfio_log_buf_info *)log_buf_ctl->data;
- switch (log_buf_ctl->flags) {
- case VFIO_DEVICE_LOG_BUF_FLAG_START:
migration_log_info.dom_uuid = log_buf_info->uuid;
migration_log_info.buffer_size =
log_buf_info->buffer_size;
migration_log_info.sge_num = log_buf_info->addrs_size;
migration_log_info.sge_len = log_buf_info->frag_size;
migration_log_info.sgevec = log_buf_info->sgevec;
ret = vfio_pci_device_log_start(vdev,
&migration_log_info);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_STOP:
ret = vfio_pci_device_log_stop(vdev,
log_buf_info->uuid);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY:
ret = vfio_pci_device_log_status_query(vdev);
break;
- default:
ret = -EINVAL;
break;
- }
- return ret;
+} static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -1142,6 +1187,8 @@ static long vfio_pci_ioctl(void *device_data,
return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd);
} else if (cmd == VFIO_DEVICE_LOG_BUF_CTL) {
return vfio_pci_handle_log_buf_ctl(vdev, arg);
}
return -ENOTTY;
@@ -1566,6 +1613,9 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_power_state(pdev, PCI_D3hot); }
- if (vfio_dev_migration_is_supported(pdev))
ret = vfio_pci_device_init(pdev);
- return ret;
}
@@ -1591,6 +1641,10 @@ static void vfio_pci_remove(struct pci_dev *pdev)
if (!disable_idle_d3) pci_set_power_state(pdev, PCI_D0);
- if (vfio_dev_migration_is_supported(pdev)) {
vfio_pci_device_uninit(pdev);
- }
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/vfio_pci_migration.c b/drivers/vfio/pci/vfio_pci_migration.c new file mode 100644 index 0000000..f69cd13 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_migration.c @@ -0,0 +1,755 @@ +// SPDX-License-Identifier: GPL-2.0 +/*
- Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved.
- */
+#include <linux/module.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_migration.h>
+#include "vfio_pci_private.h"
+static LIST_HEAD(vfio_pci_mig_drivers_list); +static DEFINE_MUTEX(vfio_pci_mig_drivers_mutex);
+static void vfio_pci_add_mig_drv(struct vfio_pci_vendor_mig_driver *mig_drv) +{
- mutex_lock(&vfio_pci_mig_drivers_mutex);
- atomic_set(&mig_drv->count, 1);
- list_add_tail(&mig_drv->list, &vfio_pci_mig_drivers_list);
- mutex_unlock(&vfio_pci_mig_drivers_mutex);
+}
+static void vfio_pci_remove_mig_drv(struct vfio_pci_vendor_mig_driver *mig_drv) +{
- mutex_lock(&vfio_pci_mig_drivers_mutex);
- list_del(&mig_drv->list);
- mutex_unlock(&vfio_pci_mig_drivers_mutex);
+}
+static struct vfio_pci_vendor_mig_driver *
- vfio_pci_find_mig_drv(struct pci_dev *pdev, struct module *module)
+{
- struct vfio_pci_vendor_mig_driver *mig_drv = NULL;
- mutex_lock(&vfio_pci_mig_drivers_mutex);
- list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) {
if (mig_drv->owner == module) {
if (mig_drv->bus_num == pdev->bus->number)
goto out;
}
- }
- mig_drv = NULL;
+out:
- mutex_unlock(&vfio_pci_mig_drivers_mutex);
- return mig_drv;
+}
+static struct vfio_pci_vendor_mig_driver *
- vfio_pci_get_mig_driver(struct pci_dev *pdev)
+{
- struct vfio_pci_vendor_mig_driver *mig_drv = NULL;
- struct pci_dev *pf_dev = pci_physfn(pdev);
- mutex_lock(&vfio_pci_mig_drivers_mutex);
- list_for_each_entry(mig_drv, &vfio_pci_mig_drivers_list, list) {
if (mig_drv->bus_num == pf_dev->bus->number)
goto out;
- }
- mig_drv = NULL;
+out:
- mutex_unlock(&vfio_pci_mig_drivers_mutex);
- return mig_drv;
+}
+bool vfio_dev_migration_is_supported(struct pci_dev *pdev) +{
- struct vfio_pci_vendor_mig_driver *mig_driver = NULL;
- mig_driver = vfio_pci_get_mig_driver(pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_warn(&pdev->dev, "unable to find a mig_drv module\n");
return false;
- }
- return true;
+}
+int vfio_pci_device_log_start(struct vfio_pci_device *vdev,
- struct vf_migration_log_info *log_info)
+{
- struct vfio_pci_vendor_mig_driver *mig_driver;
- mig_driver = vfio_pci_get_mig_driver(vdev->pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n");
return -EFAULT;
- }
- if (!mig_driver->dev_mig_ops->log_start ||
(mig_driver->dev_mig_ops->log_start(vdev->pdev,
log_info) != 0)) {
dev_err(&vdev->pdev->dev, "failed to set log start\n");
return -EFAULT;
- }
- return 0;
+}
+int vfio_pci_device_log_stop(struct vfio_pci_device *vdev, uint32_t uuid) +{
- struct vfio_pci_vendor_mig_driver *mig_driver;
- mig_driver = vfio_pci_get_mig_driver(vdev->pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n");
return -EFAULT;
- }
- if (!mig_driver->dev_mig_ops->log_stop ||
(mig_driver->dev_mig_ops->log_stop(vdev->pdev, uuid) != 0)) {
dev_err(&vdev->pdev->dev, "failed to set log stop\n");
return -EFAULT;
- }
- return 0;
+}
+int vfio_pci_device_log_status_query(struct vfio_pci_device *vdev) +{
- struct vfio_pci_vendor_mig_driver *mig_driver;
- mig_driver = vfio_pci_get_mig_driver(vdev->pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_err(&vdev->pdev->dev, "unable to find a mig_drv module\n");
return -EFAULT;
- }
- if (!mig_driver->dev_mig_ops->get_log_status ||
(mig_driver->dev_mig_ops->get_log_status(vdev->pdev) != 0)) {
dev_err(&vdev->pdev->dev, "failed to get log status\n");
return -EFAULT;
- }
- return 0;
+}
+int vfio_pci_device_init(struct pci_dev *pdev) +{
- struct vfio_pci_vendor_mig_driver *mig_drv;
- mig_drv = vfio_pci_get_mig_driver(pdev);
- if (!mig_drv || !mig_drv->dev_mig_ops) {
dev_err(&pdev->dev, "unable to find a mig_drv module\n");
return -EFAULT;
- }
- if (mig_drv->dev_mig_ops->init)
return mig_drv->dev_mig_ops->init(pdev);
- return -EFAULT;
+}
+void vfio_pci_device_uninit(struct pci_dev *pdev) +{
- struct vfio_pci_vendor_mig_driver *mig_drv;
- mig_drv = vfio_pci_get_mig_driver(pdev);
- if (!mig_drv || !mig_drv->dev_mig_ops) {
dev_err(&pdev->dev, "unable to find a mig_drv module\n");
return;
- }
- if (mig_drv->dev_mig_ops->uninit)
mig_drv->dev_mig_ops->uninit(pdev);
+}
+static void vfio_pci_device_release(struct pci_dev *pdev,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- if (mig_drv->dev_mig_ops->release)
mig_drv->dev_mig_ops->release(pdev);
+}
+static int vfio_pci_device_get_info(struct pci_dev *pdev,
- struct vfio_device_migration_info *mig_info,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- if (mig_drv->dev_mig_ops->get_info)
return mig_drv->dev_mig_ops->get_info(pdev, mig_info);
- return -EFAULT;
+}
+static int vfio_pci_device_enable(struct pci_dev *pdev,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- if (!mig_drv->dev_mig_ops->enable ||
(mig_drv->dev_mig_ops->enable(pdev) != 0)) {
return -EINVAL;
- }
- return 0;
+}
+static int vfio_pci_device_disable(struct pci_dev *pdev,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- if (!mig_drv->dev_mig_ops->disable ||
(mig_drv->dev_mig_ops->disable(pdev) != 0))
return -EINVAL;
- return 0;
+}
+static int vfio_pci_device_pre_enable(struct pci_dev *pdev,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- if (!mig_drv->dev_mig_ops->pre_enable ||
(mig_drv->dev_mig_ops->pre_enable(pdev) != 0))
return -EINVAL;
- return 0;
+}
+static int vfio_pci_device_state_save(struct pci_dev *pdev,
- struct vfio_pci_migration_data *data)
+{
- struct vfio_device_migration_info *mig_info = data->mig_ctl;
- struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver;
- void *base = (void *)mig_info;
- int ret = 0;
- if ((mig_info->device_state & VFIO_DEVICE_STATE_RUNNING) != 0) {
ret = vfio_pci_device_disable(pdev, mig_drv);
if (ret) {
dev_err(&pdev->dev, "failed to stop VF function!\n");
return ret;
}
mig_info->device_state &= ~VFIO_DEVICE_STATE_RUNNING;
- }
- if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->save) {
ret = mig_drv->dev_mig_ops->save(pdev, base,
mig_info->data_offset, data->state_size);
if (ret) {
dev_err(&pdev->dev, "failed to save device state!\n");
return -EINVAL;
}
- } else {
return -EFAULT;
- }
- mig_info->data_size = data->state_size;
- mig_info->pending_bytes = mig_info->data_size;
- return ret;
+}
+static int vfio_pci_device_state_restore(struct vfio_pci_migration_data *data) +{
- struct vfio_device_migration_info *mig_info = data->mig_ctl;
- struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver;
- struct pci_dev *pdev = data->vf_dev;
- void *base = (void *)mig_info;
- int ret;
- if (mig_drv->dev_mig_ops && mig_drv->dev_mig_ops->restore) {
ret = mig_drv->dev_mig_ops->restore(pdev, base,
mig_info->data_offset, mig_info->data_size);
if (ret) {
dev_err(&pdev->dev, "failed to restore device state!\n");
return -EINVAL;
}
return 0;
- }
- return -EFAULT;
+}
+static int vfio_pci_set_device_state(struct vfio_pci_migration_data *data,
- u32 state)
+{
- struct vfio_device_migration_info *mig_ctl = data->mig_ctl;
- struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver;
- struct pci_dev *pdev = data->vf_dev;
- int ret = 0;
- if (state == mig_ctl->device_state)
return 0;
- if (!mig_drv->dev_mig_ops)
return -EINVAL;
- switch (state) {
- case VFIO_DEVICE_STATE_RUNNING:
if (!(mig_ctl->device_state &
VFIO_DEVICE_STATE_RUNNING))
ret = vfio_pci_device_enable(pdev, mig_drv);
break;
- case VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING:
/*
* (pre-copy) - device should start logging data.
*/
ret = 0;
break;
- case VFIO_DEVICE_STATE_SAVING:
/* stop the vf function, save state */
ret = vfio_pci_device_state_save(pdev, data);
break;
- case VFIO_DEVICE_STATE_STOP:
if (mig_ctl->device_state & VFIO_DEVICE_STATE_RUNNING)
ret = vfio_pci_device_disable(pdev, mig_drv);
break;
- case VFIO_DEVICE_STATE_RESUMING:
ret = vfio_pci_device_pre_enable(pdev, mig_drv);
break;
- default:
ret = -EFAULT;
break;
- }
- if (ret)
return ret;
- mig_ctl->device_state = state;
- return 0;
+}
+static ssize_t vfio_pci_handle_mig_dev_state(
- struct vfio_pci_migration_data *data,
- char __user *buf, size_t count, bool iswrite)
+{
- struct vfio_device_migration_info *mig_ctl = data->mig_ctl;
- u32 device_state;
- int ret;
- if (count != sizeof(device_state))
return -EINVAL;
- if (iswrite) {
if (copy_from_user(&device_state, buf, count))
return -EFAULT;
ret = vfio_pci_set_device_state(data, device_state);
if (ret)
return ret;
- } else {
if (copy_to_user(buf, &mig_ctl->device_state, count))
return -EFAULT;
- }
- return count;
+}
+static ssize_t vfio_pci_handle_mig_pending_bytes(
- struct vfio_device_migration_info *mig_info,
- char __user *buf, size_t count, bool iswrite)
+{
- u64 pending_bytes;
- if (count != sizeof(pending_bytes) || iswrite)
return -EINVAL;
- if (mig_info->device_state ==
(VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RUNNING)) {
/* In pre-copy state we have no data to return for now,
* return 0 pending bytes
*/
pending_bytes = 0;
- } else {
pending_bytes = mig_info->pending_bytes;
- }
- if (copy_to_user(buf, &pending_bytes, count))
return -EFAULT;
- return count;
+}
+static ssize_t vfio_pci_handle_mig_data_offset(
- struct vfio_device_migration_info *mig_info,
- char __user *buf, size_t count, bool iswrite)
+{
- u64 data_offset = mig_info->data_offset;
- if (count != sizeof(data_offset) || iswrite)
return -EINVAL;
- if (copy_to_user(buf, &data_offset, count))
return -EFAULT;
- return count;
+}
+static ssize_t vfio_pci_handle_mig_data_size(
- struct vfio_device_migration_info *mig_info,
- char __user *buf, size_t count, bool iswrite)
+{
- u64 data_size;
- if (count != sizeof(data_size))
return -EINVAL;
- if (iswrite) {
/* data_size is writable only during resuming state */
if (mig_info->device_state != VFIO_DEVICE_STATE_RESUMING)
return -EINVAL;
if (copy_from_user(&data_size, buf, sizeof(data_size)))
return -EFAULT;
mig_info->data_size = data_size;
- } else {
if (mig_info->device_state != VFIO_DEVICE_STATE_SAVING)
return -EINVAL;
if (copy_to_user(buf, &mig_info->data_size,
sizeof(data_size)))
return -EFAULT;
- }
- return count;
+}
+static ssize_t vfio_pci_handle_mig_dev_cmd(struct vfio_pci_migration_data *data,
- char __user *buf, size_t count, bool iswrite)
+{
- struct vfio_pci_vendor_mig_driver *mig_drv = data->mig_driver;
- struct pci_dev *pdev = data->vf_dev;
- u32 device_cmd;
- int ret = -EFAULT;
- if (count != sizeof(device_cmd) || !iswrite || !mig_drv->dev_mig_ops)
return -EINVAL;
- if (copy_from_user(&device_cmd, buf, count))
return -EFAULT;
- switch (device_cmd) {
- case VFIO_DEVICE_MIGRATION_CANCEL:
if (mig_drv->dev_mig_ops->cancel)
ret = mig_drv->dev_mig_ops->cancel(pdev);
break;
- default:
dev_err(&pdev->dev, "cmd is invaild\n");
return -EINVAL;
- }
- if (ret != 0)
return ret;
- return count;
+}
+static ssize_t vfio_pci_handle_mig_drv_version(
- struct vfio_device_migration_info *mig_info,
- char __user *buf, size_t count, bool iswrite)
+{
- u32 version_id = mig_info->version_id;
- if (count != sizeof(version_id) || iswrite)
return -EINVAL;
- if (copy_to_user(buf, &version_id, count))
return -EFAULT;
- return count;
+}
+static ssize_t vfio_pci_handle_mig_data_rw(
- struct vfio_pci_migration_data *data,
- char __user *buf, size_t count, u64 pos, bool iswrite)
+{
- struct vfio_device_migration_info *mig_ctl = data->mig_ctl;
- void *data_addr = data->vf_data;
- if (count == 0) {
dev_err(&data->vf_dev->dev, "qemu operation data size error!\n");
return -EINVAL;
- }
- data_addr += pos - mig_ctl->data_offset;
- if (iswrite) {
if (copy_from_user(data_addr, buf, count))
return -EFAULT;
mig_ctl->pending_bytes += count;
if (mig_ctl->pending_bytes > data->state_size)
return -EINVAL;
- } else {
if (copy_to_user(buf, data_addr, count))
return -EFAULT;
if (mig_ctl->pending_bytes < count)
return -EINVAL;
mig_ctl->pending_bytes -= count;
- }
- return count;
+}
+static ssize_t vfio_pci_dev_migrn_rw(struct vfio_pci_device *vdev,
- char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
- unsigned int index =
VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
- struct vfio_pci_migration_data *data =
(struct vfio_pci_migration_data *)vdev->region[index].data;
- loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
- struct vfio_device_migration_info *mig_ctl = data->mig_ctl;
- int ret;
- if (pos >= vdev->region[index].size)
return -EINVAL;
- count = min(count, (size_t)(vdev->region[index].size - pos));
- if (pos >= VFIO_MIGRATION_REGION_DATA_OFFSET)
return vfio_pci_handle_mig_data_rw(data,
buf, count, pos, iswrite);
- switch (pos) {
- case VFIO_DEVICE_MIGRATION_OFFSET(device_state):
ret = vfio_pci_handle_mig_dev_state(data,
buf, count, iswrite);
break;
- case VFIO_DEVICE_MIGRATION_OFFSET(pending_bytes):
ret = vfio_pci_handle_mig_pending_bytes(mig_ctl,
buf, count, iswrite);
break;
- case VFIO_DEVICE_MIGRATION_OFFSET(data_offset):
ret = vfio_pci_handle_mig_data_offset(mig_ctl,
buf, count, iswrite);
break;
- case VFIO_DEVICE_MIGRATION_OFFSET(data_size):
ret = vfio_pci_handle_mig_data_size(mig_ctl,
buf, count, iswrite);
break;
- case VFIO_DEVICE_MIGRATION_OFFSET(device_cmd):
ret = vfio_pci_handle_mig_dev_cmd(data,
buf, count, iswrite);
break;
- case VFIO_DEVICE_MIGRATION_OFFSET(version_id):
ret = vfio_pci_handle_mig_drv_version(mig_ctl,
buf, count, iswrite);
break;
- default:
dev_err(&vdev->pdev->dev, "invalid pos offset\n");
ret = -EFAULT;
break;
- }
- if (mig_ctl->device_state == VFIO_DEVICE_STATE_RESUMING &&
mig_ctl->pending_bytes == data->state_size &&
mig_ctl->data_size == data->state_size) {
if (vfio_pci_device_state_restore(data) != 0) {
dev_err(&vdev->pdev->dev, "Failed to restore device state!\n");
return -EFAULT;
}
mig_ctl->pending_bytes = 0;
mig_ctl->data_size = 0;
- }
- return ret;
+}
+static void vfio_pci_dev_migrn_release(struct vfio_pci_device *vdev,
- struct vfio_pci_region *region)
+{
- struct vfio_pci_migration_data *data = region->data;
- if (data) {
kfree(data->mig_ctl);
kfree(data);
- }
+}
+static const struct vfio_pci_regops vfio_pci_migration_regops = {
- .rw = vfio_pci_dev_migrn_rw,
- .release = vfio_pci_dev_migrn_release,
+};
+static int vfio_pci_migration_info_init(struct pci_dev *pdev,
- struct vfio_device_migration_info *mig_info,
- struct vfio_pci_vendor_mig_driver *mig_drv)
+{
- int ret;
- ret = vfio_pci_device_get_info(pdev, mig_info, mig_drv);
- if (ret) {
dev_err(&pdev->dev, "failed to get device info\n");
return ret;
- }
- if (mig_info->data_size > VFIO_MIGRATION_BUFFER_MAX_SIZE) {
dev_err(&pdev->dev, "mig_info->data_size %llu is invalid\n",
mig_info->data_size);
return -EINVAL;
- }
- mig_info->data_offset = VFIO_MIGRATION_REGION_DATA_OFFSET;
- return ret;
+}
+static int vfio_device_mig_data_init(struct vfio_pci_device *vdev,
- struct vfio_pci_migration_data *data)
+{
- struct vfio_device_migration_info *mig_ctl;
- u64 mig_offset;
- int ret;
- mig_ctl = kzalloc(sizeof(*mig_ctl), GFP_KERNEL);
- if (!mig_ctl)
return -ENOMEM;
- ret = vfio_pci_migration_info_init(vdev->pdev, mig_ctl,
data->mig_driver);
- if (ret) {
dev_err(&vdev->pdev->dev, "get device info error!\n");
goto err;
- }
- mig_offset = sizeof(struct vfio_device_migration_info);
- data->state_size = mig_ctl->data_size;
- data->mig_ctl = krealloc(mig_ctl, mig_offset + data->state_size,
GFP_KERNEL);
- if (!data->mig_ctl) {
ret = -ENOMEM;
goto err;
- }
- data->vf_data = (void *)((char *)data->mig_ctl + mig_offset);
- memset(data->vf_data, 0, data->state_size);
- data->mig_ctl->data_size = 0;
- ret = vfio_pci_register_dev_region(vdev, VFIO_REGION_TYPE_MIGRATION,
VFIO_REGION_SUBTYPE_MIGRATION,
&vfio_pci_migration_regops, mig_offset + data->state_size,
VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE, data);
- if (ret) {
kfree(data->mig_ctl);
return ret;
- }
- return 0;
+err:
- kfree(mig_ctl);
- return ret;
+}
+int vfio_pci_migration_init(struct vfio_pci_device *vdev) +{
- struct vfio_pci_vendor_mig_driver *mig_driver = NULL;
- struct vfio_pci_migration_data *data = NULL;
- struct pci_dev *pdev = vdev->pdev;
- int ret;
- mig_driver = vfio_pci_get_mig_driver(pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_err(&pdev->dev, "unable to find a mig_driver module\n");
return -EINVAL;
- }
- if (!try_module_get(mig_driver->owner)) {
pr_err("module %s is not live\n", mig_driver->owner->name);
return -ENODEV;
- }
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data) {
module_put(mig_driver->owner);
return -ENOMEM;
- }
- data->mig_driver = mig_driver;
- data->vf_dev = pdev;
- ret = vfio_device_mig_data_init(vdev, data);
- if (ret) {
dev_err(&pdev->dev, "failed to init vfio device migration data!\n");
goto err;
- }
- return ret;
+err:
- kfree(data);
- module_put(mig_driver->owner);
- return ret;
+}
+void vfio_pci_migration_exit(struct vfio_pci_device *vdev) +{
- struct vfio_pci_vendor_mig_driver *mig_driver = NULL;
- mig_driver = vfio_pci_get_mig_driver(vdev->pdev);
- if (!mig_driver || !mig_driver->dev_mig_ops) {
dev_warn(&vdev->pdev->dev, "mig_driver is not found\n");
return;
- }
- if (module_refcount(mig_driver->owner) > 0) {
vfio_pci_device_release(vdev->pdev, mig_driver);
module_put(mig_driver->owner);
- }
+}
+int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops,
- struct module *mod, struct pci_dev *pdev)
+{
- struct vfio_pci_vendor_mig_driver *mig_driver = NULL;
- if (!ops || !mod || !pdev)
return -EINVAL;
- mig_driver = vfio_pci_find_mig_drv(pdev, mod);
- if (mig_driver) {
pr_info("%s migration ops has already been registered\n",
mod->name);
atomic_add(1, &mig_driver->count);
return 0;
- }
- if (!try_module_get(THIS_MODULE))
return -ENODEV;
- mig_driver = kzalloc(sizeof(*mig_driver), GFP_KERNEL);
- if (!mig_driver) {
module_put(THIS_MODULE);
return -ENOMEM;
- }
- mig_driver->pdev = pdev;
- mig_driver->bus_num = pdev->bus->number;
- mig_driver->owner = mod;
- mig_driver->dev_mig_ops = ops;
- vfio_pci_add_mig_drv(mig_driver);
- return 0;
+} +EXPORT_SYMBOL_GPL(vfio_pci_register_migration_ops);
+void vfio_pci_unregister_migration_ops(struct module *mod, struct pci_dev *pdev) +{
- struct vfio_pci_vendor_mig_driver *mig_driver = NULL;
- if (!mod || !pdev)
return;
- mig_driver = vfio_pci_find_mig_drv(pdev, mod);
- if (!mig_driver) {
pr_err("mig_driver is not found\n");
return;
- }
- if (atomic_sub_and_test(1, &mig_driver->count)) {
vfio_pci_remove_mig_drv(mig_driver);
kfree(mig_driver);
module_put(THIS_MODULE);
pr_info("%s succeed to unregister migration ops\n",
THIS_MODULE->name);
- }
+} +EXPORT_SYMBOL_GPL(vfio_pci_unregister_migration_ops); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 17d2bae..03af269 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -15,6 +15,7 @@ #include <linux/pci.h> #include <linux/irqbypass.h> #include <linux/types.h> +#include <linux/vfio_pci_migration.h>
#ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -55,7 +56,7 @@ struct vfio_pci_irq_ctx { struct vfio_pci_region;
struct vfio_pci_regops {
- size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf,
- ssize_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); void (*release)(struct vfio_pci_device *vdev, struct vfio_pci_region *region);
@@ -173,4 +174,15 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif
+extern bool vfio_dev_migration_is_supported(struct pci_dev *pdev); +extern int vfio_pci_migration_init(struct vfio_pci_device *vdev); +extern void vfio_pci_migration_exit(struct vfio_pci_device *vdev); +extern int vfio_pci_device_log_start(struct vfio_pci_device *vdev,
- struct vf_migration_log_info *log_info);
+extern int vfio_pci_device_log_stop(struct vfio_pci_device *vdev,
- uint32_t uuid);
+extern int vfio_pci_device_log_status_query(struct vfio_pci_device *vdev); +extern int vfio_pci_device_init(struct pci_dev *pdev); +extern void vfio_pci_device_uninit(struct pci_dev *pdev); #endif /* VFIO_PCI_PRIVATE_H */ diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 7a386fb..35f2a29 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/vfio.h> +#include <linux/vfio_pci_migration.h> #include <linux/wait.h> #include <linux/sched/signal.h>
@@ -40,6 +41,9 @@ #define DRIVER_AUTHOR "Alex Williamson alex.williamson@redhat.com" #define DRIVER_DESC "VFIO - User Level meta-driver"
+#define LOG_BUF_FRAG_SIZE (2 * 1024 * 1024) // fix to 2M +#define LOG_BUF_MAX_ADDRS_SIZE 128 // max vm ram size is 1T
static struct vfio { struct class *class; struct list_head iommu_drivers_list; @@ -57,6 +61,14 @@ struct vfio_iommu_driver { struct list_head vfio_next; };
+struct vfio_log_buf {
- struct vfio_log_buf_info info;
- int fd;
- int buffer_state;
- int device_state;
- unsigned long *cpu_addrs;
+};
struct vfio_container { struct kref kref; struct list_head group_list; @@ -64,6 +76,7 @@ struct vfio_container { struct vfio_iommu_driver *iommu_driver; void *iommu_data; bool noiommu;
- struct vfio_log_buf log_buf;
};
struct vfio_unbound_dev { @@ -1158,8 +1171,398 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, return ret; }
+static long vfio_dispatch_cmd_to_devices(const struct vfio_container *container,
- unsigned int cmd, unsigned long arg)
+{
- struct vfio_group *group = NULL;
- struct vfio_device *device = NULL;
- long ret = -ENXIO;
- list_for_each_entry(group, &container->group_list, container_next) {
list_for_each_entry(device, &group->device_list, group_next) {
ret = device->ops->ioctl(device->device_data, cmd, arg);
if (ret) {
pr_err("dispatch cmd to devices failed\n");
return ret;
}
}
- }
- return ret;
+}
+static long vfio_log_buf_start(struct vfio_container *container) +{
- struct vfio_log_buf_ctl log_buf_ctl;
- long ret;
- log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info);
- log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_START;
- log_buf_ctl.data = (void *)&container->log_buf.info;
- ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL,
(unsigned long)&log_buf_ctl);
- if (ret)
return ret;
- container->log_buf.device_state = 1;
- return 0;
+}
+static long vfio_log_buf_stop(struct vfio_container *container) +{
- struct vfio_log_buf_ctl log_buf_ctl;
- long ret;
- if (container->log_buf.device_state == 0) {
pr_warn("device already stopped\n");
return 0;
- }
- log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info);
- log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STOP;
- log_buf_ctl.data = (void *)&container->log_buf.info;
- ret = vfio_dispatch_cmd_to_devices(container, VFIO_DEVICE_LOG_BUF_CTL,
(unsigned long)&log_buf_ctl);
- if (ret)
return ret;
- container->log_buf.device_state = 0;
- return 0;
+}
+static long vfio_log_buf_query(struct vfio_container *container) +{
- struct vfio_log_buf_ctl log_buf_ctl;
- log_buf_ctl.argsz = sizeof(struct vfio_log_buf_info);
- log_buf_ctl.flags = VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY;
- log_buf_ctl.data = (void *)&container->log_buf.info;
- return vfio_dispatch_cmd_to_devices(container,
VFIO_DEVICE_LOG_BUF_CTL, (unsigned long)&log_buf_ctl);
+}
+static int vfio_log_buf_fops_mmap(struct file *filep,
- struct vm_area_struct *vma)
+{
- struct vfio_container *container = filep->private_data;
- struct vfio_log_buf *log_buf = &container->log_buf;
- unsigned long frag_pg_size;
- unsigned long frag_offset;
- phys_addr_t pa;
- int ret = -EINVAL;
- if (!log_buf->cpu_addrs) {
pr_err("mmap before setup, please setup log buf first\n");
return ret;
- }
- if (log_buf->info.frag_size < PAGE_SIZE) {
pr_err("mmap frag size should not less than page size!\n");
return ret;
- }
- frag_pg_size = log_buf->info.frag_size / PAGE_SIZE;
- frag_offset = vma->vm_pgoff / frag_pg_size;
- if (frag_offset >= log_buf->info.addrs_size) {
pr_err("mmap offset out of range!\n");
return ret;
- }
- if (vma->vm_end - vma->vm_start != log_buf->info.frag_size) {
pr_err("mmap size error, should be aligned with frag size!\n");
return ret;
- }
- pa = virt_to_phys((void *)log_buf->cpu_addrs[frag_offset]);
- ret = remap_pfn_range(vma, vma->vm_start,
pa >> PAGE_SHIFT,
vma->vm_end - vma->vm_start,
vma->vm_page_prot);
- if (ret)
pr_err("remap_pfn_range error!\n");
- return ret;
+}
+static struct device *vfio_get_dev(struct vfio_container *container) +{
- struct vfio_group *group = NULL;
- struct vfio_device *device = NULL;
- list_for_each_entry(group, &container->group_list, container_next) {
list_for_each_entry(device, &group->device_list, group_next) {
return device->dev;
}
- }
- return NULL;
+}
+static void vfio_log_buf_release_dma(struct device *dev,
- struct vfio_log_buf *log_buf)
+{
- int i;
- for (i = 0; i < log_buf->info.addrs_size; i++) {
if ((log_buf->cpu_addrs && log_buf->cpu_addrs[i] != 0) &&
(log_buf->info.sgevec &&
log_buf->info.sgevec[i].addr != 0)) {
dma_free_coherent(dev, log_buf->info.frag_size,
(void *)log_buf->cpu_addrs[i],
log_buf->info.sgevec[i].addr);
log_buf->cpu_addrs[i] = 0;
log_buf->info.sgevec[i].addr = 0;
}
- }
+}
+static long vfio_log_buf_alloc_dma(struct vfio_log_buf_info *info,
- struct vfio_log_buf *log_buf, struct device *dev)
+{
- int i;
- for (i = 0; i < info->addrs_size; i++) {
log_buf->cpu_addrs[i] = (unsigned long)dma_alloc_coherent(dev,
info->frag_size, &log_buf->info.sgevec[i].addr,
GFP_KERNEL);
log_buf->info.sgevec[i].len = info->frag_size;
if (log_buf->cpu_addrs[i] == 0 ||
log_buf->info.sgevec[i].addr == 0) {
return -ENOMEM;
}
- }
- return 0;
+}
+static long vfio_log_buf_alloc_addrs(struct vfio_log_buf_info *info,
- struct vfio_log_buf *log_buf)
+{
- log_buf->info.sgevec = kcalloc(info->addrs_size,
sizeof(struct vfio_log_buf_sge), GFP_KERNEL);
- if (!log_buf->info.sgevec)
return -ENOMEM;
- log_buf->cpu_addrs = kcalloc(info->addrs_size,
sizeof(unsigned long), GFP_KERNEL);
- if (!log_buf->cpu_addrs) {
kfree(log_buf->info.sgevec);
log_buf->info.sgevec = NULL;
return -ENOMEM;
- }
- return 0;
+}
+static long vfio_log_buf_info_valid(struct vfio_log_buf_info *info) +{
- if (info->addrs_size > LOG_BUF_MAX_ADDRS_SIZE ||
info->addrs_size == 0) {
pr_err("can`t support vm ram size larger than 1T or equal to 0\n");
return -EINVAL;
- }
- if (info->frag_size != LOG_BUF_FRAG_SIZE) {
pr_err("only support %d frag size\n", LOG_BUF_FRAG_SIZE);
return -EINVAL;
- }
- return 0;
+}
+static long vfio_log_buf_setup(struct vfio_container *container,
- unsigned long data)
+{
- struct vfio_log_buf_info info;
- struct vfio_log_buf *log_buf = &container->log_buf;
- struct device *dev = NULL;
- long ret;
- if (log_buf->info.sgevec) {
pr_warn("log buf already setup\n");
return 0;
- }
- if (copy_from_user(&info, (void __user *)data,
sizeof(struct vfio_log_buf_info)))
return -EFAULT;
- ret = vfio_log_buf_info_valid(&info);
- if (ret)
return ret;
- ret = vfio_log_buf_alloc_addrs(&info, log_buf);
- if (ret)
goto err_out;
- dev = vfio_get_dev(container);
- if (!dev) {
pr_err("can`t get dev\n");
goto err_free_addrs;
- }
- ret = vfio_log_buf_alloc_dma(&info, log_buf, dev);
- if (ret)
goto err_free_dma_array;
- log_buf->info.uuid = info.uuid;
- log_buf->info.buffer_size = info.buffer_size;
- log_buf->info.frag_size = info.frag_size;
- log_buf->info.addrs_size = info.addrs_size;
- log_buf->buffer_state = 1;
- return 0;
+err_free_dma_array:
- vfio_log_buf_release_dma(dev, log_buf);
+err_free_addrs:
- kfree(log_buf->cpu_addrs);
- log_buf->cpu_addrs = NULL;
- kfree(log_buf->info.sgevec);
- log_buf->info.sgevec = NULL;
+err_out:
- return -ENOMEM;
+}
+static long vfio_log_buf_release_buffer(struct vfio_container *container) +{
- struct vfio_log_buf *log_buf = &container->log_buf;
- struct device *dev = NULL;
- if (log_buf->buffer_state == 0) {
pr_warn("buffer already released\n");
return 0;
- }
- dev = vfio_get_dev(container);
- if (!dev) {
pr_err("can`t get dev\n");
return -EFAULT;
- }
- vfio_log_buf_release_dma(dev, log_buf);
- kfree(log_buf->cpu_addrs);
- log_buf->cpu_addrs = NULL;
- kfree(log_buf->info.sgevec);
- log_buf->info.sgevec = NULL;
- log_buf->buffer_state = 0;
- return 0;
+}
+static int vfio_log_buf_release(struct inode *inode, struct file *filep) +{
- struct vfio_container *container = filep->private_data;
- vfio_log_buf_stop(container);
- vfio_log_buf_release_buffer(container);
- memset(&container->log_buf, 0, sizeof(struct vfio_log_buf));
- return 0;
+}
+static long vfio_ioctl_handle_log_buf_ctl(struct vfio_container *container,
- unsigned long arg)
+{
- struct vfio_log_buf_ctl log_buf_ctl;
- long ret = 0;
- if (copy_from_user(&log_buf_ctl, (void __user *)arg,
sizeof(struct vfio_log_buf_ctl)))
return -EFAULT;
- switch (log_buf_ctl.flags) {
- case VFIO_DEVICE_LOG_BUF_FLAG_SETUP:
ret = vfio_log_buf_setup(container,
(unsigned long)log_buf_ctl.data);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_RELEASE:
ret = vfio_log_buf_release_buffer(container);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_START:
ret = vfio_log_buf_start(container);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_STOP:
ret = vfio_log_buf_stop(container);
break;
- case VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY:
ret = vfio_log_buf_query(container);
break;
- default:
pr_err("log buf control flag incorrect\n");
ret = -EINVAL;
break;
- }
- return ret;
+}
+static long vfio_log_buf_fops_unl_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
+{
- struct vfio_container *container = filep->private_data;
- long ret = -EINVAL;
- switch (cmd) {
- case VFIO_LOG_BUF_CTL:
ret = vfio_ioctl_handle_log_buf_ctl(container, arg);
break;
- default:
pr_err("log buf control cmd incorrect\n");
break;
- }
- return ret;
+}
+#ifdef CONFIG_COMPAT +static long vfio_log_buf_fops_compat_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
+{
- arg = (unsigned long)compat_ptr(arg);
- return vfio_log_buf_fops_unl_ioctl(filep, cmd, arg);
+} +#endif /* CONFIG_COMPAT */
+static const struct file_operations vfio_log_buf_fops = {
- .owner = THIS_MODULE,
- .mmap = vfio_log_buf_fops_mmap,
- .unlocked_ioctl = vfio_log_buf_fops_unl_ioctl,
- .release = vfio_log_buf_release,
+#ifdef CONFIG_COMPAT
- .compat_ioctl = vfio_log_buf_fops_compat_ioctl,
+#endif +};
+static int vfio_get_log_buf_fd(struct vfio_container *container,
- unsigned long arg)
+{
- struct file *filep = NULL;
- int ret;
- if (container->log_buf.fd > 0)
return container->log_buf.fd;
- ret = get_unused_fd_flags(O_CLOEXEC);
- if (ret < 0) {
pr_err("get_unused_fd_flags get fd failed\n");
return ret;
- }
- filep = anon_inode_getfile("[vfio-log-buf]", &vfio_log_buf_fops,
container, O_RDWR);
- if (IS_ERR(filep)) {
pr_err("anon_inode_getfile failed\n");
put_unused_fd(ret);
ret = PTR_ERR(filep);
return ret;
- }
- filep->f_mode |= (FMODE_READ | FMODE_WRITE | FMODE_LSEEK);
- fd_install(ret, filep);
- container->log_buf.fd = ret;
- return ret;
+}
static long vfio_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
- unsigned int cmd, unsigned long arg)
{ struct vfio_container *container = filep->private_data; struct vfio_iommu_driver *driver; @@ -1179,6 +1582,9 @@ static long vfio_fops_unl_ioctl(struct file *filep, case VFIO_SET_IOMMU: ret = vfio_ioctl_set_iommu(container, arg); break;
- case VFIO_GET_LOG_BUF_FD:
ret = vfio_get_log_buf_fd(container, arg);
default: driver = container->iommu_driver; data = container->iommu_data;break;
@@ -1210,6 +1616,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) INIT_LIST_HEAD(&container->group_list); init_rwsem(&container->group_lock); kref_init(&container->kref);
memset(&container->log_buf, 0, sizeof(struct vfio_log_buf));
filep->private_data = container;
@@ -1219,9 +1626,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data;
filep->private_data = NULL;
vfio_container_put(container);
return 0;
diff --git a/include/linux/vfio_pci_migration.h b/include/linux/vfio_pci_migration.h new file mode 100644 index 0000000..464ffb4 --- /dev/null +++ b/include/linux/vfio_pci_migration.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/*
- Copyright (c) 2022 Huawei Technologies Co., Ltd. All rights reserved.
- */
+#ifndef VFIO_PCI_MIGRATION_H +#define VFIO_PCI_MIGRATION_H
+#include <linux/types.h> +#include <linux/pci.h>
+#define VFIO_REGION_TYPE_MIGRATION (3) +/* sub-types for VFIO_REGION_TYPE_MIGRATION */ +#define VFIO_REGION_SUBTYPE_MIGRATION (1)
+#define VFIO_MIGRATION_BUFFER_MAX_SIZE SZ_256K +#define VFIO_MIGRATION_REGION_DATA_OFFSET \
- (sizeof(struct vfio_device_migration_info))
+#define VFIO_DEVICE_MIGRATION_OFFSET(x) \
- offsetof(struct vfio_device_migration_info, x)
+struct vfio_device_migration_info {
- __u32 device_state; /* VFIO device state */
+#define VFIO_DEVICE_STATE_STOP (0) +#define VFIO_DEVICE_STATE_RUNNING (1 << 0) +#define VFIO_DEVICE_STATE_SAVING (1 << 1) +#define VFIO_DEVICE_STATE_RESUMING (1 << 2) +#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
- VFIO_DEVICE_STATE_SAVING | VFIO_DEVICE_STATE_RESUMING)
- __u32 reserved;
- __u32 device_cmd;
- __u32 version_id;
- __u64 pending_bytes;
- __u64 data_offset;
- __u64 data_size;
+};
These definitions already exist in include/uapi/linux/vfio.h; please do not duplicate them here.
+enum {
- VFIO_DEVICE_STOP = 0xffff0001,
- VFIO_DEVICE_CONTINUE,
- VFIO_DEVICE_MIGRATION_CANCEL,
+};
+struct vfio_log_buf_sge {
- __u64 len;
- __u64 addr;
+};
+struct vfio_log_buf_info {
- __u32 uuid;
- __u64 buffer_size;
- __u64 addrs_size;
- __u64 frag_size;
- struct vfio_log_buf_sge *sgevec;
+};
+struct vfio_log_buf_ctl {
- __u32 argsz;
- __u32 flags;
- #define VFIO_DEVICE_LOG_BUF_FLAG_SETUP (1 << 0)
- #define VFIO_DEVICE_LOG_BUF_FLAG_RELEASE (1 << 1)
- #define VFIO_DEVICE_LOG_BUF_FLAG_START (1 << 2)
- #define VFIO_DEVICE_LOG_BUF_FLAG_STOP (1 << 3)
- #define VFIO_DEVICE_LOG_BUF_FLAG_STATUS_QUERY (1 << 4)
- void *data;
+}; +#define VFIO_LOG_BUF_CTL _IO(VFIO_TYPE, VFIO_BASE + 21) +#define VFIO_GET_LOG_BUF_FD _IO(VFIO_TYPE, VFIO_BASE + 22) +#define VFIO_DEVICE_LOG_BUF_CTL _IO(VFIO_TYPE, VFIO_BASE + 23)
+struct vf_migration_log_info {
- __u32 dom_uuid;
- __u64 buffer_size;
- __u64 sge_len;
- __u64 sge_num;
- struct vfio_log_buf_sge *sgevec;
+};
+struct vfio_device_migration_ops {
- /* Get device information */
- int (*get_info)(struct pci_dev *pdev,
struct vfio_device_migration_info *info);
- /* Enable a vf device */
- int (*enable)(struct pci_dev *pdev);
- /* Disable a vf device */
- int (*disable)(struct pci_dev *pdev);
- /* Save a vf device */
- int (*save)(struct pci_dev *pdev, void *base,
uint64_t off, uint64_t count);
- /* Resuming a vf device */
- int (*restore)(struct pci_dev *pdev, void *base,
uint64_t off, uint64_t count);
- /* Log start a vf device */
- int (*log_start)(struct pci_dev *pdev,
struct vf_migration_log_info *log_info);
- /* Log stop a vf device */
- int (*log_stop)(struct pci_dev *pdev, uint32_t uuid);
- /* Get vf device log status */
- int (*get_log_status)(struct pci_dev *pdev);
- /* Pre enable a vf device(load_setup, before restore a vf) */
- int (*pre_enable)(struct pci_dev *pdev);
- /* Cancel a vf device when live migration failed (rollback) */
- int (*cancel)(struct pci_dev *pdev);
- /* Init a vf device */
- int (*init)(struct pci_dev *pdev);
- /* Uninit a vf device */
- void (*uninit)(struct pci_dev *pdev);
- /* Release a vf device */
- void (*release)(struct pci_dev *pdev);
+};
+struct vfio_pci_vendor_mig_driver {
- struct pci_dev *pdev;
- unsigned char bus_num;
- struct vfio_device_migration_ops *dev_mig_ops;
- struct module *owner;
- atomic_t count;
- struct list_head list;
+};
+struct vfio_pci_migration_data {
- u64 state_size;
- struct pci_dev *vf_dev;
- struct vfio_pci_vendor_mig_driver *mig_driver;
- struct vfio_device_migration_info *mig_ctl;
- void *vf_data;
+};
+int vfio_pci_register_migration_ops(struct vfio_device_migration_ops *ops,
- struct module *mod, struct pci_dev *pdev);
+void vfio_pci_unregister_migration_ops(struct module *mod,
- struct pci_dev *pdev);
+#endif /* VFIO_PCI_MIGRATION_H */
The current openEuler kernel has already added a device live-migration framework. For details, please refer to drivers/crypto/hisilicon/migration.
The accelerator device live-migration driver has been merged into kernel 5.18 and is based on the new VFIO Migration Protocol v2 framework. Your network-card driver can use it as a reference.
Thanks, Liu Longfang