From: Juan Zhou <zhoujuan51@h-partners.com>
Support hns roce bonding.
Junxian Huang (8):
  RDMA/hns: Support RoCE bonding
  RDMA/hns: Set IB port state depending on upper device for RoCE bonding
  RDMA/hns: Support dispatching IB event for RoCE bonding
  RDMA/hns: Add functions to obtain netdev and bus_num from an hr_dev
  RDMA/hns: Support reset recovery for RoCE bonding
  RDMA/hns: Fix wild pointer error of RoCE bonding when rmmod hns3
  RDMA/hns: Fix the device loss after unbinding RoCE bond resource slave
  RDMA/hns: Fix the concurrency error between bond and reset.
 drivers/infiniband/hw/hns/Makefile          |   4 +-
 drivers/infiniband/hw/hns/hns_roce_ah.c     |   1 -
 drivers/infiniband/hw/hns/hns_roce_bond.c   | 951 ++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_bond.h   |  92 ++
 drivers/infiniband/hw/hns/hns_roce_device.h |  21 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 173 +++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  19 +
 drivers/infiniband/hw/hns/hns_roce_main.c   | 153 +++-
 drivers/infiniband/hw/hns/hns_roce_pd.c     |   1 -
 drivers/infiniband/hw/hns/hns_roce_qp.c     |   5 +-
 drivers/infiniband/hw/hns/hns_roce_srq.c    |   1 -
 11 files changed, 1387 insertions(+), 34 deletions(-)
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.c
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.h
From: Junxian Huang <huangjunxian6@hisilicon.com>
driver inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
----------------------------------------------------------
Support hns roce bonding
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Signed-off-by: ChunZhi Hu <huchunzhi@huawei.com>
---
 drivers/infiniband/hw/hns/Makefile          |   4 +-
 drivers/infiniband/hw/hns/hns_roce_bond.c   | 810 ++++++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_bond.h   |  86 +++
 drivers/infiniband/hw/hns/hns_roce_device.h |   8 +-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 132 +++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  19 +
 drivers/infiniband/hw/hns/hns_roce_main.c   |  73 +-
 7 files changed, 1119 insertions(+), 13 deletions(-)
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.c
 create mode 100644 drivers/infiniband/hw/hns/hns_roce_bond.h
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index 161615fda..8faa43009 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -4,11 +4,13 @@ #
ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3pf +ccflags-y += -I $(srctree)/drivers/net/ethernet/hisilicon/hns3/hns3_common
hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o hns_roce_sysfs.o + hns_roce_debugfs.o hns_roce_sysfs.o hns_roce_bond.o
ifdef CONFIG_INFINIBAND_HNS_HIP08 hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c new file mode 100644 index 000000000..d6ebf41c9 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022 Hisilicon Limited. + */ + +#include <linux/pci.h> +#include "hnae3.h" +#include "hns_roce_device.h" +#include "hns_roce_hw_v2.h" +#include "hns_roce_bond.h" + +static DEFINE_MUTEX(roce_bond_mutex); +static DEFINE_XARRAY(roce_bond_xa); + +static struct hns_roce_dev *hns_roce_get_hrdev_by_netdev(struct net_device *net_dev) +{ + struct ib_device *ibdev = + ib_device_get_by_netdev(net_dev, RDMA_DRIVER_HNS); + struct hns_roce_dev *hr_dev; + + if (!ibdev) + return NULL; + + hr_dev = container_of(ibdev, struct hns_roce_dev, ib_dev); + ib_device_put(ibdev); + + return hr_dev; +} + +static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev) +{ + struct net_device *upper_dev; + + rcu_read_lock(); + upper_dev = netdev_master_upper_dev_get_rcu(net_dev); + rcu_read_unlock(); + + return upper_dev; +} + +static bool is_netdev_bond_slave(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) +{ + int i; + + if (!net_dev || !bond_grp) + return false; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) + if (net_dev == bond_grp->bond_func_info[i].net_dev) + return true; + + return false; +} + +static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, + struct net_device *upper_dev) +{ + struct hns_roce_bond_group *bond_grp; + + if (!hr_dev || !upper_dev) + return false; + + if (!netif_is_lag_master(upper_dev)) + return false; + + if (upper_dev == get_upper_dev_from_ndev(hr_dev->iboe.netdevs[0])) + return true; + + bond_grp = hns_roce_get_bond_grp(hr_dev); + if (bond_grp && upper_dev == bond_grp->upper_dev) + return true; + + return false; +} + +struct 
hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_die_info *die_info = + xa_load(&roce_bond_xa, hr_dev->pci_dev->bus->number); + struct hns_roce_bond_group *bond_grp; + int i; + + if (!die_info) + return NULL; + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + bond_grp = die_info->bgrps[i]; + if (!bond_grp) + continue; + if (is_netdev_bond_slave(hr_dev->iboe.netdevs[0], bond_grp) || + bond_grp->upper_dev == + get_upper_dev_from_ndev(hr_dev->iboe.netdevs[0])) + return bond_grp; + } + + return NULL; +} + +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + + if (bond_grp && + (bond_grp->bond_state == HNS_ROCE_BOND_REGISTERING || + bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)) + return true; + + return false; +} + +static inline bool is_active_slave(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) +{ + if (!bond_grp || !bond_grp->bond || !bond_grp->bond->curr_active_slave) + return false; + + return net_dev == bond_grp->bond->curr_active_slave->dev; +} + +struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + struct net_device *net_dev = NULL; + int i; + + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) + return NULL; + + if (!bond_grp) + return NULL; + + mutex_lock(&bond_grp->bond_mutex); + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + goto out; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && is_active_slave(net_dev, bond_grp)) + break; + } + } else { + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && get_port_state(net_dev) == IB_PORT_ACTIVE) + break; + } + } + +out: + mutex_unlock(&bond_grp->bond_mutex); + + return net_dev; +} + 
+static void hns_roce_queue_bond_work(struct hns_roce_bond_group *bond_grp, + unsigned long delay) +{ + schedule_delayed_work(&bond_grp->bond_work, delay); +} + +static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u32 active_slave_map = 0; + u8 active_slave_num = 0; + bool active; + u8 i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev) { + active = (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) ? + is_active_slave(net_dev, bond_grp) : + (get_port_state(net_dev) == IB_PORT_ACTIVE); + if (active) { + active_slave_num++; + active_slave_map |= (1 << i); + } + } + } + + bond_grp->active_slave_num = active_slave_num; + bond_grp->active_slave_map = active_slave_map; +} + +static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) +{ + struct hns_roce_dev *hr_dev = NULL; + struct net_device *net_dev; + int ret; + int i; + + for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev) + hns_roce_bond_uninit_client(bond_grp, i); + } + + bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev) { + hr_dev = hns_roce_bond_init_client(bond_grp, i); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + if (!hr_dev) + return; + + bond_grp->slave_map_diff = 0; + hns_roce_bond_get_active_slave(bond_grp); + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); + if (ret) { + ibdev_err(&hr_dev->ib_dev, "failed to set RoCE bond!\n"); + return; + } + + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + ibdev_info(&hr_dev->ib_dev, "RoCE set bond finished!\n"); +} + +static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) +{ + u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + struct hns_roce_dev *hr_dev = NULL; + struct net_device *net_dev; + 
int i, ret; + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + goto out; + + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->main_hr_dev = NULL; + + hns_roce_bond_uninit_client(bond_grp, main_func_idx); + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev) { + hr_dev = hns_roce_bond_init_client(bond_grp, i); + if (hr_dev) + bond_grp->main_hr_dev = hr_dev; + } + } + +out: + ret = hns_roce_cleanup_bond(bond_grp); + if (!ret) + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE clear bond finished!\n"); +} + +static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + hns_roce_bond_get_active_slave(bond_grp); + + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + if (ret) { + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to change RoCE bond slave state!\n"); + return; + } + + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave changestate finished!\n"); +} + +static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp) +{ + u32 inc_slave_map = bond_grp->slave_map_diff; + u8 inc_func_idx = 0; + int ret; + + while (inc_slave_map > 0) { + if (inc_slave_map & 1) + hns_roce_bond_uninit_client(bond_grp, inc_func_idx); + inc_slave_map >>= 1; + inc_func_idx++; + } + + bond_grp->slave_map_diff = 0; + hns_roce_bond_get_active_slave(bond_grp); + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + if (ret) { + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to increase RoCE bond slave!\n"); + return; + } + + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave increase finished!\n"); +} + +static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp) +{ + u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); + u32 dec_slave_map = bond_grp->slave_map_diff; + struct hns_roce_dev *hr_dev; + struct net_device 
*net_dev; + u8 dec_func_idx = 0; + int ret; + int i; + + if (dec_slave_map & (1 << main_func_idx)) { + hns_roce_bond_uninit_client(bond_grp, main_func_idx); + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (!(dec_slave_map & (1 << i)) && net_dev) { + bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; + hr_dev = hns_roce_bond_init_client(bond_grp, i); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + } + + while (dec_slave_map > 0) { + if (dec_slave_map & 1) { + bond_grp->bond_func_info[dec_func_idx].net_dev = NULL; + hns_roce_bond_init_client(bond_grp, dec_func_idx); + } + dec_slave_map >>= 1; + dec_func_idx++; + } + + bond_grp->slave_map_diff = 0; + hns_roce_bond_get_active_slave(bond_grp); + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); + if (ret) { + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to decrease RoCE bond slave!\n"); + return; + } + + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave decrease finished!\n"); +} + +static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) +{ + enum hns_roce_bond_state bond_state = bond_grp->bond_state; + bool bond_ready = bond_grp->bond_ready; + + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "do_bond: bond_ready - %d, bond_state - %d.\n", + bond_ready, bond_grp->bond_state); + + if (!bond_ready) { + hns_roce_clear_bond(bond_grp); + return; + } + + switch (bond_state) { + case HNS_ROCE_BOND_NOT_BONDED: + hns_roce_set_bond(bond_grp); + return; + case HNS_ROCE_BOND_SLAVE_CHANGESTATE: + hns_roce_slave_changestate(bond_grp); + return; + case HNS_ROCE_BOND_SLAVE_INC: + hns_roce_slave_inc(bond_grp); + return; + case HNS_ROCE_BOND_SLAVE_DEC: + hns_roce_slave_dec(bond_grp); + return; + default: + return; + } +} + +void hns_roce_do_bond_work(struct work_struct *work) +{ + struct delayed_work *delayed_work = to_delayed_work(work); + struct hns_roce_bond_group *bond_grp = + 
container_of(delayed_work, struct hns_roce_bond_group, + bond_work); + int status; + + status = mutex_trylock(&roce_bond_mutex); + if (!status) { + /* delay 1 sec */ + hns_roce_queue_bond_work(bond_grp, HZ); + return; + } + + hns_roce_do_bond(bond_grp); + mutex_unlock(&roce_bond_mutex); +} + +int hns_roce_bond_init(struct hns_roce_dev *hr_dev) +{ + int ret; + + hr_dev->bond_nb.notifier_call = hns_roce_bond_event; + ret = register_netdevice_notifier(&hr_dev->bond_nb); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to register notifier for RoCE bond!\n"); + hr_dev->bond_nb.notifier_call = NULL; + } + + return ret; +} + +static struct hns_roce_die_info *alloc_die_info(int bus_num) +{ + struct hns_roce_die_info *die_info; + int ret; + + die_info = kzalloc(sizeof(struct hns_roce_die_info), GFP_KERNEL); + if (!die_info) + return NULL; + + ret = xa_err(xa_store(&roce_bond_xa, bus_num, die_info, GFP_KERNEL)); + if (ret) { + kfree(die_info); + return NULL; + } + + return die_info; +} + +static void dealloc_die_info(struct hns_roce_die_info *die_info, u8 bus_num) +{ + xa_erase(&roce_bond_xa, bus_num); + kvfree(die_info); +} + +static int alloc_bond_id(struct hns_roce_bond_group *bond_grp) +{ + u8 bus_num = bond_grp->bus_num; + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + int i; + + if (!die_info) { + die_info = alloc_die_info(bus_num); + if (!die_info) { + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to alloc die_info.\n"); + return -ENOMEM; + } + } + + for (i = 0; i < ROCE_BOND_NUM_MAX; i++) { + if (die_info->bond_id_mask & BOND_ID(i)) + continue; + + die_info->bond_id_mask |= BOND_ID(i); + die_info->bgrps[i] = bond_grp; + bond_grp->bond_id = i; + + return 0; + } + + return -ENOSPC; +} + +static int remove_bond_id(int bus_num, u8 bond_id) +{ + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); + + if (bond_id >= ROCE_BOND_NUM_MAX) + return -EINVAL; + + if (!die_info) + return -ENODEV; + + die_info->bond_id_mask &= 
~BOND_ID(bond_id); + die_info->bgrps[bond_id] = NULL; + if (!die_info->bond_id_mask) + dealloc_die_info(die_info, bus_num); + + return 0; +} + +int hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp) +{ + int ret; + + ret = bond_grp->main_hr_dev ? + hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO; + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to clear RoCE bond, ret = %d.\n", ret); + + cancel_delayed_work(&bond_grp->bond_work); + ret = remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to remove bond ID %d, ret = %d.\n", + bond_grp->bond_id, ret); + kfree(bond_grp); + + return ret; +} + +static bool hns_roce_bond_lowerstate_event(struct hns_roce_dev *hr_dev, + struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changelowerstate_info *info) +{ + struct net_device *net_dev = + netdev_notifier_info_to_dev((struct netdev_notifier_info *)info); + + if (!netif_is_lag_port(net_dev) || + (!bond_grp || hr_dev != bond_grp->main_hr_dev)) + return false; + + mutex_lock(&bond_grp->bond_mutex); + + if (bond_grp->bond_ready && + bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED) + bond_grp->bond_state = HNS_ROCE_BOND_SLAVE_CHANGESTATE; + + mutex_unlock(&bond_grp->bond_mutex); + + return true; +} + +static bool is_bond_setting_supported(struct netdev_lag_upper_info *bond_info) +{ + if (!bond_info) + return false; + + if (bond_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && + bond_info->tx_type != NETDEV_LAG_TX_TYPE_HASH) + return false; + + if (bond_info->tx_type == NETDEV_LAG_TX_TYPE_HASH && + bond_info->hash_type > NETDEV_LAG_HASH_L23) + return false; + + return true; +} + +static void hns_roce_bond_info_update(struct hns_roce_bond_group *bond_grp, + struct net_device *upper_dev, + bool slave_inc) +{ + struct hns_roce_v2_priv *priv; + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + u8 func_idx, i; + + if (!slave_inc) { + for (i = 0; i < 
ROCE_BOND_FUNC_MAX; ++i) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && upper_dev != + get_upper_dev_from_ndev(net_dev)) { + bond_grp->slave_map_diff |= (1U << i); + bond_grp->slave_map &= ~(1U << i); + } + } + return; + } + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + if (hr_dev) { + func_idx = PCI_FUNC(hr_dev->pci_dev->devfn); + if (!bond_grp->bond_func_info[func_idx].net_dev) { + bond_grp->slave_map_diff |= (1U << func_idx); + bond_grp->slave_map |= (1U << func_idx); + priv = hr_dev->priv; + + bond_grp->bond_func_info[func_idx].net_dev = + net_dev; + + bond_grp->bond_func_info[func_idx].handle = + priv->handle; + } + } + } + rcu_read_unlock(); +} + +static bool hns_roce_bond_upper_event(struct hns_roce_bond_group *bond_grp, + struct netdev_notifier_changeupper_info *info) +{ + struct netdev_lag_upper_info *bond_upper_info = NULL; + struct net_device *upper_dev = info->upper_dev; + bool slave_inc = info->linking; + bool changed = false; + + if (!bond_grp || !upper_dev || !netif_is_lag_master(upper_dev)) + return false; + + if (slave_inc) + bond_upper_info = info->upper_info; + + mutex_lock(&bond_grp->bond_mutex); + + if (bond_upper_info) + bond_grp->tx_type = bond_upper_info->tx_type; + + hns_roce_bond_info_update(bond_grp, upper_dev, slave_inc); + + bond_grp->bond = netdev_priv(upper_dev); + + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) { + bond_grp->bond_ready = true; + changed = true; + } else { + bond_grp->bond_state = slave_inc ? 
+ HNS_ROCE_BOND_SLAVE_INC : + HNS_ROCE_BOND_SLAVE_DEC; + bond_grp->bond_ready = true; + changed = true; + } + + mutex_unlock(&bond_grp->bond_mutex); + + return changed; +} + +static struct hns_roce_bond_group *hns_roce_alloc_bond_grp(struct hns_roce_dev *main_hr_dev, + struct net_device *upper_dev) +{ + struct hns_roce_bond_group *bond_grp; + int ret; + + bond_grp = kzalloc(sizeof(*bond_grp), GFP_KERNEL); + if (!bond_grp) + return NULL; + + mutex_init(&bond_grp->bond_mutex); + + INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_do_bond_work); + + bond_grp->upper_dev = upper_dev; + bond_grp->main_hr_dev = main_hr_dev; + bond_grp->bond_ready = false; + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + bond_grp->bus_num = main_hr_dev->pci_dev->bus->number; + + ret = alloc_bond_id(bond_grp); + if (ret) { + ibdev_err(&main_hr_dev->ib_dev, + "failed to alloc bond ID, ret = %d.\n", ret); + kfree(bond_grp); + return NULL; + } + + hns_roce_bond_info_update(bond_grp, upper_dev, true); + + return bond_grp; +} + +static bool is_dev_bond_supported(struct hns_roce_bond_group *bond_grp, + struct net_device *net_dev, int bus_num) +{ + struct hns_roce_dev *hr_dev = hns_roce_get_hrdev_by_netdev(net_dev); + + if (!hr_dev) { + if (bond_grp && + get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + return true; + else + return false; + } + + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) + return false; + + if (hr_dev->is_vf || pci_num_vf(hr_dev->pci_dev) > 0) + return false; + + if (bus_num != get_hr_bus_num(hr_dev)) + return false; + + return true; +} + +static bool check_unlinking_bond_support(struct hns_roce_bond_group *bond_grp) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(bond_grp->upper_dev, net_dev) { + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0) + slave_num++; + } + rcu_read_unlock(); + + return (slave_num > 1); +} + +static bool check_linking_bond_support(struct netdev_lag_upper_info *bond_info, + struct 
hns_roce_bond_group *bond_grp, + struct net_device *upper_dev, + int bus_num) +{ + struct net_device *net_dev; + u8 slave_num = 0; + + if (!is_bond_setting_supported(bond_info)) + return false; + + rcu_read_lock(); + for_each_netdev_in_bond_rcu(upper_dev, net_dev) { + if (is_dev_bond_supported(bond_grp, net_dev, bus_num)) { + slave_num++; + } else { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + + return (slave_num > 1 && slave_num <= ROCE_BOND_FUNC_MAX); +} + +static enum bond_support_type + check_bond_support(struct hns_roce_dev *hr_dev, + struct net_device **upper_dev, + struct netdev_notifier_changeupper_info *info) +{ + struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + bool bond_grp_exist = false; + struct net_device *net_dev; + int bus_num = -1; + bool support; + + *upper_dev = info->upper_dev; + if (bond_grp && *upper_dev == bond_grp->upper_dev) + bond_grp_exist = true; + + if (!info->linking && !bond_grp_exist) + return BOND_NOT_SUPPORT; + + if (info->linking) + support = check_linking_bond_support(info->upper_info, bond_grp, + *upper_dev, bus_num); + else + support = check_unlinking_bond_support(bond_grp); + + if (support) + return BOND_SUPPORT; + + return bond_grp_exist ? 
BOND_EXISTING_NOT_SUPPORT : BOND_NOT_SUPPORT; +} + +int hns_roce_bond_event(struct notifier_block *self, + unsigned long event, void *ptr) +{ + struct net_device *net_dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_dev *hr_dev = + container_of(self, struct hns_roce_dev, bond_nb); + enum bond_support_type support = BOND_SUPPORT; + struct hns_roce_bond_group *bond_grp; + struct net_device *upper_dev; + bool changed; + + if (event != NETDEV_CHANGEUPPER && event != NETDEV_CHANGELOWERSTATE) + return NOTIFY_DONE; + + if (event == NETDEV_CHANGEUPPER) { + support = check_bond_support(hr_dev, &upper_dev, ptr); + if (support == BOND_NOT_SUPPORT) + return NOTIFY_DONE; + } else { + upper_dev = get_upper_dev_from_ndev(net_dev); + } + + if (upper_dev && !is_hrdev_bond_slave(hr_dev, upper_dev)) + return NOTIFY_DONE; + else if (!upper_dev && hr_dev != hns_roce_get_hrdev_by_netdev(net_dev)) + return NOTIFY_DONE; + + bond_grp = hns_roce_get_bond_grp(hr_dev); + if (event == NETDEV_CHANGEUPPER) { + if (!bond_grp) { + bond_grp = hns_roce_alloc_bond_grp(hr_dev, upper_dev); + if (!bond_grp) { + ibdev_err(&hr_dev->ib_dev, + "failed to alloc RoCE bond_grp!\n"); + return NOTIFY_DONE; + } + } else if (hr_dev != bond_grp->main_hr_dev) { + return NOTIFY_DONE; + } + if (support == BOND_EXISTING_NOT_SUPPORT) { + bond_grp->bond_ready = false; + hns_roce_queue_bond_work(bond_grp, HZ); + return NOTIFY_DONE; + } + changed = hns_roce_bond_upper_event(bond_grp, ptr); + } else { + changed = hns_roce_bond_lowerstate_event(hr_dev, bond_grp, ptr); + } + if (changed) + hns_roce_queue_bond_work(bond_grp, HZ); + + return NOTIFY_DONE; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h new file mode 100644 index 000000000..94ee5bf36 --- /dev/null +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022 Hisilicon Limited. 
+ */ + +#ifndef _HNS_ROCE_BOND_H +#define _HNS_ROCE_BOND_H + +#include <linux/netdevice.h> +#include <net/bonding.h> + +#define ROCE_BOND_FUNC_MAX 4 +#define ROCE_BOND_NUM_MAX 2 + +#define BOND_ID(id) BIT(id) + +enum { + BOND_MODE_1, + BOND_MODE_2_4, +}; + +enum bond_support_type { + BOND_NOT_SUPPORT, + /* + * bond_grp already exists, but in the current + * conditions it's no longer supported + */ + BOND_EXISTING_NOT_SUPPORT, + BOND_SUPPORT, +}; + +enum hns_roce_bond_state { + HNS_ROCE_BOND_NOT_BONDED, + HNS_ROCE_BOND_IS_BONDED, + HNS_ROCE_BOND_REGISTERING, + HNS_ROCE_BOND_SLAVE_INC, + HNS_ROCE_BOND_SLAVE_DEC, + HNS_ROCE_BOND_SLAVE_CHANGESTATE, +}; + +enum hns_roce_bond_cmd_type { + HNS_ROCE_SET_BOND, + HNS_ROCE_CHANGE_BOND, + HNS_ROCE_CLEAR_BOND, +}; + +struct hns_roce_func_info { + struct net_device *net_dev; + struct hnae3_handle *handle; +}; + +struct hns_roce_bond_group { + struct net_device *upper_dev; + struct hns_roce_dev *main_hr_dev; + u8 active_slave_num; + u32 slave_map; + u32 active_slave_map; + u32 slave_map_diff; + u8 bond_id; + u8 bus_num; + struct bonding *bond; + bool bond_ready; + enum hns_roce_bond_state bond_state; + enum netdev_lag_tx_type tx_type; + /* + * A mutex which protect bond_grp info + */ + struct mutex bond_mutex; + struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; + struct delayed_work bond_work; +}; + +struct hns_roce_die_info { + u8 bond_id_mask; + struct hns_roce_bond_group *bgrps[ROCE_BOND_NUM_MAX]; +}; + +int hns_roce_bond_init(struct hns_roce_dev *hr_dev); +int hns_roce_bond_event(struct notifier_block *self, + unsigned long event, void *ptr); +int hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp); +bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); +struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev); +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev); + +#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
b/drivers/infiniband/hw/hns/hns_roce_device.h index 5f69d1f8e..97ac4cac2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -36,6 +36,7 @@ #include <rdma/ib_verbs.h> #include <rdma/hns-abi.h> #include "hns_roce_debugfs.h" +#include "hns_roce_bond.h"
#define PCI_REVISION_ID_HIP08 0x21 #define PCI_REVISION_ID_HIP09 0x30 @@ -147,6 +148,7 @@ enum { HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), HNS_ROCE_CAP_FLAG_STASH = BIT(17), HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), + HNS_ROCE_CAP_FLAG_BOND = BIT(21), HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), };
@@ -969,6 +971,9 @@ struct hns_roce_hw { enum hns_roce_scc_algo algo); int (*query_scc_param)(struct hns_roce_dev *hr_dev, enum hns_roce_scc_algo alog); + int (*bond_init)(struct hns_roce_dev *hr_dev); + bool (*bond_is_active)(struct hns_roce_dev *hr_dev); + struct net_device *(*get_bond_netdev)(struct hns_roce_dev *hr_dev); };
#define HNS_ROCE_SCC_PARAM_SIZE 4 @@ -1045,6 +1050,7 @@ struct hns_roce_dev { struct hns_roce_dev_debugfs dbgfs; atomic64_t *dfx_cnt; struct hns_roce_scc_param *scc_param; + struct notifier_block bond_nb; };
static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) @@ -1320,7 +1326,7 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type); void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type); void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev); int hns_roce_init(struct hns_roce_dev *hr_dev); -void hns_roce_exit(struct hns_roce_dev *hr_dev); +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup); int hns_roce_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ib_cq); int hns_roce_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ib_qp); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ec3f78586..d2cdaed84 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -43,6 +43,7 @@ #include <rdma/uverbs_ioctl.h>
#include "hnae3.h" +#include "hclge_main.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_cmd.h" @@ -1363,6 +1364,61 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, return ret; }
+static inline enum hns_roce_opcode_type + get_bond_opcode(enum hns_roce_bond_cmd_type bond_type) +{ + if (bond_type == HNS_ROCE_SET_BOND) + return HNS_ROCE_OPC_SET_BOND_INFO; + else if (bond_type == HNS_ROCE_CHANGE_BOND) + return HNS_ROCE_OPC_CHANGE_ACTIVE_PORT; + else + return HNS_ROCE_OPC_CLEAR_BOND_INFO; +} + +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type) +{ + enum hns_roce_opcode_type opcode = get_bond_opcode(bond_type); + struct hns_roce_bond_info *slave_info; + struct hns_roce_cmq_desc desc = {}; + int ret; + + slave_info = (struct hns_roce_bond_info *)desc.data; + hns_roce_cmq_setup_basic_desc(&desc, opcode, false); + + slave_info->bond_id = cpu_to_le32(bond_grp->bond_id); + if (bond_type == HNS_ROCE_CLEAR_BOND) + goto out; + + if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_1); + if (bond_grp->active_slave_num != 1) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "active slave cnt(%d) in Mode 1 is invalid.\n", + bond_grp->active_slave_num); + } else { + slave_info->bond_mode = cpu_to_le32(BOND_MODE_2_4); + slave_info->hash_policy = + cpu_to_le32(bond_grp->bond->params.xmit_policy); + } + + slave_info->active_slave_cnt = + cpu_to_le32(bond_grp->active_slave_num); + slave_info->active_slave_mask = + cpu_to_le32(bond_grp->active_slave_map); + slave_info->slave_mask = + cpu_to_le32(bond_grp->slave_map); + +out: + ret = hns_roce_cmq_send(bond_grp->main_hr_dev, &desc, 1); + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "cmq bond type(%d) failed, ret = %d.\n", + bond_type, ret); + + return ret; +} + static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, dma_addr_t base_addr, u8 cmd, unsigned long tag) { @@ -6842,6 +6898,9 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops, .config_scc_param = hns_roce_v2_config_scc_param, .query_scc_param = hns_roce_v2_query_scc_param, + .bond_init = 
hns_roce_bond_init, + .bond_is_active = hns_roce_bond_is_active, + .get_bond_netdev = hns_roce_get_bond_netdev, };
static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { @@ -6896,6 +6955,34 @@ static void hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, priv->handle = handle; }
+static bool check_vf_support(struct pci_dev *vf) +{ + struct hns_roce_bond_group *bond_grp; + struct pci_dev *pf = pci_physfn(vf); + struct hnae3_ae_dev *ae_dev; + struct hnae3_handle *handle; + struct hns_roce_dev *hr_dev; + struct hclge_dev *hdev; + + /* pf == vf means that the driver is running in VM. */ + if (pf == vf) + return true; + + ae_dev = pci_get_drvdata(pf); + hdev = ae_dev->priv; + handle = &hdev->vport[0].roce; + hr_dev = handle->priv; + + if (!hr_dev) + return false; + + bond_grp = hns_roce_get_bond_grp(hr_dev); + if (bond_grp) + return false; + + return true; +} + static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) { struct hns_roce_dev *hr_dev; @@ -6913,6 +7000,11 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
hns_roce_hw_v2_get_cfg(hr_dev, handle);
+ if (hr_dev->is_vf && !check_vf_support(hr_dev->pci_dev)) { + ret = -EOPNOTSUPP; + goto error_failed_roce_init; + } + ret = hns_roce_init(hr_dev); if (ret) { dev_err(hr_dev->dev, "RoCE Engine init failed!\n"); @@ -6932,7 +7024,7 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) return 0;
error_failed_free_mr_init: - hns_roce_exit(hr_dev); + hns_roce_exit(hr_dev, true);
error_failed_roce_init: kfree(hr_dev->priv); @@ -6944,7 +7036,7 @@ error_failed_kzalloc: }
static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, - bool reset) + bool reset, bool bond_cleanup) { struct hns_roce_dev *hr_dev = handle->priv;
@@ -6959,7 +7051,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) free_mr_exit(hr_dev);
- hns_roce_exit(hr_dev); + hns_roce_exit(hr_dev, bond_cleanup); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } @@ -7015,7 +7107,37 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT;
- __hns_roce_hw_v2_uninit_instance(handle, reset); + __hns_roce_hw_v2_uninit_instance(handle, reset, true); + + handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; +} + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle; + int ret; + + handle = bond_grp->bond_func_info[func_idx].handle; + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) + return NULL; + + return handle->priv; +} + +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx) +{ + struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle; + + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) + return; + + handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; + + __hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; } @@ -7080,7 +7202,7 @@ static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) handle->rinfo.reset_state = HNS_ROCE_STATE_RST_UNINIT; dev_info(&handle->pdev->dev, "In reset process RoCE client uninit.\n"); msleep(HNS_ROCE_V2_HW_RST_UNINT_DELAY); - __hns_roce_hw_v2_uninit_instance(handle, false); + __hns_roce_hw_v2_uninit_instance(handle, false, false);
return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 3167003c4..074e1f290 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -226,6 +226,9 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, + HNS_ROCE_OPC_SET_BOND_INFO = 0x8601, + HNS_ROCE_OPC_CLEAR_BOND_INFO = 0x8602, + HNS_ROCE_OPC_CHANGE_ACTIVE_PORT = 0x8603, };
enum { @@ -1571,7 +1574,23 @@ struct hns_roce_sccc_clr_done { __le32 rsv[5]; };
+struct hns_roce_bond_info { + __le32 bond_id; + __le32 bond_mode; + __le32 active_slave_cnt; + __le32 active_slave_mask; + __le32 slave_mask; + __le32 hash_policy; +}; + +struct hns_roce_dev + *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, + int func_idx); +void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx); int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, + enum hns_roce_bond_cmd_type bond_type);
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 4e9ef6f02..68bd08c87 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -37,9 +37,36 @@ #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_cache.h> + +#include "hnae3.h" #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" +#include "hns_roce_hw_v2.h" + +static struct net_device *hns_roce_get_netdev(struct ib_device *ib_dev, + u32 port_num) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct net_device *ndev; + + if (port_num < 1 || port_num > hr_dev->caps.num_ports) + return NULL; + + ndev = hr_dev->hw->get_bond_netdev(hr_dev); + + rcu_read_lock(); + + if (!ndev) + ndev = hr_dev->iboe.netdevs[port_num - 1]; + + if (ndev) + dev_hold(ndev); + + rcu_read_unlock(); + + return ndev; +}
static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u32 port, const u8 *addr) @@ -262,7 +289,9 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
spin_lock_irqsave(&hr_dev->iboe.lock, flags);
- net_dev = hr_dev->iboe.netdevs[port]; + net_dev = hr_dev->hw->get_bond_netdev(hr_dev); + if (!net_dev) + net_dev = hr_dev->iboe.netdevs[port]; if (!net_dev) { spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); dev_err(dev, "find netdev %u failed!\n", port); @@ -615,9 +644,18 @@ static int hns_roce_get_hw_stats(struct ib_device *device, return num_counters; }
-static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) +static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, + bool bond_cleanup) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + struct hns_roce_bond_group *bond_grp; + + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + unregister_netdevice_notifier(&hr_dev->bond_nb); + bond_grp = hns_roce_get_bond_grp(hr_dev); + if (bond_grp && bond_cleanup) + hns_roce_cleanup_bond(bond_grp); + }
hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); @@ -647,6 +685,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .disassociate_ucontext = hns_roce_disassociate_ucontext, .get_dma_mr = hns_roce_get_dma_mr, .get_link_layer = hns_roce_get_link_layer, + .get_netdev = hns_roce_get_netdev, .get_port_immutable = hns_roce_port_immutable, .mmap = hns_roce_mmap, .mmap_free = hns_roce_free_mmap, @@ -766,7 +805,12 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) return ret; } dma_set_max_seg_size(dev, UINT_MAX); - ret = ib_register_device(ib_dev, "hns_%d", dev); + + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) && + (hr_dev->hw->bond_is_active(hr_dev))) + ret = ib_register_device(ib_dev, "hns_bond_%d", dev); + else + ret = ib_register_device(ib_dev, "hns_%d", dev); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; @@ -785,9 +829,26 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) goto error_failed_setup_mtu_mac; }
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + ret = hr_dev->hw->bond_init(hr_dev); + if (ret) { + dev_err(dev, "roce bond init failed, ret = %d\n", ret); + /* For non-bond devices, the failure of bond_init does + * not affect other functions. + */ + if (hr_dev->hw->bond_is_active(hr_dev)) + goto error_bond_init; + else + ret = 0; + } + } + hr_dev->active = true; - return 0;
+ return ret; + +error_bond_init: + unregister_netdevice_notifier(&iboe->nb); error_failed_setup_mtu_mac: ib_unregister_device(ib_dev);
@@ -1164,11 +1225,11 @@ error_failed_alloc_dfx_cnt: return ret; }
-void hns_roce_exit(struct hns_roce_dev *hr_dev) +void hns_roce_exit(struct hns_roce_dev *hr_dev, bool bond_cleanup) { hns_roce_unregister_sysfs(hr_dev); + hns_roce_unregister_device(hr_dev, bond_cleanup); hns_roce_unregister_debugfs(hr_dev); - hns_roce_unregister_device(hr_dev);
if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev);
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
---------------------------------------------------------------
For RoCE bonding, the IB port state should depend on the link status of the upper device. When the upper device is link up, the IB port state should be IB_PORT_ACTIVE; otherwise, the state should be IB_PORT_DOWN.
In particular, when all slaves are link down, the upper device will become link down automatically, and when at least one slave becomes link up, the upper device will become link up. In these situations the IB port state will also change accordingly.
Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+)
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 68bd08c87..d35ccaf56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -260,6 +260,19 @@ static int hns_roce_query_device(struct ib_device *ib_dev, return 0; }
+static enum ib_port_state get_upper_port_state(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_bond_group *bond_grp; + struct net_device *upper; + + bond_grp = hns_roce_get_bond_grp(hr_dev); + upper = bond_grp ? bond_grp->upper_dev : NULL; + if (upper) + return get_port_state(upper); + + return IB_PORT_ACTIVE; +} + static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, struct ib_port_attr *props) { @@ -305,6 +318,11 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num, IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
+ if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND && + props->state == IB_PORT_ACTIVE) + props->state = get_upper_port_state(hr_dev); + + spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
return 0;
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
---------------------------------------------------------------
Support dispatching IB events for RoCE bonding. After setting bond, IB_EVENT_PORT_ERR is dispatched in the following situations: 1. bond0 becomes link down; 2. all slaves become link down (as it will lead to a link-down of the upper device).
IB_EVENT_PORT_ACTIVE is dispatched in the following situations: 1. bond0 becomes link up; 2. one slave becomes link up when all slaves were link down (as it will lead to a link-up of the upper device).
Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_bond.c | 17 +++++++++-------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 10 ++++++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index d6ebf41c9..fc179ea52 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -138,17 +138,18 @@ struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev) if (bond_grp->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { net_dev = bond_grp->bond_func_info[i].net_dev; - if (net_dev && is_active_slave(net_dev, bond_grp)) - break; - } - } else { - for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { - net_dev = bond_grp->bond_func_info[i].net_dev; - if (net_dev && get_port_state(net_dev) == IB_PORT_ACTIVE) - break; + if (net_dev && is_active_slave(net_dev, bond_grp) && + get_port_state(net_dev) == IB_PORT_ACTIVE) + goto out; } }
+ for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && get_port_state(net_dev) == IB_PORT_ACTIVE) + break; + } + out: mutex_unlock(&bond_grp->bond_mutex);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d2cdaed84..de762e4d5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7234,6 +7234,7 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, { struct net_device *netdev = handle->rinfo.netdev; struct hns_roce_dev *hr_dev = handle->priv; + struct hns_roce_bond_group *bond_grp; struct ib_event event; unsigned long flags; u8 phy_port; @@ -7241,6 +7242,15 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, if (linkup || !hr_dev) return;
+ /* For bond device, the link status depends on the upper netdev, + * and the upper device's link status depends on all the slaves' + * netdev but not only one. So bond device cannot get a correct + * link status from this path. + */ + bond_grp = hns_roce_get_bond_grp(hr_dev); + if (bond_grp) + return; + for (phy_port = 0; phy_port < hr_dev->caps.num_ports; phy_port++) if (netdev == hr_dev->iboe.netdevs[phy_port]) break; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d35ccaf56..6e8196127 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -115,17 +115,16 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) return ret; }
-static int handle_en_event(struct hns_roce_dev *hr_dev, u32 port, - unsigned long dev_event) +static int handle_en_event(struct net_device *netdev, + struct hns_roce_dev *hr_dev, + u32 port, unsigned long dev_event) { struct device *dev = hr_dev->dev; enum ib_port_state port_state; - struct net_device *netdev; struct ib_event event; unsigned long flags; int ret = 0;
- netdev = hr_dev->iboe.netdevs[port]; if (!netdev) { dev_err(dev, "can't find netdev on port(%u)!\n", port); return -ENODEV; @@ -173,17 +172,24 @@ static int hns_roce_netdev_event(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct hns_roce_bond_group *bond_grp; struct hns_roce_ib_iboe *iboe = NULL; struct hns_roce_dev *hr_dev = NULL; + struct net_device *upper = NULL; int ret; u32 port;
hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); iboe = &hr_dev->iboe; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { + bond_grp = hns_roce_get_bond_grp(hr_dev); + upper = bond_grp ? bond_grp->upper_dev : NULL; + }
for (port = 0; port < hr_dev->caps.num_ports; port++) { - if (dev == iboe->netdevs[port]) { - ret = handle_en_event(hr_dev, port, event); + if ((!upper && dev == iboe->netdevs[port]) || + (upper && dev == upper)) { + ret = handle_en_event(dev, hr_dev, port, event); if (ret) return NOTIFY_DONE; break;
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
--------------------------------------------------------------------------
Add 2 inline functions to obtain netdev and bus_num from an hr_dev to improve readability.
Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_ah.c | 1 - drivers/infiniband/hw/hns/hns_roce_bond.c | 11 +++++------ drivers/infiniband/hw/hns/hns_roce_device.h | 12 +++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 22 ++++++++++----------- drivers/infiniband/hw/hns/hns_roce_pd.c | 1 - drivers/infiniband/hw/hns/hns_roce_qp.c | 5 +++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 1 - 8 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 44c65332c..a1adf249b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -30,7 +30,6 @@ * SOFTWARE. */
-#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include "hnae3.h" diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index fc179ea52..a35b982ec 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -3,7 +3,6 @@ * Copyright (c) 2022 Hisilicon Limited. */
-#include <linux/pci.h> #include "hnae3.h" #include "hns_roce_device.h" #include "hns_roce_hw_v2.h" @@ -64,7 +63,7 @@ static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, if (!netif_is_lag_master(upper_dev)) return false;
- if (upper_dev == get_upper_dev_from_ndev(hr_dev->iboe.netdevs[0])) + if (upper_dev == get_upper_dev_from_ndev(get_hr_netdev(hr_dev, 0))) return true;
bond_grp = hns_roce_get_bond_grp(hr_dev); @@ -77,7 +76,8 @@ static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev) { struct hns_roce_die_info *die_info = - xa_load(&roce_bond_xa, hr_dev->pci_dev->bus->number); + xa_load(&roce_bond_xa, get_hr_bus_num(hr_dev)); + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_bond_group *bond_grp; int i;
@@ -88,9 +88,8 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev) bond_grp = die_info->bgrps[i]; if (!bond_grp) continue; - if (is_netdev_bond_slave(hr_dev->iboe.netdevs[0], bond_grp) || - bond_grp->upper_dev == - get_upper_dev_from_ndev(hr_dev->iboe.netdevs[0])) + if (is_netdev_bond_slave(net_dev, bond_grp) || + (bond_grp->upper_dev == get_upper_dev_from_ndev(net_dev))) return bond_grp; }
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 97ac4cac2..8bd3bad64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -33,6 +33,7 @@ #ifndef _HNS_ROCE_DEVICE_H #define _HNS_ROCE_DEVICE_H
+#include <linux/pci.h> #include <rdma/ib_verbs.h> #include <rdma/hns-abi.h> #include "hns_roce_debugfs.h" @@ -1197,6 +1198,17 @@ static inline enum ib_port_state get_port_state(struct net_device *net_dev) IB_PORT_ACTIVE : IB_PORT_DOWN; }
+static inline struct net_device *get_hr_netdev(struct hns_roce_dev *hr_dev, + u8 port) +{ + return hr_dev->iboe.netdevs[port]; +} + +static inline u8 get_hr_bus_num(struct hns_roce_dev *hr_dev) +{ + return hr_dev->pci_dev->bus->number; +} + extern const struct attribute_group *hns_attr_port_groups[];
void hns_roce_init_uar_table(struct hns_roce_dev *dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index de762e4d5..b85b061fe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7252,7 +7252,7 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, return;
for (phy_port = 0; phy_port < hr_dev->caps.num_ports; phy_port++) - if (netdev == hr_dev->iboe.netdevs[phy_port]) + if (netdev == get_hr_netdev(hr_dev, phy_port)) break;
if (phy_port == hr_dev->caps.num_ports) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6e8196127..3f4316d71 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -32,7 +32,6 @@ */ #include <linux/acpi.h> #include <linux/module.h> -#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_smi.h> #include <rdma/ib_user_verbs.h> @@ -58,7 +57,7 @@ static struct net_device *hns_roce_get_netdev(struct ib_device *ib_dev, rcu_read_lock();
if (!ndev) - ndev = hr_dev->iboe.netdevs[port_num - 1]; + ndev = get_hr_netdev(hr_dev, port_num - 1);
if (ndev) dev_hold(ndev); @@ -201,14 +200,14 @@ static int hns_roce_netdev_event(struct notifier_block *self,
static int hns_roce_setup_mtu_mac(struct hns_roce_dev *hr_dev) { + struct net_device *net_dev; int ret; u8 i;
for (i = 0; i < hr_dev->caps.num_ports; i++) { hr_dev->iboe.port_state[i] = IB_PORT_DOWN; - - ret = hns_roce_set_mac(hr_dev, i, - hr_dev->iboe.netdevs[i]->dev_addr); + net_dev = get_hr_netdev(hr_dev, i); + ret = hns_roce_set_mac(hr_dev, i, net_dev->dev_addr); if (ret) return ret; } @@ -310,7 +309,7 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u32 port_num,
net_dev = hr_dev->hw->get_bond_netdev(hr_dev); if (!net_dev) - net_dev = hr_dev->iboe.netdevs[port]; + net_dev = get_hr_netdev(hr_dev, port); if (!net_dev) { spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); dev_err(dev, "find netdev %u failed!\n", port); @@ -777,11 +776,12 @@ static const struct ib_device_ops hns_roce_dev_restrack_ops = {
static int hns_roce_register_device(struct hns_roce_dev *hr_dev) { - int ret; struct hns_roce_ib_iboe *iboe = NULL; - struct ib_device *ib_dev = NULL; struct device *dev = hr_dev->dev; + struct ib_device *ib_dev = NULL; + struct net_device *net_dev; unsigned int i; + int ret;
iboe = &hr_dev->iboe; spin_lock_init(&iboe->lock); @@ -820,11 +820,11 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, &hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_restrack_ops); for (i = 0; i < hr_dev->caps.num_ports; i++) { - if (!hr_dev->iboe.netdevs[i]) + net_dev = get_hr_netdev(hr_dev, i); + if (!net_dev) continue;
- ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i], - i + 1); + ret = ib_device_set_netdev(ib_dev, net_dev, i + 1); if (ret) return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index d35cf59d0..225c3e328 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -30,7 +30,6 @@ * SOFTWARE. */
-#include <linux/pci.h> #include "hns_roce_device.h"
void hns_roce_init_pd_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index a1429af9c..0b7064b0a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,7 +31,6 @@ * SOFTWARE. */
-#include <linux/pci.h> #include <rdma/ib_addr.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> @@ -1318,11 +1317,13 @@ static int check_mtu_validate(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_attr *attr, int attr_mask) { + struct net_device *net_dev; enum ib_mtu active_mtu; int p;
p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port; - active_mtu = iboe_get_mtu(hr_dev->iboe.netdevs[p]->mtu); + net_dev = get_hr_netdev(hr_dev, p); + active_mtu = iboe_get_mtu(net_dev->mtu);
if ((hr_dev->caps.max_mtu >= IB_MTU_2048 && attr->path_mtu > hr_dev->caps.max_mtu) || diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 4abae9477..31f100211 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -3,7 +3,6 @@ * Copyright (c) 2018 Hisilicon Limited. */
-#include <linux/pci.h> #include <rdma/ib_umem.h> #include <rdma/uverbs_ioctl.h> #include "hns_roce_device.h"
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
---------------------------------------------------------------
Currently, a RoCE bond device cannot be recovered to a bond device after reset.
With this patch applied, the RoCE bonding device 'hns_bond_xx' can be recovered after reset. The patch includes the following changes: 1. Modify the condition for judging whether bond_grp is active, as the bond_grp may also be holding HNS_ROCE_CHANGE_BOND during reset init. Thus, as long as the bond_grp's state is not HNS_ROCE_BOND_NOT_BONDED, it should be considered active. 2. Update the link status of each slave in bond_grp from the NIC bonding driver right before sending the command to firmware, as the RoCE driver is uninited for a while in the reset process, and during this period bond_grp cannot update the information. 3. After the reset, re-config the bond_grp information to firmware, as the firmware is also reset and the previous configuration is cleared.
Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_bond.c | 32 ++++++++++++++++++++--- drivers/infiniband/hw/hns/hns_roce_main.c | 10 +++++-- 2 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index a35b982ec..6fdd8e9fd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -100,9 +100,7 @@ bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) { struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev);
- if (bond_grp && - (bond_grp->bond_state == HNS_ROCE_BOND_REGISTERING || - bond_grp->bond_state == HNS_ROCE_BOND_IS_BONDED)) + if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED) return true;
return false; @@ -186,6 +184,13 @@ static void hns_roce_bond_get_active_slave(struct hns_roce_bond_group *bond_grp) bond_grp->active_slave_map = active_slave_map; }
+static int hns_roce_recover_bond(struct hns_roce_bond_group *bond_grp) +{ + hns_roce_bond_get_active_slave(bond_grp); + + return hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); +} + static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) { struct hns_roce_dev *hr_dev = NULL; @@ -355,6 +360,9 @@ static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) enum hns_roce_bond_state bond_state = bond_grp->bond_state; bool bond_ready = bond_grp->bond_ready;
+ if (!bond_grp->main_hr_dev) + return; + ibdev_info(&bond_grp->main_hr_dev->ib_dev, "do_bond: bond_ready - %d, bond_state - %d.\n", bond_ready, bond_grp->bond_state); @@ -403,13 +411,29 @@ void hns_roce_do_bond_work(struct work_struct *work)
int hns_roce_bond_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + struct hns_roce_v2_priv *priv = hr_dev->priv; int ret;
+ if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT && + bond_grp) { + bond_grp->main_hr_dev = hr_dev; + ret = hns_roce_recover_bond(bond_grp); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to recover RoCE bond, ret = %d.\n", + ret); + return ret; + } + bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; + } + hr_dev->bond_nb.notifier_call = hns_roce_bond_event; ret = register_netdevice_notifier(&hr_dev->bond_nb); if (ret) { ibdev_err(&hr_dev->ib_dev, - "failed to register notifier for RoCE bond!\n"); + "failed to register notifier for RoCE bond, ret = %d.\n", + ret); hr_dev->bond_nb.notifier_call = NULL; }
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 3f4316d71..22835c331 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -671,13 +671,19 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, bool bond_cleanup) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_bond_group *bond_grp;
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { unregister_netdevice_notifier(&hr_dev->bond_nb); bond_grp = hns_roce_get_bond_grp(hr_dev); - if (bond_grp && bond_cleanup) - hns_roce_cleanup_bond(bond_grp); + if (bond_grp) { + if (bond_cleanup) + hns_roce_cleanup_bond(bond_grp); + else if (priv->handle->rinfo.reset_state == + HNS_ROCE_STATE_RST_UNINIT) + bond_grp->main_hr_dev = NULL; + } }
hr_dev->active = false;
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
--------------------------------------------------------------------------
When rmmod hns3, the uninit procedure is in this order: pf0 roce uninit instance, pf0 nic uninit instance, pf1 roce uninit instance, pf1 nic uninit instance, and so on.
During pf0 nic uninit instance, the pf0 netdev is unregistered and the RoCE bonding driver will be notified by a bonding event. Then a clear-bond work will be scheduled.
At this time, the clear-bond work and pf1 roce uninit instance are being executed concurrently. As the clear-bond work modifies the instance state of pf1 earlier, pf1 roce uninit instance will return as soon as it finds the state changed. As a result, pf1 nic uninit instance can start early enough to be completed before the clear-bond work. When the clear-bond work then accesses pf1 nic resources which have already been released, an error occurs.
To fix the error, add a new instance state to indicate an ongoing bond work involving bonding uninit. The roce driver uninit instance will wait for the completion of the bond work when the device being uninited is also in the procedure of bonding uninit to avoid concurrency and make sure the nic resources won't be released for the moment.
Fixes: e62a20278f18 ("RDMA/hns: support RoCE bonding") Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_bond.c | 136 ++++++++++++-------- drivers/infiniband/hw/hns/hns_roce_bond.h | 7 +- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 24 +++- drivers/infiniband/hw/hns/hns_roce_main.c | 11 +- 5 files changed, 121 insertions(+), 58 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index 6fdd8e9fd..38f2326f9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -56,6 +56,8 @@ static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, struct net_device *upper_dev) { struct hns_roce_bond_group *bond_grp; + struct net_device *net_dev; + u8 bus_num;
if (!hr_dev || !upper_dev) return false; @@ -63,21 +65,23 @@ static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, if (!netif_is_lag_master(upper_dev)) return false;
- if (upper_dev == get_upper_dev_from_ndev(get_hr_netdev(hr_dev, 0))) + net_dev = get_hr_netdev(hr_dev, 0); + bus_num = get_hr_bus_num(hr_dev); + + if (upper_dev == get_upper_dev_from_ndev(net_dev)) return true;
- bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); if (bond_grp && upper_dev == bond_grp->upper_dev) return true;
return false; }
-struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev) +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num) { - struct hns_roce_die_info *die_info = - xa_load(&roce_bond_xa, get_hr_bus_num(hr_dev)); - struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_die_info *die_info = xa_load(&roce_bond_xa, bus_num); struct hns_roce_bond_group *bond_grp; int i;
@@ -98,7 +102,11 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev)
bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev) { - struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); + + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num);
if (bond_grp && bond_grp->bond_state != HNS_ROCE_BOND_NOT_BONDED) return true; @@ -117,13 +125,15 @@ static inline bool is_active_slave(struct net_device *net_dev,
struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev) { - struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); - struct net_device *net_dev = NULL; + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); int i;
if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) return NULL;
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); if (!bond_grp) return NULL;
@@ -144,9 +154,10 @@ struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev) for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { net_dev = bond_grp->bond_func_info[i].net_dev; if (net_dev && get_port_state(net_dev) == IB_PORT_ACTIVE) - break; + goto out; }
+ net_dev = NULL; out: mutex_unlock(&bond_grp->bond_mutex);
@@ -205,6 +216,7 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) }
bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; + bond_grp->main_hr_dev = NULL;
for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { net_dev = bond_grp->bond_func_info[i].net_dev; @@ -216,19 +228,21 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp) } } } - if (!hr_dev) - return;
bond_grp->slave_map_diff = 0; hns_roce_bond_get_active_slave(bond_grp); - ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND); - if (ret) { - ibdev_err(&hr_dev->ib_dev, "failed to set RoCE bond!\n"); - return; - } + + ret = bond_grp->main_hr_dev ? + hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND) : -EIO;
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; - ibdev_info(&hr_dev->ib_dev, "RoCE set bond finished!\n"); + complete(&bond_grp->bond_work_done); + + if (ret) + BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE set bond finished!\n"); }
static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) @@ -269,15 +283,17 @@ static void hns_roce_slave_changestate(struct hns_roce_bond_group *bond_grp) hns_roce_bond_get_active_slave(bond_grp);
ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); - if (ret) { - ibdev_err(&bond_grp->main_hr_dev->ib_dev, - "failed to change RoCE bond slave state!\n"); - return; - }
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; - ibdev_info(&bond_grp->main_hr_dev->ib_dev, - "RoCE slave changestate finished!\n"); + complete(&bond_grp->bond_work_done); + + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to change RoCE bond slave state, ret = %d.\n", + ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave changestate finished!\n"); }
static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp) @@ -295,16 +311,18 @@ static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp)
bond_grp->slave_map_diff = 0; hns_roce_bond_get_active_slave(bond_grp); + ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); - if (ret) { - ibdev_err(&bond_grp->main_hr_dev->ib_dev, - "failed to increase RoCE bond slave!\n"); - return; - }
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; - ibdev_info(&bond_grp->main_hr_dev->ib_dev, - "RoCE slave increase finished!\n"); + complete(&bond_grp->bond_work_done); + + if (ret) + ibdev_err(&bond_grp->main_hr_dev->ib_dev, + "failed to increase slave, ret = %d.\n", ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave increase finished!\n"); }
static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp) @@ -318,6 +336,7 @@ static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp) int i;
if (dec_slave_map & (1 << main_func_idx)) { + bond_grp->main_hr_dev = NULL; hns_roce_bond_uninit_client(bond_grp, main_func_idx); for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { net_dev = bond_grp->bond_func_info[i].net_dev; @@ -343,16 +362,19 @@ static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)
bond_grp->slave_map_diff = 0; hns_roce_bond_get_active_slave(bond_grp); - ret = hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND); - if (ret) { - ibdev_err(&bond_grp->main_hr_dev->ib_dev, - "failed to decrease RoCE bond slave!\n"); - return; - } + + ret = bond_grp->main_hr_dev ? + hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND) : -EIO;
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; - ibdev_info(&bond_grp->main_hr_dev->ib_dev, - "RoCE slave decrease finished!\n"); + complete(&bond_grp->bond_work_done); + + if (ret) + BOND_ERR_LOG("failed to decrease RoCE bond slave, ret = %d.\n", + ret); + else + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave decrease finished!\n"); }
static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) @@ -367,6 +389,8 @@ static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) "do_bond: bond_ready - %d, bond_state - %d.\n", bond_ready, bond_grp->bond_state);
+ reinit_completion(&bond_grp->bond_work_done); + if (!bond_ready) { hns_roce_clear_bond(bond_grp); return; @@ -411,10 +435,13 @@ void hns_roce_do_bond_work(struct work_struct *work)
int hns_roce_bond_init(struct hns_roce_dev *hr_dev) { - struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); int ret;
+ bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT && bond_grp) { bond_grp->main_hr_dev = hr_dev; @@ -513,21 +540,24 @@ static int remove_bond_id(int bus_num, u8 bond_id)
int hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp) { + bool completion_no_waiter; int ret;
ret = bond_grp->main_hr_dev ? hns_roce_cmd_bond(bond_grp, HNS_ROCE_CLEAR_BOND) : -EIO; if (ret) - ibdev_err(&bond_grp->main_hr_dev->ib_dev, - "failed to clear RoCE bond, ret = %d.\n", ret); + BOND_ERR_LOG("failed to clear RoCE bond, ret = %d.\n", ret);
cancel_delayed_work(&bond_grp->bond_work); ret = remove_bond_id(bond_grp->bus_num, bond_grp->bond_id); if (ret) - ibdev_err(&bond_grp->main_hr_dev->ib_dev, - "failed to remove bond ID %d, ret = %d.\n", - bond_grp->bond_id, ret); - kfree(bond_grp); + BOND_ERR_LOG("failed to remove bond id %u, ret = %d.\n", + bond_grp->bond_id, ret); + + completion_no_waiter = completion_done(&bond_grp->bond_work_done); + complete(&bond_grp->bond_work_done); + if (completion_no_waiter) + kfree(bond_grp);
return ret; } @@ -665,6 +695,8 @@ static struct hns_roce_bond_group *hns_roce_alloc_bond_grp(struct hns_roce_dev *
INIT_DELAYED_WORK(&bond_grp->bond_work, hns_roce_do_bond_work);
+ init_completion(&bond_grp->bond_work_done); + bond_grp->upper_dev = upper_dev; bond_grp->main_hr_dev = main_hr_dev; bond_grp->bond_ready = false; @@ -754,13 +786,14 @@ static enum bond_support_type struct net_device **upper_dev, struct netdev_notifier_changeupper_info *info) { - struct hns_roce_bond_group *bond_grp = hns_roce_get_bond_grp(hr_dev); + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); + struct hns_roce_bond_group *bond_grp; + int bus_num = get_hr_bus_num(hr_dev); bool bond_grp_exist = false; - struct net_device *net_dev; - int bus_num = -1; bool support;
*upper_dev = info->upper_dev; + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); if (bond_grp && *upper_dev == bond_grp->upper_dev) bond_grp_exist = true;
@@ -787,6 +820,7 @@ int hns_roce_bond_event(struct notifier_block *self, container_of(self, struct hns_roce_dev, bond_nb); enum bond_support_type support = BOND_SUPPORT; struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); struct net_device *upper_dev; bool changed;
@@ -806,7 +840,7 @@ int hns_roce_bond_event(struct notifier_block *self, else if (!upper_dev && hr_dev != hns_roce_get_hrdev_by_netdev(net_dev)) return NOTIFY_DONE;
- bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), bus_num); if (event == NETDEV_CHANGEUPPER) { if (!bond_grp) { bond_grp = hns_roce_alloc_bond_grp(hr_dev, upper_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index 94ee5bf36..c9de9315d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -14,6 +14,9 @@
#define BOND_ID(id) BIT(id)
+#define BOND_ERR_LOG(fmt, ...) \ + pr_err("HNS RoCE Bonding: " fmt, ##__VA_ARGS__) \ + enum { BOND_MODE_1, BOND_MODE_2_4, @@ -68,6 +71,7 @@ struct hns_roce_bond_group { struct mutex bond_mutex; struct hns_roce_func_info bond_func_info[ROCE_BOND_FUNC_MAX]; struct delayed_work bond_work; + struct completion bond_work_done; };
struct hns_roce_die_info { @@ -81,6 +85,7 @@ int hns_roce_bond_event(struct notifier_block *self, int hns_roce_cleanup_bond(struct hns_roce_bond_group *bond_grp); bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev); -struct hns_roce_bond_group *hns_roce_get_bond_grp(struct hns_roce_dev *hr_dev); +struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, + u8 bus_num);
#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 8bd3bad64..c08d02c14 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -174,6 +174,7 @@ enum hns_roce_instance_state { HNS_ROCE_STATE_INIT, HNS_ROCE_STATE_INITED, HNS_ROCE_STATE_UNINIT, + HNS_ROCE_STATE_BOND_UNINIT, };
enum { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b85b061fe..a05099aed 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6976,7 +6976,8 @@ static bool check_vf_support(struct pci_dev *vf) if (!hr_dev) return false;
- bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), + pf->bus->number); if (bond_grp) return false;
@@ -7102,6 +7103,19 @@ reset_chk_err: static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { + struct hns_roce_bond_group *bond_grp; + + /* Wait for the completion of bond work to avoid concurrency */ + if (handle->rinfo.instance_state == HNS_ROCE_STATE_BOND_UNINIT) { + bond_grp = hns_roce_get_bond_grp(handle->rinfo.netdev, + handle->pdev->bus->number); + if (bond_grp) { + wait_for_completion(&bond_grp->bond_work_done); + if (bond_grp->bond_state == HNS_ROCE_BOND_NOT_BONDED) + kfree(bond_grp); + } + } + if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) return;
@@ -7135,7 +7149,7 @@ void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) return;
- handle->rinfo.instance_state = HNS_ROCE_STATE_UNINIT; + handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
__hns_roce_hw_v2_uninit_instance(handle, false, false);
@@ -7235,9 +7249,11 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, struct net_device *netdev = handle->rinfo.netdev; struct hns_roce_dev *hr_dev = handle->priv; struct hns_roce_bond_group *bond_grp; + struct net_device *hr_net_dev; struct ib_event event; unsigned long flags; u8 phy_port; + u8 bus_num;
if (linkup || !hr_dev) return; @@ -7247,7 +7263,9 @@ static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, * netdev but not only one. So bond device cannot get a correct * link status from this path. */ - bond_grp = hns_roce_get_bond_grp(hr_dev); + hr_net_dev = get_hr_netdev(hr_dev, 0); + bus_num = get_hr_bus_num(hr_dev); + bond_grp = hns_roce_get_bond_grp(hr_net_dev, bus_num); if (bond_grp) return;
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 22835c331..6960d5b2a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -181,7 +181,8 @@ static int hns_roce_netdev_event(struct notifier_block *self, hr_dev = container_of(self, struct hns_roce_dev, iboe.nb); iboe = &hr_dev->iboe; if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { - bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(get_hr_netdev(hr_dev, 0), + get_hr_bus_num(hr_dev)); upper = bond_grp ? bond_grp->upper_dev : NULL; }
@@ -267,10 +268,12 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
static enum ib_port_state get_upper_port_state(struct hns_roce_dev *hr_dev) { + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev); struct net_device *upper;
- bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); upper = bond_grp ? bond_grp->upper_dev : NULL; if (upper) return get_port_state(upper); @@ -670,13 +673,15 @@ static int hns_roce_get_hw_stats(struct ib_device *device, static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, bool bond_cleanup) { + struct net_device *net_dev = get_hr_netdev(hr_dev, 0); struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_bond_group *bond_grp; + u8 bus_num = get_hr_bus_num(hr_dev);
if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { unregister_netdevice_notifier(&hr_dev->bond_nb); - bond_grp = hns_roce_get_bond_grp(hr_dev); + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); if (bond_grp) { if (bond_cleanup) hns_roce_cleanup_bond(bond_grp);
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
--------------------------------------------------------------------------
When the resource PF of RoCE bonding is unbound, the main_hr_dev will be unregistered and the bond resources will be cleaned up. Currently, the other slaves are not re-initialized, and they are not available until the whole RoCE ko is removed and inserted again.
To fix this problem, re-initialize all the slaves as hns_* devices, except the resource slave itself, before the bond resources are cleaned up.
Fixes: e62a20278f18 ("RDMA/hns: support RoCE bonding") Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_bond.c | 22 +++++++++++---- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++ drivers/infiniband/hw/hns/hns_roce_main.c | 33 ++++++++++++++++------ 3 files changed, 43 insertions(+), 15 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index 38f2326f9..911a2ba28 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -37,19 +37,19 @@ static struct net_device *get_upper_dev_from_ndev(struct net_device *net_dev) return upper_dev; }
-static bool is_netdev_bond_slave(struct net_device *net_dev, - struct hns_roce_bond_group *bond_grp) +static int get_netdev_bond_slave_id(struct net_device *net_dev, + struct hns_roce_bond_group *bond_grp) { int i;
if (!net_dev || !bond_grp) - return false; + return -ENODEV;
for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) if (net_dev == bond_grp->bond_func_info[i].net_dev) - return true; + return i;
- return false; + return -ENOENT; }
static bool is_hrdev_bond_slave(struct hns_roce_dev *hr_dev, @@ -92,7 +92,7 @@ struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, bond_grp = die_info->bgrps[i]; if (!bond_grp) continue; - if (is_netdev_bond_slave(net_dev, bond_grp) || + if (get_netdev_bond_slave_id(net_dev, bond_grp) >= 0 || (bond_grp->upper_dev == get_upper_dev_from_ndev(net_dev))) return bond_grp; } @@ -823,6 +823,7 @@ int hns_roce_bond_event(struct notifier_block *self, u8 bus_num = get_hr_bus_num(hr_dev); struct net_device *upper_dev; bool changed; + int slave_id;
if (event != NETDEV_CHANGEUPPER && event != NETDEV_CHANGELOWERSTATE) return NOTIFY_DONE; @@ -852,6 +853,15 @@ int hns_roce_bond_event(struct notifier_block *self, } else if (hr_dev != bond_grp->main_hr_dev) { return NOTIFY_DONE; } + /* In the case of netdev being unregistered, the roce + * instance shouldn't be inited. + */ + if (net_dev->reg_state >= NETREG_UNREGISTERING) { + slave_id = get_netdev_bond_slave_id(net_dev, bond_grp); + if (slave_id >= 0) + bond_grp->bond_func_info[slave_id].handle = NULL; + } + if (support == BOND_EXISTING_NOT_SUPPORT) { bond_grp->bond_ready = false; hns_roce_queue_bond_work(bond_grp, HZ); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a05099aed..2153590d0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7134,6 +7134,9 @@ struct hns_roce_dev int ret;
handle = bond_grp->bond_func_info[func_idx].handle; + if (!handle || !handle->client) + return NULL; + ret = hns_roce_hw_v2_init_instance(handle); if (ret) return NULL; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6960d5b2a..c3d057222 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -678,19 +678,34 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev, struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_bond_group *bond_grp; u8 bus_num = get_hr_bus_num(hr_dev); + int i;
- if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND) { - unregister_netdevice_notifier(&hr_dev->bond_nb); - bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); - if (bond_grp) { - if (bond_cleanup) - hns_roce_cleanup_bond(bond_grp); - else if (priv->handle->rinfo.reset_state == - HNS_ROCE_STATE_RST_UNINIT) - bond_grp->main_hr_dev = NULL; + if (!(hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_BOND)) + goto normal_unregister; + + unregister_netdevice_notifier(&hr_dev->bond_nb); + bond_grp = hns_roce_get_bond_grp(net_dev, bus_num); + if (!bond_grp) + goto normal_unregister; + + if (bond_cleanup) { + /* To avoid the loss of other slave devices when main_hr_dev + * is unregistered, re-initialized the remaining slaves before + * the bond resources cleanup. + */ + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (net_dev && net_dev != iboe->netdevs[0]) + hns_roce_bond_init_client(bond_grp, i); } + hns_roce_cleanup_bond(bond_grp); + } else if (priv->handle->rinfo.reset_state == + HNS_ROCE_STATE_RST_UNINIT) { + bond_grp->main_hr_dev = NULL; }
+normal_unregister: hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); ib_unregister_device(&hr_dev->ib_dev);
From: Junxian Huang huangjunxian6@hisilicon.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I968IB
--------------------------------------------------------------------------
When setting bond and reset run concurrently, once the reset process has finished, the driver detects that the bond resource has already been allocated and thus enters the bond recovery process, where the bond state is set to HNS_ROCE_BOND_IS_BONDED. But at this point the set bond process hasn't been executed yet (i.e. the slaves haven't been uninited). This wrong bond state leads to an abnormal reset result in which 2 slaves are both registered as bond devices.
Thus delete the bond state setting in the bond recovery process. Besides, to fix other potential concurrency errors between bond and reset, some improvements are also added:
1. For the situation that reset occurs before bond work, add a reset check at the beginning of bond work. If there is an ongoing reset process, re-queue the bond work until the reset is finished.
2. For the situation that reset occurs during bond work, add reset checks to bond init/uninit process, treating this situation as an abnormal case.
Fixes: b0f80ad22f96 ("RDMA/hns: Support reset recovery for RoCE bonding") Signed-off-by: Junxian Huang huangjunxian6@hisilicon.com Signed-off-by: Juan Zhou zhoujuan51@h-partners.com --- drivers/infiniband/hw/hns/hns_roce_bond.c | 151 +++++++++++++++------ drivers/infiniband/hw/hns/hns_roce_bond.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 14 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 +- 4 files changed, 126 insertions(+), 44 deletions(-)
diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.c b/drivers/infiniband/hw/hns/hns_roce_bond.c index 911a2ba28..146eeb7f4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.c +++ b/drivers/infiniband/hw/hns/hns_roce_bond.c @@ -211,8 +211,11 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
for (i = ROCE_BOND_FUNC_MAX - 1; i >= 0; i--) { net_dev = bond_grp->bond_func_info[i].net_dev; - if (net_dev) - hns_roce_bond_uninit_client(bond_grp, i); + if (net_dev) { + ret = hns_roce_bond_uninit_client(bond_grp, i); + if (ret) + goto set_err; + } }
bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; @@ -234,15 +237,19 @@ static void hns_roce_set_bond(struct hns_roce_bond_group *bond_grp)
ret = bond_grp->main_hr_dev ? hns_roce_cmd_bond(bond_grp, HNS_ROCE_SET_BOND) : -EIO; + if (ret) + goto set_err;
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; complete(&bond_grp->bond_work_done); + ibdev_info(&bond_grp->main_hr_dev->ib_dev, "RoCE set bond finished!\n");
- if (ret) - BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); - else - ibdev_info(&bond_grp->main_hr_dev->ib_dev, - "RoCE set bond finished!\n"); + return; + +set_err: + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + BOND_ERR_LOG("failed to set RoCE bond, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); }
static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) @@ -258,7 +265,11 @@ static void hns_roce_clear_bond(struct hns_roce_bond_group *bond_grp) bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; bond_grp->main_hr_dev = NULL;
- hns_roce_bond_uninit_client(bond_grp, main_func_idx); + ret = hns_roce_bond_uninit_client(bond_grp, main_func_idx); + if (ret) { + BOND_ERR_LOG("failed to uninit bond, ret = %d.\n", ret); + return; + }
for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { net_dev = bond_grp->bond_func_info[i].net_dev; @@ -303,8 +314,15 @@ static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp) int ret;
while (inc_slave_map > 0) { - if (inc_slave_map & 1) - hns_roce_bond_uninit_client(bond_grp, inc_func_idx); + if (inc_slave_map & 1) { + ret = hns_roce_bond_uninit_client(bond_grp, inc_func_idx); + if (ret) { + BOND_ERR_LOG("failed to uninit slave %u, ret = %d.\n", + inc_func_idx, ret); + bond_grp->bond_func_info[inc_func_idx].net_dev = NULL; + bond_grp->slave_map &= ~(1U << inc_func_idx); + } + } inc_slave_map >>= 1; inc_func_idx++; } @@ -325,36 +343,66 @@ static void hns_roce_slave_inc(struct hns_roce_bond_group *bond_grp) "RoCE slave increase finished!\n"); }
+static int switch_main_dev(struct hns_roce_bond_group *bond_grp, + u32 *dec_slave_map, u8 main_func_idx) +{ + struct hns_roce_dev *hr_dev; + struct net_device *net_dev; + int ret; + int i; + + bond_grp->main_hr_dev = NULL; + ret = hns_roce_bond_uninit_client(bond_grp, main_func_idx); + if (ret) { + BOND_ERR_LOG("failed to uninit main dev %u, ret = %d.\n", + main_func_idx, ret); + *dec_slave_map &= ~(1U << main_func_idx); + bond_grp->slave_map |= (1U << main_func_idx); + return ret; + } + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + if (!(*dec_slave_map & (1 << i)) && net_dev) { + bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; + hr_dev = hns_roce_bond_init_client(bond_grp, i); + if (hr_dev) { + bond_grp->main_hr_dev = hr_dev; + break; + } + } + } + + if (!bond_grp->main_hr_dev) + return -ENODEV; + + return 0; +} + static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp) { u8 main_func_idx = PCI_FUNC(bond_grp->main_hr_dev->pci_dev->devfn); u32 dec_slave_map = bond_grp->slave_map_diff; - struct hns_roce_dev *hr_dev; struct net_device *net_dev; u8 dec_func_idx = 0; int ret; - int i;
if (dec_slave_map & (1 << main_func_idx)) { - bond_grp->main_hr_dev = NULL; - hns_roce_bond_uninit_client(bond_grp, main_func_idx); - for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { - net_dev = bond_grp->bond_func_info[i].net_dev; - if (!(dec_slave_map & (1 << i)) && net_dev) { - bond_grp->bond_state = HNS_ROCE_BOND_REGISTERING; - hr_dev = hns_roce_bond_init_client(bond_grp, i); - if (hr_dev) { - bond_grp->main_hr_dev = hr_dev; - break; - } - } - } + ret = switch_main_dev(bond_grp, &dec_slave_map, main_func_idx); + if (ret == -ENODEV) + goto dec_err; }
while (dec_slave_map > 0) { if (dec_slave_map & 1) { + net_dev = bond_grp->bond_func_info[dec_func_idx].net_dev; bond_grp->bond_func_info[dec_func_idx].net_dev = NULL; - hns_roce_bond_init_client(bond_grp, dec_func_idx); + if (!hns_roce_bond_init_client(bond_grp, dec_func_idx)) { + BOND_ERR_LOG("failed to re-init slave %u.\n", + dec_func_idx); + bond_grp->slave_map |= (1U << dec_func_idx); + bond_grp->bond_func_info[dec_func_idx].net_dev = net_dev; + } } dec_slave_map >>= 1; dec_func_idx++; @@ -365,16 +413,20 @@ static void hns_roce_slave_dec(struct hns_roce_bond_group *bond_grp)
ret = bond_grp->main_hr_dev ? hns_roce_cmd_bond(bond_grp, HNS_ROCE_CHANGE_BOND) : -EIO; + if (ret) + goto dec_err;
bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; complete(&bond_grp->bond_work_done); + ibdev_info(&bond_grp->main_hr_dev->ib_dev, + "RoCE slave decrease finished!\n");
- if (ret) - BOND_ERR_LOG("failed to decrease RoCE bond slave, ret = %d.\n", - ret); - else - ibdev_info(&bond_grp->main_hr_dev->ib_dev, - "RoCE slave decrease finished!\n"); + return; + +dec_err: + bond_grp->bond_state = HNS_ROCE_BOND_NOT_BONDED; + BOND_ERR_LOG("failed to decrease RoCE bond slave, ret = %d.\n", ret); + hns_roce_cleanup_bond(bond_grp); }
static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) @@ -414,7 +466,25 @@ static void hns_roce_do_bond(struct hns_roce_bond_group *bond_grp) } }
-void hns_roce_do_bond_work(struct work_struct *work) +bool is_bond_slave_in_reset(struct hns_roce_bond_group *bond_grp) +{ + struct hnae3_handle *handle; + struct net_device *net_dev; + int i; + + for (i = 0; i < ROCE_BOND_FUNC_MAX; i++) { + net_dev = bond_grp->bond_func_info[i].net_dev; + handle = bond_grp->bond_func_info[i].handle; + if (net_dev && handle && + handle->rinfo.reset_state != HNS_ROCE_STATE_NON_RST && + handle->rinfo.reset_state != HNS_ROCE_STATE_RST_INITED) + return true; + } + + return false; +} + +static void hns_roce_do_bond_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct hns_roce_bond_group *bond_grp = @@ -422,15 +492,19 @@ void hns_roce_do_bond_work(struct work_struct *work) bond_work); int status;
+ if (is_bond_slave_in_reset(bond_grp)) + goto queue_work; + status = mutex_trylock(&roce_bond_mutex); - if (!status) { - /* delay 1 sec */ - hns_roce_queue_bond_work(bond_grp, HZ); - return; - } + if (!status) + goto queue_work;
hns_roce_do_bond(bond_grp); mutex_unlock(&roce_bond_mutex); + return; + +queue_work: + hns_roce_queue_bond_work(bond_grp, HZ); }
int hns_roce_bond_init(struct hns_roce_dev *hr_dev) @@ -452,7 +526,6 @@ int hns_roce_bond_init(struct hns_roce_dev *hr_dev) ret); return ret; } - bond_grp->bond_state = HNS_ROCE_BOND_IS_BONDED; }
hr_dev->bond_nb.notifier_call = hns_roce_bond_event; diff --git a/drivers/infiniband/hw/hns/hns_roce_bond.h b/drivers/infiniband/hw/hns/hns_roce_bond.h index c9de9315d..e75fe75f7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_bond.h +++ b/drivers/infiniband/hw/hns/hns_roce_bond.h @@ -87,5 +87,6 @@ bool hns_roce_bond_is_active(struct hns_roce_dev *hr_dev); struct net_device *hns_roce_get_bond_netdev(struct hns_roce_dev *hr_dev); struct hns_roce_bond_group *hns_roce_get_bond_grp(struct net_device *net_dev, u8 bus_num); +bool is_bond_slave_in_reset(struct hns_roce_bond_group *bond_grp);
#endif diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2153590d0..988b4aeda 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7137,6 +7137,9 @@ struct hns_roce_dev if (!handle || !handle->client) return NULL;
+ if (is_bond_slave_in_reset(bond_grp)) + return NULL; + ret = hns_roce_hw_v2_init_instance(handle); if (ret) return NULL; @@ -7144,19 +7147,24 @@ struct hns_roce_dev return handle->priv; }
-void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, - int func_idx) +int hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx) { struct hnae3_handle *handle = bond_grp->bond_func_info[func_idx].handle;
if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) - return; + return -EPERM; + + if (is_bond_slave_in_reset(bond_grp)) + return -EBUSY;
handle->rinfo.instance_state = HNS_ROCE_STATE_BOND_UNINIT;
__hns_roce_hw_v2_uninit_instance(handle, false, false);
handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT; + + return 0; } static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 074e1f290..dd64e0b95 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1586,8 +1586,8 @@ struct hns_roce_bond_info { struct hns_roce_dev *hns_roce_bond_init_client(struct hns_roce_bond_group *bond_grp, int func_idx); -void hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, - int func_idx); +int hns_roce_bond_uninit_client(struct hns_roce_bond_group *bond_grp, + int func_idx); int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int hns_roce_cmd_bond(struct hns_roce_bond_group *bond_grp, enum hns_roce_bond_cmd_type bond_type);