From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from dpdk.org (dpdk.org [92.243.14.124]) by inbox.dpdk.org (Postfix) with ESMTP id 3D26CA04FA; Sun, 2 Feb 2020 17:06:34 +0100 (CET) Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id E4A9D1C18F; Sun, 2 Feb 2020 17:04:57 +0100 (CET) Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129]) by dpdk.org (Postfix) with ESMTP id D588E1C18F for ; Sun, 2 Feb 2020 17:04:56 +0100 (CET) Received: from Internal Mail-Server by MTLPINE2 (envelope-from asafp@mellanox.com) with ESMTPS (AES256-SHA encrypted); 2 Feb 2020 18:04:51 +0200 Received: from pegasus07.mtr.labs.mlnx (pegasus07.mtr.labs.mlnx [10.210.16.112]) by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 012G3ujB032300; Sun, 2 Feb 2020 18:04:51 +0200 From: Matan Azrad To: dev@dpdk.org, Viacheslav Ovsiienko Cc: Maxime Coquelin Date: Sun, 2 Feb 2020 16:03:53 +0000 Message-Id: <1580659433-25581-14-git-send-email-matan@mellanox.com> X-Mailer: git-send-email 1.8.3.1 In-Reply-To: <1580659433-25581-1-git-send-email-matan@mellanox.com> References: <1580292549-27439-1-git-send-email-matan@mellanox.com> <1580659433-25581-1-git-send-email-matan@mellanox.com> Subject: [dpdk-dev] [PATCH v3 13/13] vdpa/mlx5: disable ROCE X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" In order to support virtio queue creation by the FW, ROCE mode should be disabled in the device. Do it by netlink which is like the devlink tool commands: 1. devlink dev param set pci/[pci] name enable_roce value false cmode driverinit 2. devlink dev reload pci/[pci] Or by sysfs which is like: echo 0 > /sys/bus/pci/devices/[pci]/roce_enable The IB device is matched again after ROCE disabling. Signed-off-by: Matan Azrad Acked-by: Viacheslav Ovsiienko Acked-by: Maxime Coquelin --- drivers/vdpa/mlx5/Makefile | 2 +- drivers/vdpa/mlx5/meson.build | 2 +- drivers/vdpa/mlx5/mlx5_vdpa.c | 191 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 160 insertions(+), 35 deletions(-) diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile index d4a544c..7153217 100644 --- a/drivers/vdpa/mlx5/Makefile +++ b/drivers/vdpa/mlx5/Makefile @@ -29,7 +29,7 @@ CFLAGS += -D_XOPEN_SOURCE=600 CFLAGS += $(WERROR_FLAGS) CFLAGS += -Wno-strict-prototypes LDLIBS += -lrte_common_mlx5 -LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched +LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched # A few warnings cannot be avoided in external headers. CFLAGS += -Wno-error=cast-qual diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build index bb96dad..9c152e5 100644 --- a/drivers/vdpa/mlx5/meson.build +++ b/drivers/vdpa/mlx5/meson.build @@ -9,7 +9,7 @@ endif fmt_name = 'mlx5_vdpa' allow_experimental_apis = true -deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched'] +deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched'] sources = files( 'mlx5_vdpa.c', 'mlx5_vdpa_mem.c', diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c index 57619d2..710f305 100644 --- a/drivers/vdpa/mlx5/mlx5_vdpa.c +++ b/drivers/vdpa/mlx5/mlx5_vdpa.c @@ -1,15 +1,19 @@ /* SPDX-License-Identifier: BSD-3-Clause * Copyright 2019 Mellanox Technologies, Ltd */ +#include + #include #include #include #include +#include #include #include #include #include +#include #include "mlx5_vdpa_utils.h" #include "mlx5_vdpa.h" @@ -228,6 +232,145 @@ .get_notify_area = NULL, }; +static struct ibv_device * +mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr) +{ + int n; + struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n); + struct ibv_device *ibv_match = NULL; + + if (!ibv_list) { + rte_errno = ENOSYS; + return NULL; + } + while (n-- > 0) { + struct rte_pci_addr pci_addr; + + DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name); + if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr)) + continue; + if (memcmp(addr, &pci_addr, sizeof(pci_addr))) + continue; + ibv_match = ibv_list[n]; + break; + } + if (!ibv_match) + rte_errno = ENOENT; + mlx5_glue->free_device_list(ibv_list); + return ibv_match; +} + +/* Try to disable ROCE by Netlink\Devlink. */ +static int +mlx5_vdpa_nl_roce_disable(const char *addr) +{ + int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC); + int devlink_id; + int enable; + int ret; + + if (nlsk_fd < 0) + return nlsk_fd; + devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd); + if (devlink_id < 0) { + ret = devlink_id; + DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by" + " Netlink."); + goto close; + } + ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable); + if (ret) { + DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.", + ret); + goto close; + } else if (!enable) { + DRV_LOG(INFO, "ROCE has already disabled(Netlink)."); + goto close; + } + ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0); + if (ret) + DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret); + else + DRV_LOG(INFO, "ROCE is disabled by Netlink successfully."); +close: + close(nlsk_fd); + return ret; +} + +/* Try to disable ROCE by sysfs. */ +static int +mlx5_vdpa_sys_roce_disable(const char *addr) +{ + FILE *file_o; + int enable; + int ret; + + MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr); + file_o = fopen(file_p, "rb"); + if (!file_o) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + ret = fscanf(file_o, "%d", &enable); + if (ret != 1) { + rte_errno = EINVAL; + ret = EINVAL; + goto close; + } else if (!enable) { + ret = 0; + DRV_LOG(INFO, "ROCE has already disabled(sysfs)."); + goto close; + } + fclose(file_o); + file_o = fopen(file_p, "wb"); + if (!file_o) { + rte_errno = ENOTSUP; + return -ENOTSUP; + } + fprintf(file_o, "0\n"); + ret = 0; +close: + if (ret) + DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret); + else + DRV_LOG(INFO, "ROCE is disabled by sysfs successfully."); + fclose(file_o); + return ret; +} + +#define MLX5_VDPA_MAX_RETRIES 20 +#define MLX5_VDPA_USEC 1000 +static int +mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv) +{ + char addr_name[64] = {0}; + + rte_pci_device_name(addr, addr_name, sizeof(addr_name)); + /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */ + if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 || + mlx5_vdpa_sys_roce_disable(addr_name) == 0) { + /* + * Succeed to disable ROCE, wait for the IB device to appear + * again after reload. + */ + int r; + struct ibv_device *ibv_new; + + for (r = MLX5_VDPA_MAX_RETRIES; r; r--) { + ibv_new = mlx5_vdpa_get_ib_device_match(addr); + if (ibv_new) { + *ibv = ibv_new; + return 0; + } + usleep(MLX5_VDPA_USEC); + } + DRV_LOG(ERR, "Cannot much device %s after ROCE disable, " + "retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES); + rte_errno = EAGAIN; + } + return -rte_errno; +} + /** * DPDK callback to register a PCI device. * @@ -246,8 +389,7 @@ mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, struct rte_pci_device *pci_dev __rte_unused) { - struct ibv_device **ibv_list; - struct ibv_device *ibv_match = NULL; + struct ibv_device *ibv; struct mlx5_vdpa_priv *priv = NULL; struct ibv_context *ctx = NULL; struct mlx5_hca_attr attr; @@ -258,42 +400,25 @@ " driver."); return 1; } - errno = 0; - ibv_list = mlx5_glue->get_device_list(&ret); - if (!ibv_list) { - rte_errno = ENOSYS; - DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?"); + ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr); + if (!ibv) { + DRV_LOG(ERR, "No matching IB device for PCI slot " + PCI_PRI_FMT ".", pci_dev->addr.domain, + pci_dev->addr.bus, pci_dev->addr.devid, + pci_dev->addr.function); return -rte_errno; - } - while (ret-- > 0) { - struct rte_pci_addr pci_addr; - - DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name); - if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr)) - continue; - if (pci_dev->addr.domain != pci_addr.domain || - pci_dev->addr.bus != pci_addr.bus || - pci_dev->addr.devid != pci_addr.devid || - pci_dev->addr.function != pci_addr.function) - continue; + } else { DRV_LOG(INFO, "PCI information matches for device \"%s\".", - ibv_list[ret]->name); - ibv_match = ibv_list[ret]; - break; + ibv->name); } - mlx5_glue->free_device_list(ibv_list); - if (!ibv_match) { - DRV_LOG(ERR, "No matching IB device for PCI slot " - "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".", - pci_dev->addr.domain, pci_dev->addr.bus, - pci_dev->addr.devid, pci_dev->addr.function); - rte_errno = ENOENT; - return -rte_errno; + if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) { + DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".", + ibv->name); + //return -rte_errno; } - ctx = mlx5_glue->dv_open_device(ibv_match); + ctx = mlx5_glue->dv_open_device(ibv); if (!ctx) { - DRV_LOG(ERR, "Failed to open IB device \"%s\".", - ibv_match->name); + DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name); rte_errno = ENODEV; return -rte_errno; } -- 1.8.3.1