From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <dev-bounces@dpdk.org>
Received: from dpdk.org (dpdk.org [92.243.14.124])
	by inbox.dpdk.org (Postfix) with ESMTP id 0037EA0526;
	Mon, 20 Jan 2020 18:10:01 +0100 (CET)
Received: from [92.243.14.124] (localhost [127.0.0.1])
	by dpdk.org (Postfix) with ESMTP id BBF291C0DB;
	Mon, 20 Jan 2020 18:04:15 +0100 (CET)
Received: from mellanox.co.il (mail-il-dmz.mellanox.com [193.47.165.129])
 by dpdk.org (Postfix) with ESMTP id EE3031BFBE
 for <dev@dpdk.org>; Mon, 20 Jan 2020 18:03:14 +0100 (CET)
Received: from Internal Mail-Server by MTLPINE1 (envelope-from
 asafp@mellanox.com)
 with ESMTPS (AES256-SHA encrypted); 20 Jan 2020 19:03:14 +0200
Received: from pegasus07.mtr.labs.mlnx (pegasus07.mtr.labs.mlnx
 [10.210.16.112])
 by labmailer.mlnx (8.13.8/8.13.8) with ESMTP id 00KH3BGv024424;
 Mon, 20 Jan 2020 19:03:14 +0200
From: Matan Azrad <matan@mellanox.com>
To: dev@dpdk.org
Cc: Maxime Coquelin <maxime.coquelin@redhat.com>,
 Thomas Monjalon <thomas@monjalon.net>
Date: Mon, 20 Jan 2020 17:03:10 +0000
Message-Id: <1579539790-3882-39-git-send-email-matan@mellanox.com>
X-Mailer: git-send-email 1.8.3.1
In-Reply-To: <1579539790-3882-1-git-send-email-matan@mellanox.com>
References: <1579539790-3882-1-git-send-email-matan@mellanox.com>
Subject: [dpdk-dev] [PATCH v1 38/38] vdpa/mlx5: disable ROCE
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: DPDK patches and discussions <dev.dpdk.org>
List-Unsubscribe: <https://mails.dpdk.org/options/dev>,
 <mailto:dev-request@dpdk.org?subject=unsubscribe>
List-Archive: <http://mails.dpdk.org/archives/dev/>
List-Post: <mailto:dev@dpdk.org>
List-Help: <mailto:dev-request@dpdk.org?subject=help>
List-Subscribe: <https://mails.dpdk.org/listinfo/dev>,
 <mailto:dev-request@dpdk.org?subject=subscribe>
Errors-To: dev-bounces@dpdk.org
Sender: "dev" <dev-bounces@dpdk.org>

In order to support virtio queue creation by the FW, ROCE mode
should be disabled in the device.

Do it by netlink which is like the devlink tool commands:
	1. devlink dev param set pci/[pci] name enable_roce value false
	   cmode driverinit
    	2. devlink dev reload pci/[pci]
Or by sysfs which is like:
	echo 0 >  /sys/bus/pci/devices/[pci]/roce_enable

The IB device is matched again after ROCE disabling.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/vdpa/mlx5/Makefile    |   2 +-
 drivers/vdpa/mlx5/meson.build |   2 +-
 drivers/vdpa/mlx5/mlx5_vdpa.c | 190 +++++++++++++++++++++++++++++++++++-------
 3 files changed, 160 insertions(+), 34 deletions(-)

diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
index 62938b8..3dee270 100644
--- a/drivers/vdpa/mlx5/Makefile
+++ b/drivers/vdpa/mlx5/Makefile
@@ -29,7 +29,7 @@ CFLAGS += -D_XOPEN_SOURCE=600
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -Wno-strict-prototypes
 LDLIBS += -lrte_common_mlx5
-LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_bus_pci -lrte_sched
+LDLIBS += -lrte_eal -lrte_vhost -lrte_kvargs -lrte_pci -lrte_bus_pci -lrte_sched
 
 # A few warnings cannot be avoided in external headers.
 CFLAGS += -Wno-error=cast-qual
diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build
index 60cefd7..7e46e34 100644
--- a/drivers/vdpa/mlx5/meson.build
+++ b/drivers/vdpa/mlx5/meson.build
@@ -9,7 +9,7 @@ endif
 
 fmt_name = 'mlx5_vdpa'
 allow_experimental_apis = true
-deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
+deps += ['hash', 'common_mlx5', 'vhost', 'pci', 'bus_pci', 'eal', 'sched']
 sources = files(
 	'mlx5_vdpa.c',
 	'mlx5_vdpa_mem.c',
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.c b/drivers/vdpa/mlx5/mlx5_vdpa.c
index 2ceef18..5ea5739 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.c
@@ -1,15 +1,19 @@
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright 2019 Mellanox Technologies, Ltd
  */
+#include <unistd.h>
+
 #include <rte_malloc.h>
 #include <rte_log.h>
 #include <rte_errno.h>
 #include <rte_bus_pci.h>
+#include <rte_pci.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
 #include <mlx5_devx_cmds.h>
 #include <mlx5_prm.h>
+#include <mlx5_nl.h>
 
 #include "mlx5_vdpa_utils.h"
 #include "mlx5_vdpa.h"
@@ -228,6 +232,144 @@
 	.get_notify_area = NULL,
 };
 
+static struct ibv_device *
+mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
+{
+	int n;
+	struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
+	struct ibv_device *ibv_match = NULL;
+
+	if (!ibv_list) {
+		rte_errno = ENOSYS;
+		return NULL;
+	}
+	while (n-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
+		if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
+			continue;
+		if (memcmp(addr, &pci_addr, sizeof(pci_addr)))
+			continue;
+		ibv_match = ibv_list[n];
+		break;
+	}
+	if (!ibv_match)
+		rte_errno = ENOENT;
+	return ibv_match;
+}
+
+/* Try to disable ROCE by Netlink\Devlink. */
+static int
+mlx5_vdpa_nl_roce_disable(const char *addr)
+{
+	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
+	int devlink_id;
+	int enable;
+	int ret;
+
+	if (nlsk_fd < 0)
+		return nlsk_fd;
+	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
+	if (devlink_id < 0) {
+		ret = devlink_id;
+		DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
+			" Netlink.");
+		goto close;
+	}
+	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
+	if (ret) {
+		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
+			ret);
+		goto close;
+	} else if (!enable) {
+		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
+		goto close;
+	}
+	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
+	if (ret)
+		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
+	else
+		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
+close:
+	close(nlsk_fd);
+	return ret;
+}
+
+/* Try to disable ROCE by sysfs. */
+static int
+mlx5_vdpa_sys_roce_disable(const char *addr)
+{
+	FILE *file_o;
+	int enable;
+	int ret;
+
+	MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
+	file_o = fopen(file_p, "rb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	ret = fscanf(file_o, "%d", &enable);
+	if (ret != 1) {
+		rte_errno = EINVAL;
+		ret = EINVAL;
+		goto close;
+	} else if (!enable) {
+		ret = 0;
+		DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
+		goto close;
+	}
+	fclose(file_o);
+	file_o = fopen(file_p, "wb");
+	if (!file_o) {
+		rte_errno = ENOTSUP;
+		return -ENOTSUP;
+	}
+	fprintf(file_o, "0\n");
+	ret = 0;
+close:
+	if (ret)
+		DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
+	else
+		DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
+	fclose(file_o);
+	return ret;
+}
+
+#define MLX5_VDPA_MAX_RETRIES 20
+#define MLX5_VDPA_USEC 1000
+static int
+mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
+{
+	char addr_name[64] = {0};
+
+	rte_pci_device_name(addr, addr_name, sizeof(addr_name));
+	/* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
+	if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
+	    mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
+		/*
+		 * Succeed to disable ROCE, wait for the IB device to appear
+		 * again after reload.
+		 */
+		int r;
+		struct ibv_device *ibv_new;
+
+		for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
+			ibv_new = mlx5_vdpa_get_ib_device_match(addr);
+			if (ibv_new) {
+				*ibv = ibv_new;
+				return 0;
+			}
+			usleep(MLX5_VDPA_USEC);
+		}
+		DRV_LOG(ERR, "Cannot much device %s after ROCE disable, "
+			"retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
+		rte_errno = EAGAIN;
+	}
+	return -rte_errno;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -245,8 +387,7 @@
 mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		    struct rte_pci_device *pci_dev)
 {
-	struct ibv_device **ibv_list;
-	struct ibv_device *ibv_match = NULL;
+	struct ibv_device *ibv;
 	struct mlx5_vdpa_priv *priv = NULL;
 	struct ibv_context *ctx;
 	struct mlx5_hca_attr attr;
@@ -257,41 +398,26 @@
 			" devargs.");
 		return 1;
 	}
-	errno = 0;
-	ibv_list = mlx5_glue->get_device_list(&ret);
-	if (!ibv_list) {
-		rte_errno = errno;
-		DRV_LOG(ERR, "Failed to get device list, is ib_uverbs loaded?");
-		return -ENOSYS;
-	}
-	while (ret-- > 0) {
-		struct rte_pci_addr pci_addr;
-
-		DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[ret]->name);
-		if (mlx5_dev_to_pci_addr(ibv_list[ret]->ibdev_path, &pci_addr))
-			continue;
-		if (pci_dev->addr.domain != pci_addr.domain ||
-		    pci_dev->addr.bus != pci_addr.bus ||
-		    pci_dev->addr.devid != pci_addr.devid ||
-		    pci_dev->addr.function != pci_addr.function)
-			continue;
-		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
-			ibv_list[ret]->name);
-		ibv_match = ibv_list[ret];
-		break;
-	}
-	if (!ibv_match) {
+	ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
+	if (!ibv) {
 		DRV_LOG(ERR, "No matching IB device for PCI slot "
-			"%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 ".",
-			pci_dev->addr.domain, pci_dev->addr.bus,
-			pci_dev->addr.devid, pci_dev->addr.function);
+			PCI_PRI_FMT ".", pci_dev->addr.domain,
+			pci_dev->addr.bus, pci_dev->addr.devid,
+			pci_dev->addr.function);
 		rte_errno = ENOENT;
 		return -rte_errno;
+	} else {
+		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
+			ibv->name);
+	}
+	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
+		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
+			ibv->name);
+		//return -rte_errno;
 	}
-	ctx = mlx5_glue->dv_open_device(ibv_match);
+	ctx = mlx5_glue->dv_open_device(ibv);
 	if (!ctx) {
-		DRV_LOG(ERR, "Failed to open IB device \"%s\".",
-			ibv_match->name);
+		DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
 		rte_errno = ENODEV;
 		return -rte_errno;
 	}
-- 
1.8.3.1