- * [Patch v3 01/17] net/mana: add basic driver, build environment and doc
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adapter (MANA) PMD longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 21:44   ` Stephen Hemminger
  2022-07-07 21:52   ` Stephen Hemminger
  2022-07-07 20:30 ` [Patch v3 02/17] net/mana: add device configuration and stop longli
                   ` (15 subsequent siblings)
  16 siblings, 2 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA is a PCI device. It uses IB verbs to access hardware through the kernel
RDMA layer. This patch introduces the build environment and basic device probe
functions.
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add a meson build check for required functions from the RDMA direct verb header file.
 MAINTAINERS                       |   6 +
 doc/guides/nics/features/mana.ini |  10 +
 doc/guides/nics/index.rst         |   1 +
 doc/guides/nics/mana.rst          |  66 +++
 drivers/net/mana/mana.c           | 704 ++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h           | 210 +++++++++
 drivers/net/mana/meson.build      |  44 ++
 drivers/net/mana/mp.c             | 235 ++++++++++
 drivers/net/mana/version.map      |   3 +
 drivers/net/meson.build           |   1 +
 10 files changed, 1280 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/version.map
diff --git a/MAINTAINERS b/MAINTAINERS
index 18d9edaf88..b8bda48a33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -837,6 +837,12 @@ F: buildtools/options-ibverbs-static.sh
 F: doc/guides/nics/mlx5.rst
 F: doc/guides/nics/features/mlx5.ini
 
+Microsoft mana
+M: Long Li <longli@microsoft.com>
+F: drivers/net/mana
+F: doc/guides/nics/mana.rst
+F: doc/guides/nics/features/mana.ini
+
 Microsoft vdev_netvsc - EXPERIMENTAL
 M: Matan Azrad <matan@nvidia.com>
 F: drivers/net/vdev_netvsc/
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
new file mode 100644
index 0000000000..b92a27374c
--- /dev/null
+++ b/doc/guides/nics/features/mana.ini
@@ -0,0 +1,10 @@
+;
+; Supported features of the 'mana' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Linux                = Y
+Multiprocess aware   = Y
+Usage doc            = Y
+x86-64               = Y
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 1c94caccea..2725d1d9f0 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -41,6 +41,7 @@ Network Interface Controller Drivers
     intel_vf
     kni
     liquidio
+    mana
     memif
     mlx4
     mlx5
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
new file mode 100644
index 0000000000..40e18fe810
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=============================
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
+Features
+--------
+
+Features of the MANA Ethdev PMD are:
+
+Prerequisites
+-------------
+
+This driver relies on external libraries and kernel drivers for resource
+allocation and initialization. The following dependencies are not part of
+DPDK and must be installed separately:
+
+- **libibverbs** (provided by rdma-core package)
+
+  User space verbs framework used by librte_net_mana. This library provides
+  a generic interface between the kernel and low-level user space drivers
+  such as libmana.
+
+  It allows slow and privileged operations (context initialization, hardware
+  resource allocation) to be managed by the kernel and fast operations to
+  never leave user space.
+
+- **libmana** (provided by rdma-core package)
+
+  Low-level user space driver library for Microsoft Azure Network Adapter
+  devices; it is automatically loaded by libibverbs.
+
+- **Kernel modules**
+
+  They provide the kernel-side verbs API and low-level device drivers that
+  manage actual hardware initialization and resource sharing with user
+  space processes.
+
+  Unlike most other PMDs, these modules must remain loaded and bound to
+  their devices:
+
+  - mana: Ethernet device driver that provides kernel network interfaces.
+  - mana_ib: InfiniBand device driver.
+  - ib_uverbs: user space driver for verbs (entry point for libibverbs).
+
+Driver compilation and testing
+------------------------------
+
+Refer to the document :ref:`compiling and testing a PMD for a NIC <pmd_build_and_test>`
+for details.
+
+MANA PMD arguments
+------------------
+
+The user can specify the following argument in devargs.
+
+#.  ``mac``:
+
+    Specify the MAC address for this device. If it is set, the driver
+    probes and loads only the NIC with a matching MAC address. If it is not
+    set, the driver probes all the NICs on the PCI device. The default
+    value is not set, meaning all the NICs will be probed and loaded.
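For illustration, the ``mac`` devarg would typically be passed through the EAL
device arguments; the PCI address and MAC value below are placeholders, and
the argument can be repeated (up to 8 times, per MAX_NUM_ADDRESS) to probe
multiple NICs on the same PCI device:

    dpdk-testpmd -a 0002:00:02.0,mac=00:0d:3a:11:22:33 -- -i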
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
new file mode 100644
index 0000000000..63ec1f75f0
--- /dev/null
+++ b/drivers/net/mana/mana.c
@@ -0,0 +1,704 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_kvargs.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include <assert.h>
+
+#include "mana.h"
+
+/* Shared memory between primary/secondary processes, per driver */
+struct mana_shared_data *mana_shared_data;
+const struct rte_memzone *mana_shared_mz;
+static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
+
+struct mana_shared_data mana_local_data;
+
+/* Spinlock for mana_shared_data */
+static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Allocate a buffer on the stack and fill it with a printf format string. */
+#define MKSTR(name, ...) \
+	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
+	char name[mkstr_size_##name + 1]; \
+	\
+	memset(name, 0, mkstr_size_##name + 1); \
+	snprintf(name, sizeof(name), "" __VA_ARGS__)
+
+int mana_logtype_driver;
+int mana_logtype_init;
+
+const struct eth_dev_ops mana_dev_ops = {
+};
+
+const struct eth_dev_ops mana_dev_sec_ops = {
+};
+
+uint16_t
+mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
+		      struct rte_mbuf **pkts __rte_unused,
+		      uint16_t pkts_n __rte_unused)
+{
+	rte_mb();
+	return 0;
+}
+
+uint16_t
+mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
+		      struct rte_mbuf **pkts __rte_unused,
+		      uint16_t pkts_n __rte_unused)
+{
+	rte_mb();
+	return 0;
+}
+
+static const char *mana_init_args[] = {
+	"mac",
+	NULL,
+};
+
+/* Support parsing up to 8 MAC addresses from the EAL command line */
+#define MAX_NUM_ADDRESS 8
+struct mana_conf {
+	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
+	unsigned int index;
+};
+
+static int mana_arg_parse_callback(const char *key, const char *val,
+				   void *private)
+{
+	struct mana_conf *conf = (struct mana_conf *)private;
+	int ret;
+
+	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
+
+	if (conf->index >= MAX_NUM_ADDRESS) {
+		DRV_LOG(ERR, "Exceeding max MAC address");
+		return 1;
+	}
+
+	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
+	if (ret) {
+		DRV_LOG(ERR, "Invalid MAC address %s", val);
+		return ret;
+	}
+
+	conf->index++;
+
+	return 0;
+}
+
+static int mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
+{
+	struct rte_kvargs *kvlist;
+	unsigned int arg_count;
+	int ret = 0;
+
+	kvlist = rte_kvargs_parse(devargs->args, mana_init_args);
+	if (!kvlist) {
+		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->args);
+		return -EINVAL;
+	}
+
+	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
+	if (arg_count > MAX_NUM_ADDRESS) {
+		ret = -EINVAL;
+		goto free_kvlist;
+	}
+	ret = rte_kvargs_process(kvlist, mana_init_args[0],
+				 mana_arg_parse_callback, conf);
+	if (ret) {
+		DRV_LOG(ERR, "error parsing args");
+		goto free_kvlist;
+	}
+
+free_kvlist:
+	rte_kvargs_free(kvlist);
+	return ret;
+}
+
+static int get_port_mac(struct ibv_device *device, unsigned int port,
+			struct rte_ether_addr *addr)
+{
+	FILE *file;
+	int ret = 0;
+	DIR *dir;
+	struct dirent *dent;
+	unsigned int dev_port;
+	char mac[20];
+
+	MKSTR(path, "%s/device/net", device->ibdev_path);
+
+	dir = opendir(path);
+	if (!dir)
+		return -ENOENT;
+
+	while ((dent = readdir(dir))) {
+		char *name = dent->d_name;
+
+		MKSTR(filepath, "%s/%s/dev_port", path, name);
+
+		/* Ignore . and .. */
+		if ((name[0] == '.') &&
+		    ((name[1] == '\0') ||
+		     ((name[1] == '.') && (name[2] == '\0'))))
+			continue;
+
+		file = fopen(filepath, "rb");
+		if (!file)
+			continue;
+
+		ret = fscanf(file, "%u", &dev_port);
+		fclose(file);
+
+		if (ret != 1)
+			continue;
+
+		/* Ethernet ports start at 0, IB ports start at 1 */
+		if (dev_port == port - 1) {
+			MKSTR(filepath, "%s/%s/address", path, name);
+
+			file = fopen(filepath, "rb");
+			if (!file)
+				continue;
+
+			ret = fscanf(file, "%s", mac);
+			fclose(file);
+
+			if (ret < 0)
+				break;
+
+			ret = rte_ether_unformat_addr(mac, addr);
+			if (ret)
+				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
+			break;
+		}
+	}
+
+	closedir(dir);
+	return ret;
+}
+
+static int mana_ibv_device_to_pci_addr(const struct ibv_device *device,
+				       struct rte_pci_addr *pci_addr)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t len = 0;
+
+	MKSTR(path, "%s/device/uevent", device->ibdev_path);
+
+	file = fopen(path, "rb");
+	if (!file)
+		return -errno;
+
+	while (getline(&line, &len, file) != -1) {
+		/* Extract information. */
+		if (sscanf(line,
+			   "PCI_SLOT_NAME="
+			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
+			   &pci_addr->domain,
+			   &pci_addr->bus,
+			   &pci_addr->devid,
+			   &pci_addr->function) == 4) {
+			break;
+		}
+	}
+
+	free(line);
+	fclose(file);
+	return 0;
+}
+
+static int mana_proc_priv_init(struct rte_eth_dev *dev)
+{
+	struct mana_process_priv *priv;
+
+	priv = rte_zmalloc_socket("mana_proc_priv",
+				  sizeof(struct mana_process_priv),
+				  RTE_CACHE_LINE_SIZE,
+				  dev->device->numa_node);
+	if (!priv)
+		return -ENOMEM;
+
+	dev->process_private = priv;
+	return 0;
+}
+
+static int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
+{
+	struct mana_process_priv *priv = eth_dev->process_private;
+
+	void *addr;
+
+	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
+	if (addr == MAP_FAILED) {
+		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
+			eth_dev->data->port_id);
+		return -ENOMEM;
+	}
+
+	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
+
+	priv->db_page = addr;
+
+	return 0;
+}
+
+/* Initialize shared data for the driver (all devices) */
+static int mana_init_shared_data(void)
+{
+	int ret =  0;
+	const struct rte_memzone *secondary_mz;
+
+	rte_spinlock_lock(&mana_shared_data_lock);
+
+	/* Skip if shared data is already initialized */
+	if (mana_shared_data)
+		goto exit;
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
+						     sizeof(*mana_shared_data),
+						     SOCKET_ID_ANY, 0);
+		if (!mana_shared_mz) {
+			DRV_LOG(ERR, "Cannot allocate mana shared data");
+			ret = -rte_errno;
+			goto exit;
+		}
+
+		mana_shared_data = mana_shared_mz->addr;
+		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
+		rte_spinlock_init(&mana_shared_data->lock);
+	} else {
+		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
+		if (!secondary_mz) {
+			DRV_LOG(ERR, "Cannot attach mana shared data");
+			ret = -rte_errno;
+			goto exit;
+		}
+
+		mana_shared_data = secondary_mz->addr;
+		memset(&mana_local_data, 0, sizeof(mana_local_data));
+	}
+
+exit:
+	rte_spinlock_unlock(&mana_shared_data_lock);
+
+	return ret;
+}
+
+static int mana_init_once(void)
+{
+	int ret;
+
+	ret = mana_init_shared_data();
+	if (ret)
+		return ret;
+
+	rte_spinlock_lock(&mana_shared_data->lock);
+
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		if (mana_shared_data->init_done)
+			break;
+
+		ret = mana_mp_init_primary();
+		if (ret)
+			break;
+		DRV_LOG(ERR, "MP INIT PRIMARY");
+
+		mana_shared_data->init_done = 1;
+		break;
+
+	case RTE_PROC_SECONDARY:
+
+		if (mana_local_data.init_done)
+			break;
+
+		ret = mana_mp_init_secondary();
+		if (ret)
+			break;
+
+		DRV_LOG(ERR, "MP INIT SECONDARY");
+
+		mana_local_data.init_done = 1;
+		break;
+
+	default:
+		/* Impossible, internal error */
+		ret = -EPROTO;
+		break;
+	}
+
+	rte_spinlock_unlock(&mana_shared_data->lock);
+
+	return ret;
+}
+
+static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
+			      struct rte_pci_device *pci_dev,
+			      struct rte_ether_addr *mac_addr)
+{
+	struct ibv_device **ibv_list;
+	int ibv_idx;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex dev_attr;
+	int num_devices;
+	int ret = 0;
+	uint8_t port;
+	struct mana_priv *priv = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	bool found_port;
+
+	ibv_list = ibv_get_device_list(&num_devices);
+	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
+		struct ibv_device *ibdev = ibv_list[ibv_idx];
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
+			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
+
+		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
+			continue;
+
+		/* Ignore if this IB device is not this PCI device */
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+
+		ctx = ibv_open_device(ibdev);
+		if (!ctx) {
+			DRV_LOG(ERR, "Failed to open IB device %s",
+				ibdev->name);
+			continue;
+		}
+
+		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
+		DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
+			dev_attr.orig_attr.phys_port_cnt);
+		found_port = false;
+
+		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
+		     port++) {
+			struct ibv_parent_domain_init_attr attr = {};
+			struct rte_ether_addr addr;
+			char address[64];
+			char name[RTE_ETH_NAME_MAX_LEN];
+
+			ret = get_port_mac(ibdev, port, &addr);
+			if (ret)
+				continue;
+
+			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
+				continue;
+
+			rte_ether_format_addr(address, sizeof(address), &addr);
+			DRV_LOG(INFO, "device located port %u address %s",
+				port, address);
+			found_port = true;
+
+			priv = rte_zmalloc_socket(NULL, sizeof(*priv),
+						  RTE_CACHE_LINE_SIZE,
+						  SOCKET_ID_ANY);
+			if (!priv) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			snprintf(name, sizeof(name), "%s_port%d",
+				 pci_dev->device.name, port);
+
+			if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+				int fd;
+
+				eth_dev = rte_eth_dev_attach_secondary(name);
+				if (!eth_dev) {
+					DRV_LOG(ERR, "Can't attach to dev %s",
+						name);
+					ret = -ENOMEM;
+					goto failed;
+				}
+
+				eth_dev->device = &pci_dev->device;
+				eth_dev->dev_ops = &mana_dev_sec_ops;
+				ret = mana_proc_priv_init(eth_dev);
+				if (ret)
+					goto failed;
+				priv->process_priv = eth_dev->process_private;
+
+				/* Get the IB FD from the primary process */
+				fd = mana_mp_req_verbs_cmd_fd(eth_dev);
+				if (fd < 0) {
+					DRV_LOG(ERR, "Failed to get FD %d", fd);
+					ret = -ENODEV;
+					goto failed;
+				}
+
+				ret = mana_map_doorbell_secondary(eth_dev, fd);
+				if (ret) {
+					DRV_LOG(ERR, "Failed secondary map %d",
+						fd);
+					goto failed;
+				}
+
+				/* fd is not used after mapping the doorbell */
+				close(fd);
+
+				rte_spinlock_lock(&mana_shared_data->lock);
+				mana_shared_data->secondary_cnt++;
+				mana_local_data.secondary_cnt++;
+				rte_spinlock_unlock(&mana_shared_data->lock);
+
+				rte_eth_copy_pci_info(eth_dev, pci_dev);
+				rte_eth_dev_probing_finish(eth_dev);
+
+				/* Impossible to have more than one port
+				 * matching a MAC address
+				 */
+				continue;
+			}
+
+			eth_dev = rte_eth_dev_allocate(name);
+			if (!eth_dev) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			eth_dev->data->mac_addrs =
+				rte_calloc("mana_mac", 1,
+					   sizeof(struct rte_ether_addr), 0);
+			if (!eth_dev->data->mac_addrs) {
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
+
+			priv->ib_pd = ibv_alloc_pd(ctx);
+			if (!priv->ib_pd) {
+				DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			/* Create a parent domain with the port number */
+			attr.pd = priv->ib_pd;
+			attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
+			attr.pd_context = (void *)(uint64_t)port;
+			priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
+			if (!priv->ib_parent_pd) {
+				DRV_LOG(ERR,
+					"ibv_alloc_parent_domain failed port %d",
+					port);
+				ret = -ENOMEM;
+				goto failed;
+			}
+
+			priv->ib_ctx = ctx;
+			priv->port_id = eth_dev->data->port_id;
+			priv->dev_port = port;
+			eth_dev->data->dev_private = priv;
+			priv->dev_data = eth_dev->data;
+
+			priv->max_rx_queues = dev_attr.orig_attr.max_qp;
+			priv->max_tx_queues = dev_attr.orig_attr.max_qp;
+
+			priv->max_rx_desc =
+				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+					dev_attr.orig_attr.max_cqe);
+			priv->max_tx_desc =
+				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
+					dev_attr.orig_attr.max_cqe);
+
+			priv->max_send_sge = dev_attr.orig_attr.max_sge;
+			priv->max_recv_sge = dev_attr.orig_attr.max_sge;
+
+			priv->max_mr = dev_attr.orig_attr.max_mr;
+			priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
+
+			DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d\n",
+				name, priv->max_rx_queues, priv->max_rx_desc,
+				priv->max_send_sge);
+
+			rte_spinlock_lock(&mana_shared_data->lock);
+			mana_shared_data->primary_cnt++;
+			rte_spinlock_unlock(&mana_shared_data->lock);
+
+			eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
+
+			eth_dev->device = &pci_dev->device;
+			eth_dev->data->dev_flags |=
+				RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+
+			DRV_LOG(INFO, "device %s at port %u",
+				name, eth_dev->data->port_id);
+
+			eth_dev->rx_pkt_burst = mana_rx_burst_removed;
+			eth_dev->tx_pkt_burst = mana_tx_burst_removed;
+			eth_dev->dev_ops = &mana_dev_ops;
+
+			rte_eth_copy_pci_info(eth_dev, pci_dev);
+			rte_eth_dev_probing_finish(eth_dev);
+		}
+
+		/* Secondary process doesn't need an ibv_ctx. It maps the
+		 * doorbell pages using the IB cmd_fd passed from the primary
+		 * process and sends messages to the primary process for memory
+		 * registrations.
+		 */
+		if (!found_port || rte_eal_process_type() == RTE_PROC_SECONDARY)
+			ibv_close_device(ctx);
+	}
+
+	ibv_free_device_list(ibv_list);
+	return 0;
+
+failed:
+	/* Free the resources for the failed port */
+	if (priv) {
+		if (priv->ib_parent_pd)
+			ibv_dealloc_pd(priv->ib_parent_pd);
+
+		if (priv->ib_pd)
+			ibv_dealloc_pd(priv->ib_pd);
+	}
+
+	if (eth_dev)
+		rte_eth_dev_release_port(eth_dev);
+
+	rte_free(priv);
+
+	ibv_close_device(ctx);
+	ibv_free_device_list(ibv_list);
+
+	return ret;
+}
+
+static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+			  struct rte_pci_device *pci_dev)
+{
+	struct rte_devargs *args = pci_dev->device.devargs;
+	struct mana_conf conf = {};
+	unsigned int i;
+	int ret;
+
+	if (args && args->args) {
+		ret = mana_parse_args(args, &conf);
+		if (ret) {
+			DRV_LOG(ERR, "failed to parse parameters args = %s",
+				args->args);
+			return ret;
+		}
+	}
+
+	ret = mana_init_once();
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
+		return ret;
+	}
+
+	/* If there are no driver parameters, probe on all ports */
+	if (!conf.index)
+		return mana_pci_probe_mac(pci_drv, pci_dev, NULL);
+
+	for (i = 0; i < conf.index; i++) {
+		ret = mana_pci_probe_mac(pci_drv, pci_dev, &conf.mac_array[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mana_dev_uninit(struct rte_eth_dev *dev)
+{
+	RTE_SET_USED(dev);
+	return 0;
+}
+
+static int mana_pci_remove(struct rte_pci_device *pci_dev)
+{
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		rte_spinlock_lock(&mana_shared_data_lock);
+
+		rte_spinlock_lock(&mana_shared_data->lock);
+
+		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
+		mana_shared_data->primary_cnt--;
+		if (!mana_shared_data->primary_cnt) {
+			DRV_LOG(DEBUG, "mp uninit primary");
+			mana_mp_uninit_primary();
+		}
+
+		rte_spinlock_unlock(&mana_shared_data->lock);
+
+		/* Also free the shared memory if this is the last */
+		if (!mana_shared_data->primary_cnt) {
+			DRV_LOG(DEBUG, "free shared memzone data");
+			rte_memzone_free(mana_shared_mz);
+		}
+
+		rte_spinlock_unlock(&mana_shared_data_lock);
+	} else {
+		rte_spinlock_lock(&mana_shared_data_lock);
+
+		rte_spinlock_lock(&mana_shared_data->lock);
+		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
+		mana_shared_data->secondary_cnt--;
+		rte_spinlock_unlock(&mana_shared_data->lock);
+
+		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
+		mana_local_data.secondary_cnt--;
+		if (!mana_local_data.secondary_cnt) {
+			DRV_LOG(DEBUG, "mp uninit secondary");
+			mana_mp_uninit_secondary();
+		}
+
+		rte_spinlock_unlock(&mana_shared_data_lock);
+	}
+
+	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
+}
+
+static const struct rte_pci_id mana_pci_id_map[] = {
+	{
+		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
+			       PCI_DEVICE_ID_MICROSOFT_MANA)
+	},
+};
+
+static struct rte_pci_driver mana_pci_driver = {
+	.driver = {
+		.name = "mana_pci",
+	},
+	.id_table = mana_pci_id_map,
+	.probe = mana_pci_probe,
+	.remove = mana_pci_remove,
+	.drv_flags = RTE_PCI_DRV_INTR_RMV,
+};
+
+RTE_INIT(rte_mana_pmd_init)
+{
+	rte_pci_register(&mana_pci_driver);
+}
+
+RTE_PMD_EXPORT_NAME(net_mana, __COUNTER__);
+RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
+RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
+RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
new file mode 100644
index 0000000000..e30c030b4e
--- /dev/null
+++ b/drivers/net/mana/mana.h
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#ifndef __MANA_H__
+#define __MANA_H__
+
+enum {
+	PCI_VENDOR_ID_MICROSOFT = 0x1414,
+};
+
+enum {
+	PCI_DEVICE_ID_MICROSOFT_MANA = 0x00ba,
+};
+
+/* Shared data between primary/secondary processes */
+struct mana_shared_data {
+	rte_spinlock_t lock;
+	int init_done;
+	unsigned int primary_cnt;
+	unsigned int secondary_cnt;
+};
+
+#define MIN_RX_BUF_SIZE	1024
+#define MAX_FRAME_SIZE	RTE_ETHER_MAX_LEN
+#define BNIC_MAX_MAC_ADDR 1
+
+#define BNIC_DEV_RX_OFFLOAD_SUPPORT ( \
+		DEV_RX_OFFLOAD_CHECKSUM | \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
+#define BNIC_DEV_TX_OFFLOAD_SUPPORT ( \
+		RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
+		RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_TCP_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_UDP_CKSUM | \
+		RTE_ETH_TX_OFFLOAD_TCP_TSO)
+
+#define INDIRECTION_TABLE_NUM_ELEMENTS 64
+#define TOEPLITZ_HASH_KEY_SIZE_IN_BYTES 40
+#define BNIC_ETH_RSS_SUPPORT ( \
+	ETH_RSS_IPV4 |	     \
+	ETH_RSS_NONFRAG_IPV4_TCP | \
+	ETH_RSS_NONFRAG_IPV4_UDP | \
+	ETH_RSS_IPV6 |	     \
+	ETH_RSS_NONFRAG_IPV6_TCP | \
+	ETH_RSS_NONFRAG_IPV6_UDP)
+
+#define MIN_BUFFERS_PER_QUEUE		64
+#define MAX_RECEIVE_BUFFERS_PER_QUEUE	256
+#define MAX_SEND_BUFFERS_PER_QUEUE	256
+
+struct mana_process_priv {
+	void *db_page;
+};
+
+struct mana_priv {
+	struct rte_eth_dev_data *dev_data;
+	struct mana_process_priv *process_priv;
+	int num_queues;
+
+	/* DPDK port */
+	uint16_t port_id;
+
+	/* IB device port */
+	uint8_t dev_port;
+
+	struct ibv_context *ib_ctx;
+	struct ibv_pd *ib_pd;
+	struct ibv_pd *ib_parent_pd;
+	struct ibv_rwq_ind_table *ind_table;
+	uint8_t ind_table_key[40];
+	struct ibv_qp *rwq_qp;
+	void *db_page;
+	int max_rx_queues;
+	int max_tx_queues;
+	int max_rx_desc;
+	int max_tx_desc;
+	int max_send_sge;
+	int max_recv_sge;
+	int max_mr;
+	uint64_t max_mr_size;
+};
+
+struct mana_txq_desc {
+	struct rte_mbuf *pkt;
+	uint32_t wqe_size_in_bu;
+};
+
+struct mana_rxq_desc {
+	struct rte_mbuf *pkt;
+	uint32_t wqe_size_in_bu;
+};
+
+struct mana_gdma_queue {
+	void *buffer;
+	uint32_t count;	/* in entries */
+	uint32_t size;	/* in bytes */
+	uint32_t id;
+	uint32_t head;
+	uint32_t tail;
+};
+
+struct mana_stats {
+	uint64_t packets;
+	uint64_t bytes;
+	uint64_t errors;
+	uint64_t nombuf;
+};
+
+#define MANA_MR_BTREE_PER_QUEUE_N	64
+struct mana_txq {
+	struct mana_priv *priv;
+	uint32_t num_desc;
+	struct ibv_cq *cq;
+	struct ibv_qp *qp;
+
+	struct mana_gdma_queue gdma_sq;
+	struct mana_gdma_queue gdma_cq;
+
+	uint32_t tx_vp_offset;
+
+	/* For storing pending requests */
+	struct mana_txq_desc *desc_ring;
+
+	/* desc_ring_head is where pending requests are posted to the ring;
+	 * completions are pulled off at desc_ring_tail
+	 */
+	uint32_t desc_ring_head, desc_ring_tail;
+
+	struct mana_stats stats;
+	unsigned int socket;
+};
+
+struct mana_rxq {
+	struct mana_priv *priv;
+	uint32_t num_desc;
+	struct rte_mempool *mp;
+	struct ibv_cq *cq;
+	struct ibv_wq *wq;
+
+	/* For storing pending requests */
+	struct mana_rxq_desc *desc_ring;
+
+	/* desc_ring_head is where pending requests are posted to the ring;
+	 * completions are pulled off at desc_ring_tail
+	 */
+	uint32_t desc_ring_head, desc_ring_tail;
+
+	struct mana_gdma_queue gdma_rq;
+	struct mana_gdma_queue gdma_cq;
+
+	struct mana_stats stats;
+
+	unsigned int socket;
+};
+
+extern int mana_logtype_driver;
+extern int mana_logtype_init;
+
+#define DRV_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, mana_logtype_driver, "%s(): " fmt "\n", \
+		__func__, ## args)
+
+#define PMD_INIT_LOG(level, fmt, args...) \
+	rte_log(RTE_LOG_ ## level, mana_logtype_init, "%s(): " fmt "\n",\
+		__func__, ## args)
+
+#define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
+
+const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
+
+uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
+			       uint16_t pkts_n);
+
+/** Request timeout for IPC. */
+#define MANA_MP_REQ_TIMEOUT_SEC 5
+
+/* Request types for IPC. */
+enum mana_mp_req_type {
+	MANA_MP_REQ_VERBS_CMD_FD = 1,
+	MANA_MP_REQ_CREATE_MR,
+	MANA_MP_REQ_START_RXTX,
+	MANA_MP_REQ_STOP_RXTX,
+};
+
+/* Parameters for IPC. */
+struct mana_mp_param {
+	enum mana_mp_req_type type;
+	int port_id;
+	int result;
+
+	/* MANA_MP_REQ_CREATE_MR */
+	uintptr_t addr;
+	uint32_t len;
+};
+
+#define MANA_MP_NAME	"net_mana_mp"
+int mana_mp_init_primary(void);
+int mana_mp_init_secondary(void);
+void mana_mp_uninit_primary(void);
+void mana_mp_uninit_secondary(void);
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
+
+#endif
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
new file mode 100644
index 0000000000..81c4118f53
--- /dev/null
+++ b/drivers/net/mana/meson.build
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2022 Microsoft Corporation
+
+if not is_linux or not dpdk_conf.has('RTE_ARCH_X86_64')
+    build = false
+    reason = 'mana is supported on Linux X86_64'
+    subdir_done()
+endif
+
+deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
+
+sources += files(
+	'mana.c',
+	'mp.c',
+)
+
+libnames = ['ibverbs', 'mana' ]
+foreach libname:libnames
+    lib = cc.find_library(libname, required:false)
+    if lib.found()
+        ext_deps += lib
+    else
+        build = false
+        reason = 'missing dependency, "' + libname + '"'
+        subdir_done()
+    endif
+endforeach
+
+required_symbols = [
+    ['infiniband/manadv.h', 'manadv_set_context_attr'],
+    ['infiniband/manadv.h', 'manadv_init_obj'],
+    ['infiniband/manadv.h', 'MANADV_CTX_ATTR_BUF_ALLOCATORS'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_QP'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_CQ'],
+    ['infiniband/manadv.h', 'MANADV_OBJ_RWQ'],
+]
+
+foreach arg:required_symbols
+    if not cc.has_header_symbol(arg[0], arg[1])
+        build = false
+        reason = 'missing symbol "' + arg[1] + '" in "' + arg[0] + '"'
+        subdir_done()
+    endif
+endforeach
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
new file mode 100644
index 0000000000..d7580e8a28
--- /dev/null
+++ b/drivers/net/mana/mp.c
@@ -0,0 +1,235 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_log.h>
+
+#include <infiniband/verbs.h>
+
+#include "mana.h"
+
+extern struct mana_shared_data *mana_shared_data;
+
+static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
+			int port_id)
+{
+	struct mana_mp_param *param;
+
+	strlcpy(msg->name, MANA_MP_NAME, sizeof(msg->name));
+	msg->len_param = sizeof(*param);
+
+	param = (struct mana_mp_param *)msg->param;
+	param->type = type;
+	param->port_id = port_id;
+}
+
+static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
+				  const void *peer)
+{
+	struct rte_eth_dev *dev;
+	const struct mana_mp_param *param =
+		(const struct mana_mp_param *)mp_msg->param;
+	struct rte_mp_msg mp_res = { 0 };
+	struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param;
+	int ret;
+	struct mana_priv *priv;
+
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[param->port_id];
+	priv = dev->data->dev_private;
+
+	mp_init_msg(&mp_res, param->type, param->port_id);
+
+	switch (param->type) {
+	case MANA_MP_REQ_VERBS_CMD_FD:
+		mp_res.num_fds = 1;
+		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	default:
+		DRV_LOG(ERR, "Port %u unknown primary MP type %u",
+			param->port_id, param->type);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
+				    const void *peer)
+{
+	struct rte_mp_msg mp_res = { 0 };
+	struct mana_mp_param *res = (struct mana_mp_param *)mp_res.param;
+	const struct mana_mp_param *param =
+		(const struct mana_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	int ret;
+
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
+		return -ENODEV;
+	}
+
+	dev = &rte_eth_devices[param->port_id];
+
+	mp_init_msg(&mp_res, param->type, param->port_id);
+
+	switch (param->type) {
+	case MANA_MP_REQ_START_RXTX:
+		DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
+
+		rte_mb();
+
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	case MANA_MP_REQ_STOP_RXTX:
+		DRV_LOG(INFO, "Port %u stopping datapath", dev->data->port_id);
+
+		dev->tx_pkt_burst = mana_tx_burst_removed;
+		dev->rx_pkt_burst = mana_rx_burst_removed;
+
+		rte_mb();
+
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	default:
+		DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
+			param->port_id, param->type);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+int mana_mp_init_primary(void)
+{
+	int ret;
+
+	ret = rte_mp_action_register(MANA_MP_NAME, mana_mp_primary_handle);
+	if (ret && rte_errno != ENOTSUP) {
+		DRV_LOG(ERR, "Failed to register primary handler %d %d",
+			ret, rte_errno);
+		return -1;
+	}
+
+	return 0;
+}
+
+void mana_mp_uninit_primary(void)
+{
+	rte_mp_action_unregister(MANA_MP_NAME);
+}
+
+int mana_mp_init_secondary(void)
+{
+	return rte_mp_action_register(MANA_MP_NAME, mana_mp_secondary_handle);
+}
+
+void mana_mp_uninit_secondary(void)
+{
+	rte_mp_action_unregister(MANA_MP_NAME);
+}
+
+int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+
+	mp_init_msg(&mp_req, MANA_MP_REQ_VERBS_CMD_FD, dev->data->port_id);
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "port %u request to primary process failed",
+			dev->data->port_id);
+		return ret;
+	}
+
+	if (mp_rep.nb_received != 1) {
+		DRV_LOG(ERR, "primary replied %u messages", mp_rep.nb_received);
+		ret = -EPROTO;
+		goto exit;
+	}
+
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mana_mp_param *)mp_res->param;
+	if (res->result) {
+		DRV_LOG(ERR, "failed to get CMD FD, port %u",
+			dev->data->port_id);
+		ret = res->result;
+		goto exit;
+	}
+
+	if (mp_res->num_fds != 1) {
+		DRV_LOG(ERR, "got FDs %d unexpected", mp_res->num_fds);
+		ret = -EPROTO;
+		goto exit;
+	}
+
+	ret = mp_res->fds[0];
+	DRV_LOG(ERR, "port %u command FD from primary is %d",
+		dev->data->port_id, ret);
+exit:
+	free(mp_rep.msgs);
+	return ret;
+}
+
+void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int i, ret;
+
+	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) {
+		DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+			dev->data->port_id, type);
+		return;
+	}
+
+	if (!mana_shared_data->secondary_cnt)
+		return;
+
+	mp_init_msg(&mp_req, type, dev->data->port_id);
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		if (rte_errno != ENOTSUP)
+			DRV_LOG(ERR, "port %u failed to request Rx/Tx (%d)",
+				dev->data->port_id, type);
+		goto exit;
+	}
+	if (mp_rep.nb_sent != mp_rep.nb_received) {
+		DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
+			dev->data->port_id, type);
+		goto exit;
+	}
+	for (i = 0; i < mp_rep.nb_received; i++) {
+		mp_res = &mp_rep.msgs[i];
+		res = (struct mana_mp_param *)mp_res->param;
+		if (res->result) {
+			DRV_LOG(ERR, "port %u request failed on secondary %d",
+				dev->data->port_id, i);
+			goto exit;
+		}
+	}
+exit:
+	free(mp_rep.msgs);
+}
diff --git a/drivers/net/mana/version.map b/drivers/net/mana/version.map
new file mode 100644
index 0000000000..c2e0723b4c
--- /dev/null
+++ b/drivers/net/mana/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
+	local: *;
+};
diff --git a/drivers/net/meson.build b/drivers/net/meson.build
index 2355d1cde8..0b111a6ebb 100644
--- a/drivers/net/meson.build
+++ b/drivers/net/meson.build
@@ -34,6 +34,7 @@ drivers = [
         'ixgbe',
         'kni',
         'liquidio',
+        'mana',
         'memif',
         'mlx4',
         'mlx5',
-- 
2.17.1
- * Re: [Patch v3 01/17] net/mana: add basic driver, build environment and doc
  2022-07-07 20:30 ` [Patch v3 01/17] net/mana: add basic driver, build environment and doc longli
@ 2022-07-07 21:44   ` Stephen Hemminger
  2022-07-07 22:12     ` Long Li
  2022-07-07 21:52   ` Stephen Hemminger
  1 sibling, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2022-07-07 21:44 UTC (permalink / raw)
  To: longli; +Cc: longli, Ferruh Yigit, dev, Ajay Sharma, Stephen Hemminger
On Thu,  7 Jul 2022 13:30:06 -0700
longli@linuxonhyperv.com wrote:
> +	file = fopen(path, "rb");
Minor nit, if you make any later changes.
"rb" is same as "r" on Linux.  b means binary, and this is
actually a text file.
- * RE: [Patch v3 01/17] net/mana: add basic driver, build environment and doc
  2022-07-07 21:44   ` Stephen Hemminger
@ 2022-07-07 22:12     ` Long Li
  0 siblings, 0 replies; 23+ messages in thread
From: Long Li @ 2022-07-07 22:12 UTC (permalink / raw)
  To: Stephen Hemminger, longli
  Cc: Ferruh Yigit, dev, Ajay Sharma, Stephen Hemminger
> Subject: Re: [Patch v3 01/17] net/mana: add basic driver, build environment and
> doc
> 
> On Thu,  7 Jul 2022 13:30:06 -0700
> longli@linuxonhyperv.com wrote:
> 
> > +	file = fopen(path, "rb");
> 
> Minor nit, if you make any later changes.
> "rb" is same as "r" on Linux.  b means binary, and this is actually a text file.
Will fix this.
 
- * Re: [Patch v3 01/17] net/mana: add basic driver, build environment and doc
  2022-07-07 20:30 ` [Patch v3 01/17] net/mana: add basic driver, build environment and doc longli
  2022-07-07 21:44   ` Stephen Hemminger
@ 2022-07-07 21:52   ` Stephen Hemminger
  1 sibling, 0 replies; 23+ messages in thread
From: Stephen Hemminger @ 2022-07-07 21:52 UTC (permalink / raw)
  To: longli; +Cc: longli, Ferruh Yigit, dev, Ajay Sharma, Stephen Hemminger
On Thu,  7 Jul 2022 13:30:06 -0700
longli@linuxonhyperv.com wrote:
> +static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
> +			      struct rte_pci_device *pci_dev,
> +			      struct rte_ether_addr *mac_addr)
> +{
> +	struct ibv_device **ibv_list;
> +	int ibv_idx;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex dev_attr;
> +	int num_devices;
> +	int ret = 0;
> +	uint8_t port;
> +	struct mana_priv *priv = NULL;
> +	struct rte_eth_dev *eth_dev = NULL;
> +	bool found_port;
> +
> +	ibv_list = ibv_get_device_list(&num_devices);
> +	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
> +		struct ibv_device *ibdev = ibv_list[ibv_idx];
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
> +			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
> +
> +		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
> +			continue;
> +
> +		/* Ignore if this IB device is not this PCI device */
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +
> +		ctx = ibv_open_device(ibdev);
> +		if (!ctx) {
> +			DRV_LOG(ERR, "Failed to open IB device %s",
> +				ibdev->name);
> +			continue;
> +		}
> +
> +		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
> +		DRV_LOG(INFO, "dev_attr.orig_attr.phys_port_cnt %u",
> +			dev_attr.orig_attr.phys_port_cnt);
> +		found_port = false;
> +
> +		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
> +		     port++) {
> +			struct ibv_parent_domain_init_attr attr = {};
> +			struct rte_ether_addr addr;
> +			char address[64];
> +			char name[RTE_ETH_NAME_MAX_LEN];
> +
> +			ret = get_port_mac(ibdev, port, &addr);
> +			if (ret)
> +				continue;
> +
> +			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
> +				continue;
> +
> +			rte_ether_format_addr(address, sizeof(address), &addr);
> +			DRV_LOG(INFO, "device located port %u address %s",
> +				port, address);
> +			found_port = true;
> +
> +			priv = rte_zmalloc_socket(NULL, sizeof(*priv),
> +						  RTE_CACHE_LINE_SIZE,
> +						  SOCKET_ID_ANY);
> +			if (!priv) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			snprintf(name, sizeof(name), "%s_port%d",
> +				 pci_dev->device.name, port);
> +
> +			if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> +				int fd;
> +
> +				eth_dev = rte_eth_dev_attach_secondary(name);
> +				if (!eth_dev) {
> +					DRV_LOG(ERR, "Can't attach to dev %s",
> +						name);
> +					ret = -ENOMEM;
> +					goto failed;
> +				}
> +
> +				eth_dev->device = &pci_dev->device;
> +				eth_dev->dev_ops = &mana_dev_sec_ops;
> +				ret = mana_proc_priv_init(eth_dev);
> +				if (ret)
> +					goto failed;
> +				priv->process_priv = eth_dev->process_private;
> +
> +				/* Get the IB FD from the primary process */
> +				fd = mana_mp_req_verbs_cmd_fd(eth_dev);
> +				if (fd < 0) {
> +					DRV_LOG(ERR, "Failed to get FD %d", fd);
> +					ret = -ENODEV;
> +					goto failed;
> +				}
> +
> +				ret = mana_map_doorbell_secondary(eth_dev, fd);
> +				if (ret) {
> +					DRV_LOG(ERR, "Failed secondary map %d",
> +						fd);
> +					goto failed;
> +				}
> +
> +				/* fd is not used after mapping the doorbell */
> +				close(fd);
> +
> +				rte_spinlock_lock(&mana_shared_data->lock);
> +				mana_shared_data->secondary_cnt++;
> +				mana_local_data.secondary_cnt++;
> +				rte_spinlock_unlock(&mana_shared_data->lock);
> +
> +				rte_eth_copy_pci_info(eth_dev, pci_dev);
> +				rte_eth_dev_probing_finish(eth_dev);
> +
> +				/* Impossible to have more than one port
> +				 * matching a MAC address
> +				 */
> +				continue;
> +			}
> +
> +			eth_dev = rte_eth_dev_allocate(name);
> +			if (!eth_dev) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			eth_dev->data->mac_addrs =
> +				rte_calloc("mana_mac", 1,
> +					   sizeof(struct rte_ether_addr), 0);
> +			if (!eth_dev->data->mac_addrs) {
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			rte_ether_addr_copy(&addr, eth_dev->data->mac_addrs);
> +
> +			priv->ib_pd = ibv_alloc_pd(ctx);
> +			if (!priv->ib_pd) {
> +				DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			/* Create a parent domain with the port number */
> +			attr.pd = priv->ib_pd;
> +			attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
> +			attr.pd_context = (void *)(uint64_t)port;
> +			priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
> +			if (!priv->ib_parent_pd) {
> +				DRV_LOG(ERR,
> +					"ibv_alloc_parent_domain failed port %d",
> +					port);
> +				ret = -ENOMEM;
> +				goto failed;
> +			}
> +
> +			priv->ib_ctx = ctx;
> +			priv->port_id = eth_dev->data->port_id;
> +			priv->dev_port = port;
> +			eth_dev->data->dev_private = priv;
> +			priv->dev_data = eth_dev->data;
> +
> +			priv->max_rx_queues = dev_attr.orig_attr.max_qp;
> +			priv->max_tx_queues = dev_attr.orig_attr.max_qp;
> +
> +			priv->max_rx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +			priv->max_tx_desc =
> +				RTE_MIN(dev_attr.orig_attr.max_qp_wr,
> +					dev_attr.orig_attr.max_cqe);
> +
> +			priv->max_send_sge = dev_attr.orig_attr.max_sge;
> +			priv->max_recv_sge = dev_attr.orig_attr.max_sge;
> +
> +			priv->max_mr = dev_attr.orig_attr.max_mr;
> +			priv->max_mr_size = dev_attr.orig_attr.max_mr_size;
> +
> +			DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d\n",
> +				name, priv->max_rx_queues, priv->max_rx_desc,
> +				priv->max_send_sge);
> +
Message will end up double spaced, since DRV_LOG already adds newline.
You might also want to use %u rather than %d when the values are unsigned.
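For illustration, the same call with the trailing newline dropped (DRV_LOG
already appends one); the %d specifiers can stay as long as these priv fields
remain declared as int in mana.h:

	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
		name, priv->max_rx_queues, priv->max_rx_desc,
		priv->max_send_sge);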
 
- * [Patch v3 02/17] net/mana: add device configuration and stop
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adapter (MANA) PMD longli
  2022-07-07 20:30 ` [Patch v3 01/17] net/mana: add basic driver, build environment and doc longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 03/17] net/mana: add function to report supported ptypes longli
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA defines its own memory allocation functions to override the IB layer
default functions for allocating device queues. This patch adds the code for
device configuration and stop.
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Removed validation for offload settings in mana_dev_configure().
 drivers/net/mana/mana.c | 75 +++++++++++++++++++++++++++++++++++++++--
 drivers/net/mana/mana.h |  3 ++
 2 files changed, 76 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 63ec1f75f0..1ea2cecd37 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -40,7 +40,79 @@ static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
 int mana_logtype_driver;
 int mana_logtype_init;
 
+void *mana_alloc_verbs_buf(size_t size, void *data)
+{
+	void *ret;
+	size_t alignment = rte_mem_page_size();
+	int socket = (int)(uintptr_t)data;
+
+	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
+
+	if (alignment == (size_t)-1) {
+		DRV_LOG(ERR, "Failed to get mem page size");
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
+	if (!ret && size)
+		rte_errno = ENOMEM;
+	return ret;
+}
+
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+	rte_free(ptr);
+}
+
+static int mana_dev_configure(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
+
+	if (dev_conf->rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
+		dev_conf->rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
+
+	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
+		DRV_LOG(ERR, "Only support equal number of rx/tx queues");
+		return -EINVAL;
+	}
+
+	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
+		DRV_LOG(ERR, "number of TX/RX queues must be power of 2");
+		return -EINVAL;
+	}
+
+	priv->num_queues = dev->data->nb_rx_queues;
+
+	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
+				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
+					.alloc = &mana_alloc_verbs_buf,
+					.free = &mana_free_verbs_buf,
+					.data = 0,
+				}));
+
+	return 0;
+}
+
+static int
+mana_dev_close(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	ret = ibv_close_device(priv->ib_ctx);
+	if (ret) {
+		ret = errno;
+		return ret;
+	}
+
+	return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
+	.dev_configure		= mana_dev_configure,
+	.dev_close		= mana_dev_close,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
@@ -627,8 +699,7 @@ static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 static int mana_dev_uninit(struct rte_eth_dev *dev)
 {
-	RTE_SET_USED(dev);
-	return 0;
+	return mana_dev_close(dev);
 }
 
 static int mana_pci_remove(struct rte_pci_device *pci_dev)
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index e30c030b4e..66873394b9 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -207,4 +207,7 @@ int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
+void *mana_alloc_verbs_buf(size_t size, void *data);
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
+
 #endif
-- 
2.17.1
- * [Patch v3 03/17] net/mana: add function to report supported ptypes
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adapter (MANA) PMD longli
  2022-07-07 20:30 ` [Patch v3 01/17] net/mana: add basic driver, build environment and doc longli
  2022-07-07 20:30 ` [Patch v3 02/17] net/mana: add device configuration and stop longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 04/17] net/mana: add link update longli
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
Report supported protocol types.
Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/net/mana/mana.c | 16 ++++++++++++++++
 drivers/net/mana/mana.h |  2 --
 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 1ea2cecd37..5deea1b03a 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -110,9 +110,25 @@ mana_dev_close(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
+{
+	static const uint32_t ptypes[] = {
+		RTE_PTYPE_L2_ETHER,
+		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+		RTE_PTYPE_L4_FRAG,
+		RTE_PTYPE_L4_TCP,
+		RTE_PTYPE_L4_UDP,
+		RTE_PTYPE_UNKNOWN
+	};
+
+	return ptypes;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_close		= mana_dev_close,
+	.dev_supported_ptypes_get = mana_supported_ptypes,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 66873394b9..c433940022 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -168,8 +168,6 @@ extern int mana_logtype_init;
 
 #define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
 
-const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
-
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
-- 
2.17.1
- * [Patch v3 04/17] net/mana: add link update
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adapter (MANA) PMD longli
                   ` (2 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 03/17] net/mana: add function to report supported ptypes longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 05/17] net/mana: add function for device removal interrupts longli
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
The carrier state is managed by the Azure host. MANA runs as a VF and
always reports "up".
Signed-off-by: Long Li <longli@microsoft.com>
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c           | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index b92a27374c..62554b0a0a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Link status          = P
 Linux                = Y
 Multiprocess aware   = Y
 Usage doc            = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 5deea1b03a..8c6491f045 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -125,10 +125,27 @@ static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unuse
 	return ptypes;
 }
 
+static int mana_dev_link_update(struct rte_eth_dev *dev,
+				int wait_to_complete __rte_unused)
+{
+	struct rte_eth_link link;
+
+	/* MANA has no concept of carrier state, always reporting UP */
+	link = (struct rte_eth_link) {
+		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
+		.link_autoneg = RTE_ETH_LINK_SPEED_FIXED,
+		.link_speed = RTE_ETH_SPEED_NUM_200G,
+		.link_status = RTE_ETH_LINK_UP,
+	};
+
+	return rte_eth_linkstatus_set(dev, &link);
+}
+
 const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_close		= mana_dev_close,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
+	.link_update		= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
-- 
2.17.1
- * [Patch v3 05/17] net/mana: add function for device removal interrupts
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adapter (MANA) PMD longli
                   ` (3 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 04/17] net/mana: add link update longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 06/17] net/mana: add device info longli
                   ` (11 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA supports PCI hot plug events. Register this interrupt with the DPDK core
so its parent PMD can detect device removal during Azure servicing or live
migration.
Signed-off-by: Long Li <longli@microsoft.com>
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c           | 97 +++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h           |  1 +
 3 files changed, 99 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 62554b0a0a..8043e11f99 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,5 +7,6 @@
 Link status          = P
 Linux                = Y
 Multiprocess aware   = Y
+Removal event        = Y
 Usage doc            = Y
 x86-64               = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 8c6491f045..e15ecb8ea6 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -95,12 +95,18 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int mana_intr_uninstall(struct mana_priv *priv);
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
 	struct mana_priv *priv = dev->data->dev_private;
 	int ret;
 
+	ret = mana_intr_uninstall(priv);
+	if (ret)
+		return ret;
+
 	ret = ibv_close_device(priv->ib_ctx);
 	if (ret) {
 		ret = errno;
@@ -327,6 +333,90 @@ static int mana_ibv_device_to_pci_addr(const struct ibv_device *device,
 	return 0;
 }
 
+static void mana_intr_handler(void *arg)
+{
+	struct mana_priv *priv = arg;
+	struct ibv_context *ctx = priv->ib_ctx;
+	struct ibv_async_event event;
+
+	/* Read and ack all messages from IB device */
+	while (true) {
+		if (ibv_get_async_event(ctx, &event))
+			break;
+
+		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
+			struct rte_eth_dev *dev;
+
+			dev = &rte_eth_devices[priv->port_id];
+			if (dev->data->dev_conf.intr_conf.rmv)
+				rte_eth_dev_callback_process(dev,
+					RTE_ETH_EVENT_INTR_RMV, NULL);
+		}
+
+		ibv_ack_async_event(&event);
+	}
+}
+
+static int mana_intr_uninstall(struct mana_priv *priv)
+{
+	int ret;
+
+	ret = rte_intr_callback_unregister(priv->intr_handle,
+					   mana_intr_handler, priv);
+	if (ret <= 0) {
+		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
+		return ret;
+	}
+
+	rte_intr_instance_free(priv->intr_handle);
+
+	return 0;
+}
+
+static int mana_intr_install(struct mana_priv *priv)
+{
+	int ret, flags;
+	struct ibv_context *ctx = priv->ib_ctx;
+
+	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
+	if (!priv->intr_handle) {
+		DRV_LOG(ERR, "Failed to allocate intr_handle");
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+
+	rte_intr_fd_set(priv->intr_handle, -1);
+
+	flags = fcntl(ctx->async_fd, F_GETFL);
+	ret = fcntl(ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
+		goto free_intr;
+	}
+
+	rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
+	rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
+
+	ret = rte_intr_callback_register(priv->intr_handle,
+					 mana_intr_handler, priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to register intr callback");
+		rte_intr_fd_set(priv->intr_handle, -1);
+		goto restore_fd;
+	}
+
+	return 0;
+
+restore_fd:
+	fcntl(ctx->async_fd, F_SETFL, flags);
+
+free_intr:
+	rte_intr_instance_free(priv->intr_handle);
+	priv->intr_handle = NULL;
+
+	return ret;
+}
+
 static int mana_proc_priv_init(struct rte_eth_dev *dev)
 {
 	struct mana_process_priv *priv;
@@ -640,6 +730,13 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
 				name, priv->max_rx_queues, priv->max_rx_desc,
 				priv->max_send_sge);
 
+			/* Create async interrupt handler */
+			ret = mana_intr_install(priv);
+			if (ret) {
+				DRV_LOG(ERR, "Failed to install intr handler");
+				goto failed;
+			}
+
 			rte_spinlock_lock(&mana_shared_data->lock);
 			mana_shared_data->primary_cnt++;
 			rte_spinlock_unlock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index c433940022..f97eed2e81 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -72,6 +72,7 @@ struct mana_priv {
 	uint8_t ind_table_key[40];
 	struct ibv_qp *rwq_qp;
 	void *db_page;
+	struct rte_intr_handle *intr_handle;
 	int max_rx_queues;
 	int max_tx_queues;
 	int max_rx_desc;
-- 
2.17.1
- * [Patch v3 06/17] net/mana: add device info
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (4 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 05/17] net/mana: add function for device removal interrupts longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 07/17] net/mana: add function to configure RSS longli
                   ` (10 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
Add the function to get device info.
Signed-off-by: Long Li <longli@microsoft.com>
---
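
A minimal sketch of how an application sees the limits reported by this
op through the ethdev API (assuming EAL is initialized and port_id
refers to a probed MANA port; illustration only, not code added by the
patch):

    #include <stdio.h>
    #include <rte_ethdev.h>

    /* Print the limits reported by mana_dev_info_get(). */
    static void show_port_limits(uint16_t port_id)
    {
        struct rte_eth_dev_info dev_info;

        if (rte_eth_dev_info_get(port_id, &dev_info) == 0)
            printf("max_rxq %u max_txq %u reta %u rss_key %u\n",
                   dev_info.max_rx_queues, dev_info.max_tx_queues,
                   dev_info.reta_size, dev_info.hash_key_size);
    }
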
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c           | 82 +++++++++++++++++++++++++++++++
 2 files changed, 83 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 8043e11f99..566b3e8770 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,5 +8,6 @@ Link status          = P
 Linux                = Y
 Multiprocess aware   = Y
 Removal event        = Y
+Speed capabilities   = P
 Usage doc            = Y
 x86-64               = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index e15ecb8ea6..15950a27ee 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -116,6 +116,86 @@ mana_dev_close(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static int mana_dev_info_get(struct rte_eth_dev *dev,
+			     struct rte_eth_dev_info *dev_info)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	dev_info->max_mtu = RTE_ETHER_MTU;
+
+	/* RX params */
+	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
+	dev_info->max_rx_pktlen = MAX_FRAME_SIZE;
+
+	dev_info->max_rx_queues = priv->max_rx_queues;
+	dev_info->max_tx_queues = priv->max_tx_queues;
+
+	dev_info->max_mac_addrs = BNIC_MAX_MAC_ADDR;
+	dev_info->max_hash_mac_addrs = 0;
+
+	dev_info->max_vfs = 1;
+
+	/* Offload params */
+	dev_info->rx_offload_capa = BNIC_DEV_RX_OFFLOAD_SUPPORT;
+
+	dev_info->tx_offload_capa = BNIC_DEV_TX_OFFLOAD_SUPPORT;
+
+	/* RSS */
+	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
+	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
+	dev_info->flow_type_rss_offloads = BNIC_ETH_RSS_SUPPORT;
+
+	/* Thresholds */
+	dev_info->default_rxconf = (struct rte_eth_rxconf){
+		.rx_thresh = {
+			.pthresh = 8,
+			.hthresh = 8,
+			.wthresh = 0,
+		},
+		.rx_free_thresh = 32,
+		/* If no descriptors available, pkts are dropped by default */
+		.rx_drop_en = 1,
+	};
+
+	dev_info->default_txconf = (struct rte_eth_txconf){
+		.tx_thresh = {
+			.pthresh = 32,
+			.hthresh = 0,
+			.wthresh = 0,
+		},
+		.tx_rs_thresh = 32,
+		.tx_free_thresh = 32,
+	};
+
+	/* Buffer limits */
+	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+	dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
+	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+	dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
+	dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
+
+	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+	dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
+	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+	dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
+	dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
+
+	/* Speed */
+	dev_info->speed_capa = ETH_LINK_SPEED_100G;
+
+	/* RX params */
+	dev_info->default_rxportconf.burst_size = 1;
+	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
+	dev_info->default_rxportconf.nb_queues = 1;
+
+	/* TX params */
+	dev_info->default_txportconf.burst_size = 1;
+	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
+	dev_info->default_txportconf.nb_queues = 1;
+
+	return 0;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
 {
 	static const uint32_t ptypes[] = {
@@ -150,11 +230,13 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_close		= mana_dev_close,
+	.dev_infos_get		= mana_dev_info_get,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
 	.link_update		= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+	.dev_infos_get = mana_dev_info_get,
 };
 
 uint16_t
-- 
2.17.1
- * [Patch v3 07/17] net/mana: add function to configure RSS
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (5 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 06/17] net/mana: add device info longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 08/17] net/mana: add function to configure RX queues longli
                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
Currently this PMD supports RSS configuration only when the device is stopped.
Configuring RSS while the device is running will be supported in the future.
Signed-off-by: Long Li <longli@microsoft.com>
---
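
A usage sketch of the constraint above: the key and hash types are set
while the port is stopped, before rte_eth_dev_start(). The hash types
below are illustrative, and key_len must match the hash_key_size the
PMD reports:

    #include <rte_ethdev.h>

    /* Apply an RSS key/hash config while the port is stopped. */
    static int set_rss_while_stopped(uint16_t port_id,
                                     uint8_t *key, uint8_t key_len)
    {
        struct rte_eth_rss_conf rss_conf = {
            .rss_key = key,
            .rss_key_len = key_len, /* must equal dev_info.hash_key_size */
            .rss_hf = RTE_ETH_RSS_IP | RTE_ETH_RSS_TCP | RTE_ETH_RSS_UDP,
        };

        return rte_eth_dev_rss_hash_update(port_id, &rss_conf);
    }
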
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c           | 61 +++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h           |  1 +
 3 files changed, 63 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 566b3e8770..a59c21cc10 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,6 +8,7 @@ Link status          = P
 Linux                = Y
 Multiprocess aware   = Y
 Removal event        = Y
+RSS hash             = Y
 Speed capabilities   = P
 Usage doc            = Y
 x86-64               = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 15950a27ee..6563fe3661 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -211,6 +211,65 @@ static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unuse
 	return ptypes;
 }
 
+static int mana_rss_hash_update(struct rte_eth_dev *dev,
+				struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	/* Currently can only update RSS hash when device is stopped */
+	if (dev->data->dev_started) {
+		DRV_LOG(ERR, "Can't update RSS after device has started");
+		return -ENODEV;
+	}
+
+	if (rss_conf->rss_hf & ~BNIC_ETH_RSS_SUPPORT) {
+		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
+			dev->data->port_id, rss_conf->rss_hf);
+		return -EINVAL;
+	}
+
+	if (rss_conf->rss_key && rss_conf->rss_key_len) {
+		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
+			DRV_LOG(ERR, "Port %u key len must be %u long",
+				dev->data->port_id,
+				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
+			return -EINVAL;
+		}
+
+		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
+		priv->rss_conf.rss_key =
+			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
+				    RTE_CACHE_LINE_SIZE);
+		if (!priv->rss_conf.rss_key)
+			return -ENOMEM;
+		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
+		       rss_conf->rss_key_len);
+	}
+	priv->rss_conf.rss_hf = rss_conf->rss_hf;
+
+	return 0;
+}
+
+static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
+				  struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	if (!rss_conf)
+		return -EINVAL;
+
+	if (rss_conf->rss_key &&
+	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
+		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
+		       priv->rss_conf.rss_key_len);
+	}
+
+	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
+	rss_conf->rss_hf = priv->rss_conf.rss_hf;
+
+	return 0;
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
 				int wait_to_complete __rte_unused)
 {
@@ -232,6 +291,8 @@ const struct eth_dev_ops mana_dev_ops = {
 	.dev_close		= mana_dev_close,
 	.dev_infos_get		= mana_dev_info_get,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
+	.rss_hash_update	= mana_rss_hash_update,
+	.rss_hash_conf_get	= mana_rss_hash_conf_get,
 	.link_update		= mana_dev_link_update,
 };
 
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index f97eed2e81..33f68b3d1b 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -72,6 +72,7 @@ struct mana_priv {
 	uint8_t ind_table_key[40];
 	struct ibv_qp *rwq_qp;
 	void *db_page;
+	struct rte_eth_rss_conf rss_conf;
 	struct rte_intr_handle *intr_handle;
 	int max_rx_queues;
 	int max_tx_queues;
-- 
2.17.1
- * [Patch v3 08/17] net/mana: add function to configure RX queues
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (6 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 07/17] net/mana: add function to configure RSS longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 09/17] net/mana: add function to configure TX queues longli
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
The RX hardware queue is allocated when the queue is started. This function
handles the queue configuration done before the queue is started.
Signed-off-by: Long Li <longli@microsoft.com>
---
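
The application-side call this op serves, as a sketch (the pool name
and sizes below are illustrative, not mandated by the driver):

    #include <rte_errno.h>
    #include <rte_ethdev.h>
    #include <rte_lcore.h>
    #include <rte_mbuf.h>

    /* Set up RX queue 0 backed by a fresh mbuf pool. */
    static int setup_one_rxq(uint16_t port_id, uint16_t nb_desc)
    {
        struct rte_mempool *mp;

        mp = rte_pktmbuf_pool_create("rxq0_pool", 8192, 256, 0,
                                     RTE_MBUF_DEFAULT_BUF_SIZE,
                                     rte_socket_id());
        if (mp == NULL)
            return -rte_errno;

        /* NULL rx_conf: the defaults from mana_dev_info_get() apply. */
        return rte_eth_rx_queue_setup(port_id, 0, nb_desc,
                                      rte_socket_id(), NULL, mp);
    }
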
 drivers/net/mana/mana.c | 68 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 6563fe3661..eb468789d2 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,16 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+				   struct rte_eth_rxq_info *qinfo)
+{
+	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
+
+	qinfo->mp = rxq->mp;
+	qinfo->nb_desc = rxq->num_desc;
+	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
 {
 	static const uint32_t ptypes[] = {
@@ -270,6 +280,61 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
+				   uint16_t queue_idx, uint16_t nb_desc,
+				   unsigned int socket_id,
+				   const struct rte_eth_rxconf *rx_conf __rte_unused,
+				   struct rte_mempool *mp)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	struct mana_rxq *rxq;
+	int ret;
+
+	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
+	if (!rxq) {
+		DRV_LOG(ERR, "failed to allocate rxq");
+		return -ENOMEM;
+	}
+
+	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
+		queue_idx, nb_desc, socket_id);
+
+	rxq->socket = socket_id;
+
+	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
+					    sizeof(struct mana_rxq_desc) *
+						nb_desc,
+					    RTE_CACHE_LINE_SIZE, socket_id);
+
+	if (!rxq->desc_ring) {
+		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	rxq->num_desc = nb_desc;
+
+	rxq->priv = priv;
+	rxq->num_desc = nb_desc;
+	rxq->mp = mp;
+	dev->data->rx_queues[queue_idx] = rxq;
+
+	return 0;
+
+fail:
+	rte_free(rxq->desc_ring);
+	rte_free(rxq);
+	return ret;
+}
+
+static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_rxq *rxq = dev->data->rx_queues[qid];
+
+	rte_free(rxq->desc_ring);
+	rte_free(rxq);
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
 				int wait_to_complete __rte_unused)
 {
@@ -290,9 +355,12 @@ const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_close		= mana_dev_close,
 	.dev_infos_get		= mana_dev_info_get,
+	.rxq_info_get		= mana_dev_rx_queue_info,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
 	.rss_hash_update	= mana_rss_hash_update,
 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
+	.rx_queue_setup		= mana_dev_rx_queue_setup,
+	.rx_queue_release	= mana_dev_rx_queue_release,
 	.link_update		= mana_dev_link_update,
 };
 
-- 
2.17.1
- * [Patch v3 09/17] net/mana: add function to configure TX queues
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (7 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 08/17] net/mana: add function to configure RX queues longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 10/17] net/mana: implement memory registration longli
                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
The TX hardware queue is allocated when the queue is started; this function
handles the configuration done before the queue is started.
Signed-off-by: Long Li <longli@microsoft.com>
---
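
The matching application-side sketch; nb_desc is expected to stay
within the tx_desc_lim values reported by the PMD:

    #include <rte_ethdev.h>
    #include <rte_lcore.h>

    /* Set up TX queue 0; NULL tx_conf uses the defaults from dev_info. */
    static int setup_one_txq(uint16_t port_id, uint16_t nb_desc)
    {
        return rte_eth_tx_queue_setup(port_id, 0, nb_desc,
                                      rte_socket_id(), NULL);
    }
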
 drivers/net/mana/mana.c | 65 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index eb468789d2..95ef322c95 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,15 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static void mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+			    struct rte_eth_txq_info *qinfo)
+{
+	struct mana_txq *txq = dev->data->tx_queues[queue_id];
+
+	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+	qinfo->nb_desc = txq->num_desc;
+}
+
 static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
 				   struct rte_eth_rxq_info *qinfo)
 {
@@ -280,6 +289,59 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
+				   uint16_t queue_idx, uint16_t nb_desc,
+				   unsigned int socket_id,
+				   const struct rte_eth_txconf *tx_conf __rte_unused)
+
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	struct mana_txq *txq;
+	int ret;
+
+	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
+	if (!txq) {
+		DRV_LOG(ERR, "failed to allocate txq");
+		return -ENOMEM;
+	}
+
+	txq->socket = socket_id;
+
+	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
+					   sizeof(struct mana_txq_desc) *
+						nb_desc,
+					   RTE_CACHE_LINE_SIZE, socket_id);
+	if (!txq->desc_ring) {
+		DRV_LOG(ERR, "failed to allocate txq desc_ring");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
+		queue_idx, nb_desc, socket_id, txq->desc_ring);
+
+	txq->desc_ring_head = 0;
+	txq->desc_ring_tail = 0;
+	txq->priv = priv;
+	txq->num_desc = nb_desc;
+	dev->data->tx_queues[queue_idx] = txq;
+
+	return 0;
+
+fail:
+	rte_free(txq->desc_ring);
+	rte_free(txq);
+	return ret;
+}
+
+static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_txq *txq = dev->data->tx_queues[qid];
+
+	rte_free(txq->desc_ring);
+	rte_free(txq);
+}
+
 static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
 				   uint16_t queue_idx, uint16_t nb_desc,
 				   unsigned int socket_id,
@@ -355,10 +417,13 @@ const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_close		= mana_dev_close,
 	.dev_infos_get		= mana_dev_info_get,
+	.txq_info_get		= mana_dev_tx_queue_info,
 	.rxq_info_get		= mana_dev_rx_queue_info,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
 	.rss_hash_update	= mana_rss_hash_update,
 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
+	.tx_queue_setup		= mana_dev_tx_queue_setup,
+	.tx_queue_release	= mana_dev_tx_queue_release,
 	.rx_queue_setup		= mana_dev_rx_queue_setup,
 	.rx_queue_release	= mana_dev_rx_queue_release,
 	.link_update		= mana_dev_link_update,
-- 
2.17.1
- * [Patch v3 10/17] net/mana: implement memory registration
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (8 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 09/17] net/mana: add function to configure TX queues longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 21:50   ` Stephen Hemminger
  2022-07-07 20:30 ` [Patch v3 11/17] net/mana: implement the hardware layer operations longli
                   ` (6 subsequent siblings)
  16 siblings, 1 reply; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA hardware has a built-in IOMMU that provides safe hardware access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache: one per queue and one per port.
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Change all header file functions to start with mana_.
Use a spinlock in place of a rwlock for memory cache access.
Remove unused header files.
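
A sketch of how a transmit path could consume the cache added here;
mana_txq_get_lkey() is a hypothetical helper for illustration and
UINT32_MAX is only used as a failure marker in this example:

    #include "mana.h"

    /* Resolve the lkey for an mbuf: per-queue B-tree first, then the
     * per-port B-tree under its lock; a miss registers the whole
     * mempool (all done inside mana_find_pmd_mr()).
     */
    static uint32_t mana_txq_get_lkey(struct mana_txq *txq,
                                      struct mana_priv *priv,
                                      struct rte_mbuf *mbuf)
    {
        struct mana_mr_cache *mr;

        mr = mana_find_pmd_mr(&txq->mr_btree, priv, mbuf);

        return mr ? mr->lkey : UINT32_MAX; /* failure marker for the sketch */
    }
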
 drivers/net/mana/mana.c      |  20 +++
 drivers/net/mana/mana.h      |  39 +++++
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/mp.c        |  85 +++++++++
 drivers/net/mana/mr.c        | 324 +++++++++++++++++++++++++++++++++++
 5 files changed, 469 insertions(+)
 create mode 100644 drivers/net/mana/mr.c
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 95ef322c95..24741197c9 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_close(struct rte_eth_dev *dev)
 	struct mana_priv *priv = dev->data->dev_private;
 	int ret;
 
+	mana_remove_all_mr(priv);
+
 	ret = mana_intr_uninstall(priv);
 	if (ret)
 		return ret;
@@ -317,6 +319,13 @@ static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
 		goto fail;
 	}
 
+	ret = mana_mr_btree_init(&txq->mr_btree,
+				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init TXQ MR btree");
+		goto fail;
+	}
+
 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
 		queue_idx, nb_desc, socket_id, txq->desc_ring);
 
@@ -338,6 +347,8 @@ static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
 	struct mana_txq *txq = dev->data->tx_queues[qid];
 
+	mana_mr_btree_free(&txq->mr_btree);
+
 	rte_free(txq->desc_ring);
 	rte_free(txq);
 }
@@ -374,6 +385,13 @@ static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
 		goto fail;
 	}
 
+	ret = mana_mr_btree_init(&rxq->mr_btree,
+				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init RXQ MR btree");
+		goto fail;
+	}
+
 	rxq->num_desc = nb_desc;
 
 	rxq->priv = priv;
@@ -393,6 +411,8 @@ static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
 
+	mana_mr_btree_free(&rxq->mr_btree);
+
 	rte_free(rxq->desc_ring);
 	rte_free(rxq);
 }
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 33f68b3d1b..9e15b43275 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -50,6 +50,22 @@ struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE	256
 #define MAX_SEND_BUFFERS_PER_QUEUE	256
 
+struct mana_mr_cache {
+	uint32_t	lkey;
+	uintptr_t	addr;
+	size_t		len;
+	void		*verb_obj;
+};
+
+#define MANA_MR_BTREE_CACHE_N	512
+struct mana_mr_btree {
+	uint16_t	len;	/* Used entries */
+	uint16_t	size;	/* Total entries */
+	int		overflow;
+	int		socket;
+	struct mana_mr_cache *table;
+};
+
 struct mana_process_priv {
 	void *db_page;
 };
@@ -82,6 +98,8 @@ struct mana_priv {
 	int max_recv_sge;
 	int max_mr;
 	uint64_t max_mr_size;
+	struct mana_mr_btree mr_btree;
+	rte_spinlock_t	mr_btree_lock;
 };
 
 struct mana_txq_desc {
@@ -131,6 +149,7 @@ struct mana_txq {
 	uint32_t desc_ring_head, desc_ring_tail;
 
 	struct mana_stats stats;
+	struct mana_mr_btree mr_btree;
 	unsigned int socket;
 };
 
@@ -153,6 +172,7 @@ struct mana_rxq {
 	struct mana_gdma_queue gdma_cq;
 
 	struct mana_stats stats;
+	struct mana_mr_btree mr_btree;
 
 	unsigned int socket;
 };
@@ -176,6 +196,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
+				       struct mana_priv *priv,
+				       struct rte_mbuf *mbuf);
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+		    struct rte_mempool *pool);
+void mana_remove_all_mr(struct mana_priv *priv);
+void mana_del_pmd_mr(struct mana_mr_cache *mr);
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
+			   struct rte_mempool_memhdr *memhdr, unsigned int idx);
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+					   uint16_t *idx,
+					   uintptr_t addr, size_t len);
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry);
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket);
+void mana_mr_btree_free(struct mana_mr_btree *bt);
+
 /** Request timeout for IPC. */
 #define MANA_MP_REQ_TIMEOUT_SEC 5
 
@@ -204,6 +242,7 @@ int mana_mp_init_secondary(void);
 void mana_mp_uninit_primary(void);
 void mana_mp_uninit_secondary(void);
 int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 81c4118f53..9771394370 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
 	'mana.c',
+	'mr.c',
 	'mp.c',
 )
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index d7580e8a28..f4f78d2787 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -12,6 +12,52 @@
 
 extern struct mana_shared_data *mana_shared_data;
 
+static int mana_mp_mr_create(struct mana_priv *priv, uintptr_t addr,
+			     uint32_t len)
+{
+	struct ibv_mr *ibv_mr;
+	int ret;
+	struct mana_mr_cache *mr;
+
+	ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)addr, len,
+			    IBV_ACCESS_LOCAL_WRITE);
+
+	if (!ibv_mr)
+		return -errno;
+
+	DRV_LOG(DEBUG, "MR (2nd) lkey %u addr %p len %zu",
+		ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+	mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+	if (!mr) {
+		DRV_LOG(ERR, "(2nd) Failed to allocate MR");
+		ret = -ENOMEM;
+		goto fail_alloc;
+	}
+	mr->lkey = ibv_mr->lkey;
+	mr->addr = (uintptr_t)ibv_mr->addr;
+	mr->len = ibv_mr->length;
+	mr->verb_obj = ibv_mr;
+
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+	if (ret) {
+		DRV_LOG(ERR, "(2nd) Failed to add to global MR btree");
+		goto fail_btree;
+	}
+
+	return 0;
+
+fail_btree:
+	rte_free(mr);
+
+fail_alloc:
+	ibv_dereg_mr(ibv_mr);
+
+	return ret;
+}
+
 static void mp_init_msg(struct rte_mp_msg *msg, enum mana_mp_req_type type,
 			int port_id)
 {
@@ -47,6 +93,12 @@ static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
 	mp_init_msg(&mp_res, param->type, param->port_id);
 
 	switch (param->type) {
+	case MANA_MP_REQ_CREATE_MR:
+		ret = mana_mp_mr_create(priv, param->addr, param->len);
+		res->result = ret;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
 	case MANA_MP_REQ_VERBS_CMD_FD:
 		mp_res.num_fds = 1;
 		mp_res.fds[0] = priv->ib_ctx->cmd_fd;
@@ -189,6 +241,39 @@ int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
 	return ret;
 }
 
+int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
+{
+	struct rte_mp_msg mp_req = { 0 };
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mana_mp_param *req = (struct mana_mp_param *)mp_req.param;
+	struct mana_mp_param *res;
+	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+
+	mp_init_msg(&mp_req, MANA_MP_REQ_CREATE_MR, priv->port_id);
+	req->addr = addr;
+	req->len = len;
+
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "Port %u request to primary failed",
+			req->port_id);
+		return ret;
+	}
+
+	if (mp_rep.nb_received != 1)
+		return -EPROTO;
+
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mana_mp_param *)mp_res->param;
+	ret = res->result;
+
+	free(mp_rep.msgs);
+
+	return ret;
+}
+
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 {
 	struct rte_mp_msg mp_req = { 0 };
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
new file mode 100644
index 0000000000..9f4f0fdc06
--- /dev/null
+++ b/drivers/net/mana/mr.c
@@ -0,0 +1,324 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_eal_paging.h>
+
+#include <infiniband/verbs.h>
+
+#include "mana.h"
+
+struct mana_range {
+	uintptr_t	start;
+	uintptr_t	end;
+	uint32_t	len;
+};
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp __rte_unused, void *opaque,
+			   struct rte_mempool_memhdr *memhdr, unsigned int idx)
+{
+	struct mana_range *ranges = opaque;
+	struct mana_range *range = &ranges[idx];
+	uint64_t page_size = rte_mem_page_size();
+
+	range->start = RTE_ALIGN_FLOOR((uintptr_t)memhdr->addr, page_size);
+	range->end = RTE_ALIGN_CEIL((uintptr_t)memhdr->addr + memhdr->len,
+				    page_size);
+	range->len = range->end - range->start;
+}
+
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+		    struct rte_mempool *pool)
+{
+	struct ibv_mr *ibv_mr;
+	struct mana_range ranges[pool->nb_mem_chunks];
+	uint32_t i;
+	struct mana_mr_cache *mr;
+	int ret;
+
+	rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
+
+	for (i = 0; i < pool->nb_mem_chunks; i++) {
+		if (ranges[i].len > priv->max_mr_size) {
+			DRV_LOG(ERR, "memory chunk size %u exceeding max MR\n",
+				ranges[i].len);
+			return -ENOMEM;
+		}
+
+		DRV_LOG(DEBUG,
+			"registering memory chunk start 0x%" PRIx64 " len %u",
+			ranges[i].start, ranges[i].len);
+
+		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+			/* Send a message to the primary to do MR */
+			ret = mana_mp_req_mr_create(priv, ranges[i].start,
+						    ranges[i].len);
+			if (ret) {
+				DRV_LOG(ERR,
+					"MR failed start 0x%" PRIx64 " len %u",
+					ranges[i].start, ranges[i].len);
+				return ret;
+			}
+			continue;
+		}
+
+		ibv_mr = ibv_reg_mr(priv->ib_pd, (void *)ranges[i].start,
+				    ranges[i].len, IBV_ACCESS_LOCAL_WRITE);
+		if (ibv_mr) {
+			DRV_LOG(DEBUG, "MR lkey %u addr %p len %" PRIu64,
+				ibv_mr->lkey, ibv_mr->addr, ibv_mr->length);
+
+			mr = rte_calloc("MANA MR", 1, sizeof(*mr), 0);
+			mr->lkey = ibv_mr->lkey;
+			mr->addr = (uintptr_t)ibv_mr->addr;
+			mr->len = ibv_mr->length;
+			mr->verb_obj = ibv_mr;
+
+			rte_spinlock_lock(&priv->mr_btree_lock);
+			ret = mana_mr_btree_insert(&priv->mr_btree, mr);
+			rte_spinlock_unlock(&priv->mr_btree_lock);
+			if (ret) {
+				ibv_dereg_mr(ibv_mr);
+				DRV_LOG(ERR, "Failed to add to global MR btree");
+				return ret;
+			}
+
+			ret = mana_mr_btree_insert(local_tree, mr);
+			if (ret) {
+				/* Don't need to clean up MR as it's already
+				 * in the global tree
+				 */
+				DRV_LOG(ERR, "Failed to add to local MR btree");
+				return ret;
+			}
+		} else {
+			DRV_LOG(ERR, "MR failed at 0x%" PRIx64 " len %u",
+				ranges[i].start, ranges[i].len);
+			return -errno;
+		}
+	}
+	return 0;
+}
+
+void mana_del_pmd_mr(struct mana_mr_cache *mr)
+{
+	int ret;
+	struct ibv_mr *ibv_mr = (struct ibv_mr *)mr->verb_obj;
+
+	ret = ibv_dereg_mr(ibv_mr);
+	if (ret)
+		DRV_LOG(ERR, "dereg MR failed ret %d", ret);
+}
+
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_mr_btree,
+				       struct mana_priv *priv,
+				       struct rte_mbuf *mbuf)
+{
+	struct rte_mempool *pool = mbuf->pool;
+	int ret, second_try = 0;
+	struct mana_mr_cache *mr;
+	uint16_t idx;
+
+	DRV_LOG(DEBUG, "finding mr for mbuf addr %p len %d",
+		mbuf->buf_addr, mbuf->buf_len);
+
+try_again:
+	/* First try to find the MR in local queue tree */
+	mr = mana_mr_btree_lookup(local_mr_btree, &idx,
+				  (uintptr_t)mbuf->buf_addr, mbuf->buf_len);
+	if (mr) {
+		DRV_LOG(DEBUG,
+			"Local mr lkey %u addr 0x%" PRIx64 " len %" PRIu64,
+			mr->lkey, mr->addr, mr->len);
+		return mr;
+	}
+
+	/* If not found, try to find the MR in global tree */
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	mr = mana_mr_btree_lookup(&priv->mr_btree, &idx,
+				  (uintptr_t)mbuf->buf_addr,
+				  mbuf->buf_len);
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+
+	/* If found in the global tree, add it to the local tree */
+	if (mr) {
+		ret = mana_mr_btree_insert(local_mr_btree, mr);
+		if (ret) {
+			DRV_LOG(DEBUG, "Failed to add MR to local tree.");
+			return NULL;
+		}
+
+		DRV_LOG(DEBUG,
+			"Added local MR key %u addr 0x%" PRIx64 " len %" PRIu64,
+			mr->lkey, mr->addr, mr->len);
+		return mr;
+	}
+
+	if (second_try) {
+		DRV_LOG(ERR, "Internal error second try failed");
+		return NULL;
+	}
+
+	ret = mana_new_pmd_mr(local_mr_btree, priv, pool);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to allocate MR ret %d addr %p len %d",
+			ret, mbuf->buf_addr, mbuf->buf_len);
+		return NULL;
+	}
+
+	second_try = 1;
+	goto try_again;
+}
+
+void mana_remove_all_mr(struct mana_priv *priv)
+{
+	struct mana_mr_btree *bt = &priv->mr_btree;
+	struct mana_mr_cache *mr;
+	struct ibv_mr *ibv_mr;
+	uint16_t i;
+
+	rte_spinlock_lock(&priv->mr_btree_lock);
+	/* Start with index 1 as the 1st entry is always NULL */
+	for (i = 1; i < bt->len; i++) {
+		mr = &bt->table[i];
+		ibv_mr = mr->verb_obj;
+		ibv_dereg_mr(ibv_mr);
+	}
+	bt->len = 1;
+	rte_spinlock_unlock(&priv->mr_btree_lock);
+}
+
+static int mana_mr_btree_expand(struct mana_mr_btree *bt, int n)
+{
+	void *mem;
+
+	mem = rte_realloc_socket(bt->table, n * sizeof(struct mana_mr_cache),
+				 0, bt->socket);
+	if (!mem) {
+		DRV_LOG(ERR, "Failed to expand btree size %d", n);
+		return -1;
+	}
+
+	DRV_LOG(ERR, "Expanded btree to size %d", n);
+	bt->table = mem;
+	bt->size = n;
+
+	return 0;
+}
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+					   uint16_t *idx,
+					   uintptr_t addr, size_t len)
+{
+	struct mana_mr_cache *table;
+	uint16_t n;
+	uint16_t base = 0;
+	int ret;
+
+	n = bt->len;
+
+	/* Try to double the cache if it's full */
+	if (n == bt->size) {
+		ret = mana_mr_btree_expand(bt, bt->size << 1);
+		if (ret)
+			return NULL;
+	}
+
+	table = bt->table;
+
+	/* Do binary search on addr */
+	do {
+		uint16_t delta = n >> 1;
+
+		if (addr < table[base + delta].addr) {
+			n = delta;
+		} else {
+			base += delta;
+			n -= delta;
+		}
+	} while (n > 1);
+
+	*idx = base;
+
+	if (addr + len <= table[base].addr + table[base].len)
+		return &table[base];
+
+	DRV_LOG(DEBUG,
+		"addr 0x%" PRIx64 " len %zu idx %u sum 0x%" PRIx64 " not found",
+		addr, len, *idx, addr + len);
+
+	return NULL;
+}
+
+int mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
+{
+	memset(bt, 0, sizeof(*bt));
+	bt->table = rte_calloc_socket("MANA B-tree table",
+				      n,
+				      sizeof(struct mana_mr_cache),
+				      0, socket);
+	if (!bt->table) {
+		DRV_LOG(ERR, "Failed to allocate B-tree n %d socket %d",
+			n, socket);
+		return -ENOMEM;
+	}
+
+	bt->socket = socket;
+	bt->size = n;
+
+	/* First entry must be NULL for binary search to work */
+	bt->table[0] = (struct mana_mr_cache) {
+		.lkey = UINT32_MAX,
+	};
+	bt->len = 1;
+
+	DRV_LOG(ERR, "B-tree initialized table %p size %d len %d",
+		bt->table, n, bt->len);
+
+	return 0;
+}
+
+void mana_mr_btree_free(struct mana_mr_btree *bt)
+{
+	rte_free(bt->table);
+	memset(bt, 0, sizeof(*bt));
+}
+
+int mana_mr_btree_insert(struct mana_mr_btree *bt, struct mana_mr_cache *entry)
+{
+	struct mana_mr_cache *table;
+	uint16_t idx = 0;
+	uint16_t shift;
+
+	if (mana_mr_btree_lookup(bt, &idx, entry->addr, entry->len)) {
+		DRV_LOG(DEBUG, "Addr 0x%" PRIx64 " len %zu exists in btree",
+			entry->addr, entry->len);
+		return 0;
+	}
+
+	if (bt->len >= bt->size) {
+		bt->overflow = 1;
+		return -1;
+	}
+
+	table = bt->table;
+
+	idx++;
+	shift = (bt->len - idx) * sizeof(struct mana_mr_cache);
+	if (shift) {
+		DRV_LOG(DEBUG, "Moving %u bytes from idx %u to %u",
+			shift, idx, idx + 1);
+		memmove(&table[idx + 1], &table[idx], shift);
+	}
+
+	table[idx] = *entry;
+	bt->len++;
+
+	DRV_LOG(DEBUG,
+		"Inserted MR b-tree table %p idx %d addr 0x%" PRIx64 " len %zu",
+		table, idx, entry->addr, entry->len);
+
+	return 0;
+}
-- 
2.17.1
- * Re: [Patch v3 10/17] net/mana: implement memory registration
  2022-07-07 20:30 ` [Patch v3 10/17] net/mana: implement memory registration longli
@ 2022-07-07 21:50   ` Stephen Hemminger
  2022-07-07 22:12     ` Long Li
  0 siblings, 1 reply; 23+ messages in thread
From: Stephen Hemminger @ 2022-07-07 21:50 UTC (permalink / raw)
  To: longli; +Cc: longli, Ferruh Yigit, dev, Ajay Sharma, Stephen Hemminger
On Thu,  7 Jul 2022 13:30:15 -0700
longli@linuxonhyperv.com wrote:
> +int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
> +		    struct rte_mempool *pool)
> +{
> +	struct ibv_mr *ibv_mr;
> +	struct mana_range ranges[pool->nb_mem_chunks];
> +	uint32_t i;
> +	struct mana_mr_cache *mr;
> +	int ret;
> +
> +	rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
> +
> +	for (i = 0; i < pool->nb_mem_chunks; i++) {
> +		if (ranges[i].len > priv->max_mr_size) {
> +			DRV_LOG(ERR, "memory chunk size %u exceeding max MR\n",
> +				ranges[i].len);
> +			return -ENOMEM;
Did a quick search for extra newlines.
Looks like this message will end up double spaced in log.
DRV_LOG already adds a newline.
- * RE: [Patch v3 10/17] net/mana: implement memory registration
  2022-07-07 21:50   ` Stephen Hemminger
@ 2022-07-07 22:12     ` Long Li
  0 siblings, 0 replies; 23+ messages in thread
From: Long Li @ 2022-07-07 22:12 UTC (permalink / raw)
  To: Stephen Hemminger, longli
  Cc: Ferruh Yigit, dev, Ajay Sharma, Stephen Hemminger
> Subject: Re: [Patch v3 10/17] net/mana: implement memory registration
> 
> On Thu,  7 Jul 2022 13:30:15 -0700
> longli@linuxonhyperv.com wrote:
> 
> > +int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv
> *priv,
> > +		    struct rte_mempool *pool)
> > +{
> > +	struct ibv_mr *ibv_mr;
> > +	struct mana_range ranges[pool->nb_mem_chunks];
> > +	uint32_t i;
> > +	struct mana_mr_cache *mr;
> > +	int ret;
> > +
> > +	rte_mempool_mem_iter(pool, mana_mempool_chunk_cb, ranges);
> > +
> > +	for (i = 0; i < pool->nb_mem_chunks; i++) {
> > +		if (ranges[i].len > priv->max_mr_size) {
> > +			DRV_LOG(ERR, "memory chunk size %u exceeding max
> MR\n",
> > +				ranges[i].len);
> > +			return -ENOMEM;
> 
> Did a quick search for extra newlines.
> Looks like this message will end up double spaced in log.
> DRV_LOG already adds a newline.
Yes, I will remove all extra spaces in other places as well.
 
 
- * [Patch v3 11/17] net/mana: implement the hardware layer operations
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (9 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 10/17] net/mana: implement memory registration longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 12/17] net/mana: add function to start/stop TX queues longli
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
The hardware layer of MANA understands the device queue and doorbell
formats. These functions are implemented for use by the packet RX/TX code.
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Remove unused header files.
Rename a camel case.
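
A sketch of the calling pattern the later RX/TX patches are expected to
follow with these helpers; post_one_wqe() is hypothetical, the SGL/OOB
contents are left to the caller, and the tail value passed to the
doorbell is a placeholder:

    #include "mana.h"

    /* Post one prepared work request and make it visible to hardware. */
    static int post_one_wqe(struct mana_priv *priv,
                            struct mana_gdma_queue *sq,
                            struct gdma_work_request *req)
    {
        struct gdma_posted_wqe_info info;
        int ret;

        ret = gdma_post_work_request(sq, req, &info);
        if (ret)
            return ret; /* -EBUSY when the work queue is full */

        /* Tail encoding here is a placeholder; the TX patch defines it. */
        return mana_ring_doorbell(priv->db_page, gdma_queue_send,
                                  sq->id, sq->head);
    }
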
 drivers/net/mana/gdma.c      | 284 +++++++++++++++++++++++++++++++++++
 drivers/net/mana/mana.h      | 183 ++++++++++++++++++++++
 drivers/net/mana/meson.build |   1 +
 3 files changed, 468 insertions(+)
 create mode 100644 drivers/net/mana/gdma.c
diff --git a/drivers/net/mana/gdma.c b/drivers/net/mana/gdma.c
new file mode 100644
index 0000000000..077ac7744b
--- /dev/null
+++ b/drivers/net/mana/gdma.c
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <ethdev_driver.h>
+#include <rte_io.h>
+
+#include "mana.h"
+
+uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue)
+{
+	uint32_t offset_in_bytes =
+		(queue->head * GDMA_WQE_ALIGNMENT_UNIT_SIZE) &
+		(queue->size - 1);
+
+	DRV_LOG(DEBUG, "txq sq_head %u sq_size %u offset_in_bytes %u",
+		queue->head, queue->size, offset_in_bytes);
+
+	if (offset_in_bytes + GDMA_WQE_ALIGNMENT_UNIT_SIZE > queue->size)
+		DRV_LOG(ERR, "fatal error: offset_in_bytes %u too big",
+			offset_in_bytes);
+
+	return ((uint8_t *)queue->buffer) + offset_in_bytes;
+}
+
+static uint32_t
+write_dma_client_oob(uint8_t *work_queue_buffer_pointer,
+		     const struct gdma_work_request *work_request,
+		     uint32_t client_oob_size)
+{
+	uint8_t *p = work_queue_buffer_pointer;
+
+	struct gdma_wqe_dma_oob *header = (struct gdma_wqe_dma_oob *)p;
+
+	memset(header, 0, sizeof(struct gdma_wqe_dma_oob));
+	header->num_sgl_entries = work_request->num_sgl_elements;
+	header->inline_client_oob_size_in_dwords =
+		client_oob_size / sizeof(uint32_t);
+	header->client_data_unit = work_request->client_data_unit;
+
+	DRV_LOG(DEBUG, "queue buf %p sgl %u oob_h %u du %u oob_buf %p oob_b %u",
+		work_queue_buffer_pointer, header->num_sgl_entries,
+		header->inline_client_oob_size_in_dwords,
+		header->client_data_unit, work_request->inline_oob_data,
+		work_request->inline_oob_size_in_bytes);
+
+	p += sizeof(struct gdma_wqe_dma_oob);
+	if (work_request->inline_oob_data &&
+	    work_request->inline_oob_size_in_bytes > 0) {
+		memcpy(p, work_request->inline_oob_data,
+		       work_request->inline_oob_size_in_bytes);
+		if (client_oob_size > work_request->inline_oob_size_in_bytes)
+			memset(p + work_request->inline_oob_size_in_bytes, 0,
+			       client_oob_size -
+			       work_request->inline_oob_size_in_bytes);
+	}
+
+	return sizeof(struct gdma_wqe_dma_oob) + client_oob_size;
+}
+
+static uint32_t
+write_scatter_gather_list(uint8_t *work_queue_head_pointer,
+			  uint8_t *work_queue_end_pointer,
+			  uint8_t *work_queue_cur_pointer,
+			  struct gdma_work_request *work_request)
+{
+	struct gdma_sgl_element *sge_list;
+	struct gdma_sgl_element dummy_sgl[1];
+	uint8_t *address;
+	uint32_t size;
+	uint32_t num_sge;
+	uint32_t size_to_queue_end;
+	uint32_t sge_list_size;
+
+	DRV_LOG(DEBUG, "work_queue_cur_pointer %p work_request->flags %x",
+		work_queue_cur_pointer, work_request->flags);
+
+	num_sge = work_request->num_sgl_elements;
+	sge_list = work_request->sgl;
+	size_to_queue_end = (uint32_t)(work_queue_end_pointer -
+				       work_queue_cur_pointer);
+
+	if (num_sge == 0) {
+		/* Per spec, the case of an empty SGL should be handled as
+		 * follows to avoid corrupted WQE errors:
+		 * Write one dummy SGL entry
+		 * Set the address to 1, leave the rest as 0
+		 */
+		dummy_sgl[num_sge].address = 1;
+		dummy_sgl[num_sge].size = 0;
+		dummy_sgl[num_sge].memory_key = 0;
+		num_sge++;
+		sge_list = dummy_sgl;
+	}
+
+	sge_list_size = 0;
+	{
+		address = (uint8_t *)sge_list;
+		size = sizeof(struct gdma_sgl_element) * num_sge;
+		if (size_to_queue_end < size) {
+			memcpy(work_queue_cur_pointer, address,
+			       size_to_queue_end);
+			work_queue_cur_pointer = work_queue_head_pointer;
+			address += size_to_queue_end;
+			size -= size_to_queue_end;
+		}
+
+		memcpy(work_queue_cur_pointer, address, size);
+		sge_list_size = size;
+	}
+
+	DRV_LOG(DEBUG, "sge %u address 0x%" PRIx64 " size %u key %u list_s %u",
+		num_sge, sge_list->address, sge_list->size,
+		sge_list->memory_key, sge_list_size);
+
+	return sge_list_size;
+}
+
+int gdma_post_work_request(struct mana_gdma_queue *queue,
+			   struct gdma_work_request *work_req,
+			   struct gdma_posted_wqe_info *wqe_info)
+{
+	uint32_t client_oob_size =
+		work_req->inline_oob_size_in_bytes >
+				INLINE_OOB_SMALL_SIZE_IN_BYTES ?
+			INLINE_OOB_LARGE_SIZE_IN_BYTES :
+			INLINE_OOB_SMALL_SIZE_IN_BYTES;
+
+	uint32_t sgl_data_size = sizeof(struct gdma_sgl_element) *
+			RTE_MAX((uint32_t)1, work_req->num_sgl_elements);
+	uint32_t wqe_size =
+		RTE_ALIGN(sizeof(struct gdma_wqe_dma_oob) +
+				client_oob_size + sgl_data_size,
+			  GDMA_WQE_ALIGNMENT_UNIT_SIZE);
+	uint8_t *wq_buffer_pointer;
+	uint32_t queue_free_units = queue->count - (queue->head - queue->tail);
+
+	if (wqe_size / GDMA_WQE_ALIGNMENT_UNIT_SIZE > queue_free_units) {
+		DRV_LOG(DEBUG, "WQE size %u queue count %u head %u tail %u",
+			wqe_size, queue->count, queue->head, queue->tail);
+		return -EBUSY;
+	}
+
+	DRV_LOG(DEBUG, "client_oob_size %u sgl_data_size %u wqe_size %u",
+		client_oob_size, sgl_data_size, wqe_size);
+
+	if (wqe_info) {
+		wqe_info->wqe_index =
+			((queue->head * GDMA_WQE_ALIGNMENT_UNIT_SIZE) &
+			 (queue->size - 1)) / GDMA_WQE_ALIGNMENT_UNIT_SIZE;
+		wqe_info->unmasked_queue_offset = queue->head;
+		wqe_info->wqe_size_in_bu =
+			wqe_size / GDMA_WQE_ALIGNMENT_UNIT_SIZE;
+	}
+
+	wq_buffer_pointer = gdma_get_wqe_pointer(queue);
+	wq_buffer_pointer += write_dma_client_oob(wq_buffer_pointer, work_req,
+						  client_oob_size);
+	if (wq_buffer_pointer >= ((uint8_t *)queue->buffer) + queue->size)
+		wq_buffer_pointer -= queue->size;
+
+	write_scatter_gather_list((uint8_t *)queue->buffer,
+				  (uint8_t *)queue->buffer + queue->size,
+				  wq_buffer_pointer, work_req);
+
+	queue->head += wqe_size / GDMA_WQE_ALIGNMENT_UNIT_SIZE;
+
+	return 0;
+}
+
+union gdma_doorbell_entry {
+	uint64_t     as_uint64;
+
+	struct {
+		uint64_t id	  : 24;
+		uint64_t reserved    : 8;
+		uint64_t tail_ptr    : 31;
+		uint64_t arm	 : 1;
+	} cq;
+
+	struct {
+		uint64_t id	  : 24;
+		uint64_t wqe_cnt     : 8;
+		uint64_t tail_ptr    : 32;
+	} rq;
+
+	struct {
+		uint64_t id	  : 24;
+		uint64_t reserved    : 8;
+		uint64_t tail_ptr    : 32;
+	} sq;
+
+	struct {
+		uint64_t id	  : 16;
+		uint64_t reserved    : 16;
+		uint64_t tail_ptr    : 31;
+		uint64_t arm	 : 1;
+	} eq;
+}; /* HW DATA */
+
+#define DOORBELL_OFFSET_SQ      0x0
+#define DOORBELL_OFFSET_RQ      0x400
+#define DOORBELL_OFFSET_CQ      0x800
+#define DOORBELL_OFFSET_EQ      0xFF8
+
+int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
+		       uint32_t queue_id, uint32_t tail)
+{
+	uint8_t *addr = db_page;
+	union gdma_doorbell_entry e = {};
+
+	switch (queue_type) {
+	case gdma_queue_send:
+		e.sq.id = queue_id;
+		e.sq.tail_ptr = tail;
+		addr += DOORBELL_OFFSET_SQ;
+		break;
+
+	case gdma_queue_receive:
+		e.rq.id = queue_id;
+		e.rq.tail_ptr = tail;
+		e.rq.wqe_cnt = 1;
+		addr += DOORBELL_OFFSET_RQ;
+		break;
+
+	case gdma_queue_completion:
+		e.cq.id = queue_id;
+		e.cq.tail_ptr = tail;
+		e.cq.arm = 1;
+		addr += DOORBELL_OFFSET_CQ;
+		break;
+
+	default:
+		DRV_LOG(ERR, "Unsupported queue type %d", queue_type);
+		return -1;
+	}
+
+	rte_wmb();
+	DRV_LOG(DEBUG, "db_page %p addr %p queue_id %u type %u tail %u",
+		db_page, addr, queue_id, queue_type, tail);
+
+	rte_write64(e.as_uint64, addr);
+	return 0;
+}
+
+int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
+			       struct gdma_comp *comp)
+{
+	struct gdma_hardware_completion_entry *cqe;
+	uint32_t head = cq->head % cq->count;
+	uint32_t new_owner_bits, old_owner_bits;
+	uint32_t cqe_owner_bits;
+	struct gdma_hardware_completion_entry *buffer = cq->buffer;
+
+	cqe = &buffer[head];
+	new_owner_bits = (cq->head / cq->count) & COMPLETION_QUEUE_OWNER_MASK;
+	old_owner_bits = (cq->head / cq->count - 1) &
+				COMPLETION_QUEUE_OWNER_MASK;
+	cqe_owner_bits = cqe->owner_bits;
+
+	DRV_LOG(DEBUG, "comp cqe bits 0x%x owner bits 0x%x",
+		cqe_owner_bits, old_owner_bits);
+
+	if (cqe_owner_bits == old_owner_bits)
+		return 0; /* No new entry */
+
+	if (cqe_owner_bits != new_owner_bits) {
+		DRV_LOG(ERR, "CQ overflowed, ID %u cqe 0x%x new 0x%x",
+			cq->id, cqe_owner_bits, new_owner_bits);
+		return -1;
+	}
+
+	comp->work_queue_number = cqe->wq_num;
+	comp->send_work_queue = cqe->is_sq;
+
+	memcpy(comp->completion_data, cqe->dma_client_data, GDMA_COMP_DATA_SIZE);
+
+	cq->head++;
+
+	DRV_LOG(DEBUG, "comp new 0x%x old 0x%x cqe 0x%x wq %u sq %u head %u",
+		new_owner_bits, old_owner_bits, cqe_owner_bits,
+		comp->work_queue_number, comp->send_work_queue, cq->head);
+	return 1;
+}
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 9e15b43275..d87358ab15 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -50,6 +50,178 @@ struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE	256
 #define MAX_SEND_BUFFERS_PER_QUEUE	256
 
+#define GDMA_WQE_ALIGNMENT_UNIT_SIZE 32
+
+#define COMP_ENTRY_SIZE 64
+#define MAX_TX_WQE_SIZE 512
+#define MAX_RX_WQE_SIZE 256
+
+/* Values from the GDMA specification document, WQE format description */
+#define INLINE_OOB_SMALL_SIZE_IN_BYTES 8
+#define INLINE_OOB_LARGE_SIZE_IN_BYTES 24
+
+#define NOT_USING_CLIENT_DATA_UNIT 0
+
+enum gdma_queue_types {
+	gdma_queue_type_invalid = 0,
+	gdma_queue_send,
+	gdma_queue_receive,
+	gdma_queue_completion,
+	gdma_queue_event,
+	gdma_queue_type_max = 16,
+	/*Room for expansion */
+
+	/* This enum can be expanded to add more queue types but
+	 * it's expected to be done in a contiguous manner.
+	 * Failing that will result in unexpected behavior.
+	 */
+};
+
+#define WORK_QUEUE_NUMBER_BASE_BITS 10
+
+struct gdma_header {
+	/* size of the entire gdma structure, including the entire length of
+	 * the struct that is formed by extending other gdma struct. i.e.
+	 * GDMA_BASE_SPEC extends gdma_header, GDMA_EVENT_QUEUE_SPEC extends
+	 * GDMA_BASE_SPEC, StructSize for GDMA_EVENT_QUEUE_SPEC will be size of
+	 * GDMA_EVENT_QUEUE_SPEC which includes size of GDMA_BASE_SPEC and size
+	 * of gdma_header.
+	 * Above example is for illustration purpose and is not in code
+	 */
+	size_t struct_size;
+};
+
+/* The following macros are from GDMA SPEC 3.6, "Table 2: CQE data structure"
+ * and "Table 4: Event Queue Entry (EQE) data format"
+ */
+#define GDMA_COMP_DATA_SIZE 0x3C /* Must be a multiple of 4 */
+#define GDMA_COMP_DATA_SIZE_IN_UINT32 (GDMA_COMP_DATA_SIZE / 4)
+
+#define COMPLETION_QUEUE_ENTRY_WORK_QUEUE_INDEX 0
+#define COMPLETION_QUEUE_ENTRY_WORK_QUEUE_SIZE 24
+#define COMPLETION_QUEUE_ENTRY_SEND_WORK_QUEUE_INDEX 24
+#define COMPLETION_QUEUE_ENTRY_SEND_WORK_QUEUE_SIZE 1
+#define COMPLETION_QUEUE_ENTRY_OWNER_BITS_INDEX 29
+#define COMPLETION_QUEUE_ENTRY_OWNER_BITS_SIZE 3
+
+#define COMPLETION_QUEUE_OWNER_MASK \
+	((1 << (COMPLETION_QUEUE_ENTRY_OWNER_BITS_SIZE)) - 1)
+
+struct gdma_comp {
+	struct gdma_header gdma_header;
+
+	/* Filled by GDMA core */
+	uint32_t completion_data[GDMA_COMP_DATA_SIZE_IN_UINT32];
+
+	/* Filled by GDMA core */
+	uint32_t work_queue_number;
+
+	/* Filled by GDMA core */
+	bool send_work_queue;
+};
+
+struct gdma_hardware_completion_entry {
+	char dma_client_data[GDMA_COMP_DATA_SIZE];
+	union {
+		uint32_t work_queue_owner_bits;
+		struct {
+			uint32_t wq_num		: 24;
+			uint32_t is_sq		: 1;
+			uint32_t reserved	: 4;
+			uint32_t owner_bits	: 3;
+		};
+	};
+}; /* HW DATA */
+
+struct gdma_posted_wqe_info {
+	struct gdma_header gdma_header;
+
+	/* size of the written wqe in basic units (32B), filled by GDMA core.
+	 * Use this value to progress the work queue after the wqe is processed
+	 * by hardware.
+	 */
+	uint32_t wqe_size_in_bu;
+
+	/* At the time of writing the wqe to the work queue, the offset in the
+	 * work queue buffer where by the wqe will be written. Each unit
+	 * represents 32B of buffer space.
+	 */
+	uint32_t wqe_index;
+
+	/* Unmasked offset in the queue to which the WQE was written.
+	 * In 32 byte units.
+	 */
+	uint32_t unmasked_queue_offset;
+};
+
+struct gdma_sgl_element {
+	uint64_t address;
+	uint32_t memory_key;
+	uint32_t size;
+};
+
+#define MAX_SGL_ENTRIES_FOR_TRANSMIT 30
+
+struct one_sgl {
+	struct gdma_sgl_element gdma_sgl[MAX_SGL_ENTRIES_FOR_TRANSMIT];
+};
+
+struct gdma_work_request {
+	struct gdma_header gdma_header;
+	struct gdma_sgl_element *sgl;
+	uint32_t num_sgl_elements;
+	uint32_t inline_oob_size_in_bytes;
+	void *inline_oob_data;
+	uint32_t flags; /* From _gdma_work_request_FLAGS */
+	uint32_t client_data_unit; /* For LSO, this is the MTU of the data */
+};
+
+enum mana_cqe_type {
+	CQE_INVALID                     = 0,
+};
+
+struct mana_cqe_header {
+	uint32_t cqe_type    : 6;
+	uint32_t client_type : 2;
+	uint32_t vendor_err  : 24;
+}; /* HW DATA */
+
+/* NDIS HASH Types */
+#define BIT(nr)		(1 << (nr))
+#define NDIS_HASH_IPV4          BIT(0)
+#define NDIS_HASH_TCP_IPV4      BIT(1)
+#define NDIS_HASH_UDP_IPV4      BIT(2)
+#define NDIS_HASH_IPV6          BIT(3)
+#define NDIS_HASH_TCP_IPV6      BIT(4)
+#define NDIS_HASH_UDP_IPV6      BIT(5)
+#define NDIS_HASH_IPV6_EX       BIT(6)
+#define NDIS_HASH_TCP_IPV6_EX   BIT(7)
+#define NDIS_HASH_UDP_IPV6_EX   BIT(8)
+
+#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX)
+#define MANA_HASH_L4                                                         \
+	(NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 |      \
+	 NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
+
+struct gdma_wqe_dma_oob {
+	uint32_t reserved:24;
+	uint32_t last_v_bytes:8;
+	union {
+		uint32_t flags;
+		struct {
+			uint32_t num_sgl_entries:8;
+			uint32_t inline_client_oob_size_in_dwords:3;
+			uint32_t client_oob_in_sgl:1;
+			uint32_t consume_credit:1;
+			uint32_t fence:1;
+			uint32_t reserved1:2;
+			uint32_t client_data_unit:14;
+			uint32_t check_sn:1;
+			uint32_t sgl_direct:1;
+		};
+	};
+};
+
 struct mana_mr_cache {
 	uint32_t	lkey;
 	uintptr_t	addr;
@@ -190,12 +362,23 @@ extern int mana_logtype_init;
 
 #define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
 
+int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
+		       uint32_t queue_id, uint32_t tail);
+
+int gdma_post_work_request(struct mana_gdma_queue *queue,
+			   struct gdma_work_request *work_req,
+			   struct gdma_posted_wqe_info *wqe_info);
+uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue);
+
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
+int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
+			       struct gdma_comp *comp);
+
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
 				       struct mana_priv *priv,
 				       struct rte_mbuf *mbuf);
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 9771394370..364d57a619 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -12,6 +12,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 sources += files(
 	'mana.c',
 	'mr.c',
+	'gdma.c',
 	'mp.c',
 )
 
-- 
2.17.1
- * [Patch v3 12/17] net/mana: add function to start/stop TX queues
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (10 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 11/17] net/mana: implement the hardware layer operations longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 13/17] net/mana: add function to start/stop RX queues longli
                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA allocates device queues through the IB layer when starting the TX
queues. When the device is stopped, all the queues are unmapped and freed.
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.
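
For context, a sketch of where these queues live in the ethdev
lifecycle; bring_up_port() is hypothetical, RX queue setup is omitted
and the descriptor count is illustrative:

    #include <rte_ethdev.h>
    #include <rte_lcore.h>

    /* Queues are created at start time and destroyed at stop time. */
    static int bring_up_port(uint16_t port_id)
    {
        struct rte_eth_conf conf = { 0 };
        int ret;

        ret = rte_eth_dev_configure(port_id, 0, 1, &conf); /* TX-only for brevity */
        if (ret)
            return ret;

        ret = rte_eth_tx_queue_setup(port_id, 0, 256,
                                     rte_socket_id(), NULL);
        if (ret)
            return ret;

        return rte_eth_dev_start(port_id); /* IB CQ/QP created here */
    }
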
 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.h           |   4 +
 drivers/net/mana/meson.build      |   1 +
 drivers/net/mana/tx.c             | 157 ++++++++++++++++++++++++++++++
 4 files changed, 163 insertions(+)
 create mode 100644 drivers/net/mana/tx.c
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index a59c21cc10..821443b292 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,6 +7,7 @@
 Link status          = P
 Linux                = Y
 Multiprocess aware   = Y
+Queue start/stop     = Y
 Removal event        = Y
 RSS hash             = Y
 Speed capabilities   = P
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index d87358ab15..3613ba7ca2 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -379,6 +379,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
 			       struct gdma_comp *comp);
 
+int mana_start_tx_queues(struct rte_eth_dev *dev);
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev);
+
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
 				       struct mana_priv *priv,
 				       struct rte_mbuf *mbuf);
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 364d57a619..031f443d16 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
 	'mana.c',
+	'tx.c',
 	'mr.c',
 	'gdma.c',
 	'mp.c',
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
new file mode 100644
index 0000000000..db7859c8c4
--- /dev/null
+++ b/drivers/net/mana/tx.c
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include <ethdev_driver.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include "mana.h"
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int i;
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (txq->qp) {
+			ibv_destroy_qp(txq->qp);
+			txq->qp = NULL;
+		}
+
+		if (txq->cq) {
+			ibv_destroy_cq(txq->cq);
+			txq->cq = NULL;
+		}
+
+		/* Drain and free posted WQEs */
+		while (txq->desc_ring_tail != txq->desc_ring_head) {
+			struct mana_txq_desc *desc =
+				&txq->desc_ring[txq->desc_ring_tail];
+
+			rte_pktmbuf_free(desc->pkt);
+
+			txq->desc_ring_tail =
+				(txq->desc_ring_tail + 1) % txq->num_desc;
+		}
+		txq->desc_ring_head = 0;
+		txq->desc_ring_tail = 0;
+
+		memset(&txq->gdma_sq, 0, sizeof(txq->gdma_sq));
+		memset(&txq->gdma_cq, 0, sizeof(txq->gdma_cq));
+	}
+
+	return 0;
+}
+
+int mana_start_tx_queues(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret, i;
+
+	/* start TX queues */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_txq *txq;
+		struct ibv_qp_init_attr qp_attr = { 0 };
+		struct manadv_obj obj = {};
+		struct manadv_qp dv_qp;
+		struct manadv_cq dv_cq;
+
+		txq = dev->data->tx_queues[i];
+
+		manadv_set_context_attr(priv->ib_ctx,
+			MANADV_CTX_ATTR_BUF_ALLOCATORS,
+			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
+				.alloc = &mana_alloc_verbs_buf,
+				.free = &mana_free_verbs_buf,
+				.data = (void *)(uintptr_t)txq->socket,
+			}));
+
+		txq->cq = ibv_create_cq(priv->ib_ctx, txq->num_desc,
+					NULL, NULL, 0);
+		if (!txq->cq) {
+			DRV_LOG(ERR, "failed to create cq queue index %d", i);
+			ret = -errno;
+			goto fail;
+		}
+
+		qp_attr.send_cq = txq->cq;
+		qp_attr.recv_cq = txq->cq;
+		qp_attr.cap.max_send_wr = txq->num_desc;
+		qp_attr.cap.max_send_sge = priv->max_send_sge;
+
+		/* Skip setting qp_attr.cap.max_inline_data */
+
+		qp_attr.qp_type = IBV_QPT_RAW_PACKET;
+		qp_attr.sq_sig_all = 0;
+
+		txq->qp = ibv_create_qp(priv->ib_parent_pd, &qp_attr);
+		if (!txq->qp) {
+			DRV_LOG(ERR, "Failed to create qp queue index %d", i);
+			ret = -errno;
+			goto fail;
+		}
+
+		/* Get the addresses of CQ, QP and DB */
+		obj.qp.in = txq->qp;
+		obj.qp.out = &dv_qp;
+		obj.cq.in = txq->cq;
+		obj.cq.out = &dv_cq;
+		ret = manadv_init_obj(&obj, MANADV_OBJ_QP | MANADV_OBJ_CQ);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to get manadv objects");
+			goto fail;
+		}
+
+		txq->gdma_sq.buffer = obj.qp.out->sq_buf;
+		txq->gdma_sq.count = obj.qp.out->sq_count;
+		txq->gdma_sq.size = obj.qp.out->sq_size;
+		txq->gdma_sq.id = obj.qp.out->sq_id;
+
+		txq->tx_vp_offset = obj.qp.out->tx_vp_offset;
+		priv->db_page = obj.qp.out->db_page;
+		DRV_LOG(INFO, "txq sq id %u vp_offset %u db_page %p "
+				" buf %p count %u size %u",
+				txq->gdma_sq.id, txq->tx_vp_offset,
+				priv->db_page,
+				txq->gdma_sq.buffer, txq->gdma_sq.count,
+				txq->gdma_sq.size);
+
+		txq->gdma_cq.buffer = obj.cq.out->buf;
+		txq->gdma_cq.count = obj.cq.out->count;
+		txq->gdma_cq.size = txq->gdma_cq.count * COMP_ENTRY_SIZE;
+		txq->gdma_cq.id = obj.cq.out->cq_id;
+
+		/* CQ head starts with count (not 0) */
+		txq->gdma_cq.head = txq->gdma_cq.count;
+
+		DRV_LOG(INFO, "txq cq id %u buf %p count %u size %u head %u",
+			txq->gdma_cq.id, txq->gdma_cq.buffer,
+			txq->gdma_cq.count, txq->gdma_cq.size,
+			txq->gdma_cq.head);
+	}
+
+	return 0;
+
+fail:
+	mana_stop_tx_queues(dev);
+	return ret;
+}
+
+static inline uint16_t get_vsq_frame_num(uint32_t vsq)
+{
+	union {
+		uint32_t gdma_txq_id;
+		struct {
+			uint32_t reserved1	: 10;
+			uint32_t vsq_frame	: 14;
+			uint32_t reserved2	: 8;
+		};
+	} v;
+
+	v.gdma_txq_id = vsq;
+	return v.vsq_frame;
+}
-- 
2.17.1
- * [Patch v3 13/17] net/mana: add function to start/stop RX queues
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (11 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 12/17] net/mana: add function to start/stop TX queues longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 14/17] net/mana: add function to receive packets longli
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
MANA allocates device queues through the IB layer when starting RX queues.
When the device is stopped, all the queues are unmapped and freed.
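As a point of reference, below is a minimal, hypothetical application-side sketch (not part of this patch; names and parameters are illustrative) of the ethdev configuration that feeds this code: the rss_key and rss_hf supplied at configure time are what mana_start_rx_queues() maps onto the RWQ indirection table and the ibv_create_qp_ex() rx_hash_conf, and the per-queue setup determines how many ibv CQ/WQ pairs are created once dev_start is wired up later in this series.

/* Hypothetical application-side sketch, not part of this patch. */
#include <rte_ethdev.h>
#include <rte_mempool.h>

static int configure_rss_queues(uint16_t port_id, struct rte_mempool *pool,
				uint16_t nb_queues, uint16_t nb_desc)
{
	struct rte_eth_conf conf = {
		.rxmode = { .mq_mode = RTE_ETH_MQ_RX_RSS },
		.rx_adv_conf.rss_conf = {
			.rss_key = NULL,	/* NULL keeps the PMD default Toeplitz key */
			.rss_hf = RTE_ETH_RSS_IP | RTE_ETH_RSS_TCP | RTE_ETH_RSS_UDP,
		},
	};
	int ret;

	ret = rte_eth_dev_configure(port_id, nb_queues, nb_queues, &conf);
	if (ret)
		return ret;

	for (uint16_t q = 0; q < nb_queues; q++) {
		/* Each RX queue gets its own ibv CQ/WQ when the port starts */
		ret = rte_eth_rx_queue_setup(port_id, q, nb_desc,
					     rte_eth_dev_socket_id(port_id),
					     NULL, pool);
		if (ret)
			return ret;

		ret = rte_eth_tx_queue_setup(port_id, q, nb_desc,
					     rte_eth_dev_socket_id(port_id),
					     NULL);
		if (ret)
			return ret;
	}

	return 0;
}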
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.
 drivers/net/mana/mana.h      |   3 +
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/rx.c        | 345 +++++++++++++++++++++++++++++++++++
 3 files changed, 349 insertions(+)
 create mode 100644 drivers/net/mana/rx.c
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 3613ba7ca2..dc808d363f 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -364,6 +364,7 @@ extern int mana_logtype_init;
 
 int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
 		       uint32_t queue_id, uint32_t tail);
+int mana_rq_ring_doorbell(struct mana_rxq *rxq);
 
 int gdma_post_work_request(struct mana_gdma_queue *queue,
 			   struct gdma_work_request *work_req,
@@ -379,8 +380,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
 			       struct gdma_comp *comp);
 
+int mana_start_rx_queues(struct rte_eth_dev *dev);
 int mana_start_tx_queues(struct rte_eth_dev *dev);
 
+int mana_stop_rx_queues(struct rte_eth_dev *dev);
 int mana_stop_tx_queues(struct rte_eth_dev *dev);
 
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 031f443d16..62e103a510 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
 	'mana.c',
+	'rx.c',
 	'tx.c',
 	'mr.c',
 	'gdma.c',
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
new file mode 100644
index 0000000000..f0cab0d0c9
--- /dev/null
+++ b/drivers/net/mana/rx.c
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+#include <ethdev_driver.h>
+
+#include <infiniband/verbs.h>
+#include <infiniband/manadv.h>
+
+#include "mana.h"
+
+static uint8_t mana_rss_hash_key_default[TOEPLITZ_HASH_KEY_SIZE_IN_BYTES] = {
+	0x2c, 0xc6, 0x81, 0xd1,
+	0x5b, 0xdb, 0xf4, 0xf7,
+	0xfc, 0xa2, 0x83, 0x19,
+	0xdb, 0x1a, 0x3e, 0x94,
+	0x6b, 0x9e, 0x38, 0xd9,
+	0x2c, 0x9c, 0x03, 0xd1,
+	0xad, 0x99, 0x44, 0xa7,
+	0xd9, 0x56, 0x3d, 0x59,
+	0x06, 0x3c, 0x25, 0xf3,
+	0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+int mana_rq_ring_doorbell(struct mana_rxq *rxq)
+{
+	struct mana_priv *priv = rxq->priv;
+	int ret;
+	void *db_page = priv->db_page;
+
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct rte_eth_dev *dev =
+			&rte_eth_devices[priv->dev_data->port_id];
+		struct mana_process_priv *process_priv = dev->process_private;
+
+		db_page = process_priv->db_page;
+	}
+
+	ret = mana_ring_doorbell(db_page, gdma_queue_receive,
+				 rxq->gdma_rq.id,
+				 rxq->gdma_rq.head *
+					GDMA_WQE_ALIGNMENT_UNIT_SIZE);
+
+	if (ret)
+		DRV_LOG(ERR, "failed to ring RX doorbell ret %d", ret);
+
+	return ret;
+}
+
+static int mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+{
+	struct rte_mbuf *mbuf = NULL;
+	struct gdma_sgl_element sgl[1];
+	struct gdma_work_request request = {0};
+	struct gdma_posted_wqe_info wqe_info = {0};
+	struct mana_priv *priv = rxq->priv;
+	int ret;
+	struct mana_mr_cache *mr;
+
+	mbuf = rte_pktmbuf_alloc(rxq->mp);
+	if (!mbuf) {
+		rxq->stats.nombuf++;
+		return -ENOMEM;
+	}
+
+	mr = mana_find_pmd_mr(&rxq->mr_btree, priv, mbuf);
+	if (!mr) {
+		DRV_LOG(ERR, "failed to register RX MR");
+		rte_pktmbuf_free(mbuf);
+		return -ENOMEM;
+	}
+
+	request.gdma_header.struct_size = sizeof(request);
+	wqe_info.gdma_header.struct_size = sizeof(wqe_info);
+
+	sgl[0].address = rte_cpu_to_le_64(rte_pktmbuf_mtod(mbuf, uint64_t));
+	sgl[0].memory_key = mr->lkey;
+	sgl[0].size =
+		rte_pktmbuf_data_room_size(rxq->mp) -
+		RTE_PKTMBUF_HEADROOM;
+
+	request.sgl = sgl;
+	request.num_sgl_elements = 1;
+	request.inline_oob_data = NULL;
+	request.inline_oob_size_in_bytes = 0;
+	request.flags = 0;
+	request.client_data_unit = NOT_USING_CLIENT_DATA_UNIT;
+
+	ret = gdma_post_work_request(&rxq->gdma_rq, &request, &wqe_info);
+	if (!ret) {
+		struct mana_rxq_desc *desc =
+			&rxq->desc_ring[rxq->desc_ring_head];
+
+		/* update queue for tracking pending packets */
+		desc->pkt = mbuf;
+		desc->wqe_size_in_bu = wqe_info.wqe_size_in_bu;
+		rxq->desc_ring_head = (rxq->desc_ring_head + 1) % rxq->num_desc;
+	} else {
+		DRV_LOG(ERR, "failed to post recv ret %d", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int mana_alloc_and_post_rx_wqes(struct mana_rxq *rxq)
+{
+	int ret;
+
+	for (uint32_t i = 0; i < rxq->num_desc; i++) {
+		ret = mana_alloc_and_post_rx_wqe(rxq);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post RX ret = %d", ret);
+			return ret;
+		}
+	}
+
+	mana_rq_ring_doorbell(rxq);
+
+	return ret;
+}
+
+int mana_stop_rx_queues(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret, i;
+
+	if (priv->rwq_qp) {
+		ret = ibv_destroy_qp(priv->rwq_qp);
+		if (ret)
+			DRV_LOG(ERR, "rx_queue destroy_qp failed %d", ret);
+		priv->rwq_qp = NULL;
+	}
+
+	if (priv->ind_table) {
+		ret = ibv_destroy_rwq_ind_table(priv->ind_table);
+		if (ret)
+			DRV_LOG(ERR, "destroy rwq ind table failed %d", ret);
+		priv->ind_table = NULL;
+	}
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		if (rxq->wq) {
+			ret = ibv_destroy_wq(rxq->wq);
+			if (ret)
+				DRV_LOG(ERR,
+					"rx_queue destroy_wq failed %d", ret);
+			rxq->wq = NULL;
+		}
+
+		if (rxq->cq) {
+			ret = ibv_destroy_cq(rxq->cq);
+			if (ret)
+				DRV_LOG(ERR,
+					"rx_queue destroy_cq failed %d", ret);
+			rxq->cq = NULL;
+		}
+
+		/* Drain and free posted WQEs */
+		while (rxq->desc_ring_tail != rxq->desc_ring_head) {
+			struct mana_rxq_desc *desc =
+				&rxq->desc_ring[rxq->desc_ring_tail];
+
+			rte_pktmbuf_free(desc->pkt);
+
+			rxq->desc_ring_tail =
+				(rxq->desc_ring_tail + 1) % rxq->num_desc;
+		}
+		rxq->desc_ring_head = 0;
+		rxq->desc_ring_tail = 0;
+
+		memset(&rxq->gdma_rq, 0, sizeof(rxq->gdma_rq));
+		memset(&rxq->gdma_cq, 0, sizeof(rxq->gdma_cq));
+	}
+	return 0;
+}
+
+int mana_start_rx_queues(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret, i;
+	struct ibv_wq *ind_tbl[priv->num_queues];
+
+	DRV_LOG(INFO, "start rx queues");
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct ibv_wq_init_attr wq_attr = {};
+
+		manadv_set_context_attr(priv->ib_ctx,
+			MANADV_CTX_ATTR_BUF_ALLOCATORS,
+			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
+				.alloc = &mana_alloc_verbs_buf,
+				.free = &mana_free_verbs_buf,
+				.data = (void *)(uintptr_t)rxq->socket,
+			}));
+
+		rxq->cq = ibv_create_cq(priv->ib_ctx, rxq->num_desc,
+					NULL, NULL, 0);
+		if (!rxq->cq) {
+			ret = -errno;
+			DRV_LOG(ERR, "failed to create rx cq queue %d", i);
+			goto fail;
+		}
+
+		wq_attr.wq_type = IBV_WQT_RQ;
+		wq_attr.max_wr = rxq->num_desc;
+		wq_attr.max_sge = 1;
+		wq_attr.pd = priv->ib_parent_pd;
+		wq_attr.cq = rxq->cq;
+
+		rxq->wq = ibv_create_wq(priv->ib_ctx, &wq_attr);
+		if (!rxq->wq) {
+			ret = -errno;
+			DRV_LOG(ERR, "failed to create rx wq %d", i);
+			goto fail;
+		}
+
+		ind_tbl[i] = rxq->wq;
+	}
+
+	struct ibv_rwq_ind_table_init_attr ind_table_attr = {
+		.log_ind_tbl_size = rte_log2_u32(RTE_DIM(ind_tbl)),
+		.ind_tbl = ind_tbl,
+		.comp_mask = 0,
+	};
+
+	priv->ind_table = ibv_create_rwq_ind_table(priv->ib_ctx,
+						   &ind_table_attr);
+	if (!priv->ind_table) {
+		ret = -errno;
+		DRV_LOG(ERR, "failed to create ind_table ret %d", ret);
+		goto fail;
+	}
+
+	DRV_LOG(INFO, "ind_table handle %d num %d",
+		priv->ind_table->ind_tbl_handle,
+		priv->ind_table->ind_tbl_num);
+
+	struct ibv_qp_init_attr_ex qp_attr_ex = {
+		.comp_mask = IBV_QP_INIT_ATTR_PD |
+			     IBV_QP_INIT_ATTR_RX_HASH |
+			     IBV_QP_INIT_ATTR_IND_TABLE,
+		.qp_type = IBV_QPT_RAW_PACKET,
+		.pd = priv->ib_parent_pd,
+		.rwq_ind_tbl = priv->ind_table,
+		.rx_hash_conf = {
+			.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+			.rx_hash_key_len = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES,
+			.rx_hash_key = mana_rss_hash_key_default,
+			.rx_hash_fields_mask =
+				IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4,
+		},
+
+	};
+
+	/* overwrite default if rss key is set */
+	if (priv->rss_conf.rss_key_len && priv->rss_conf.rss_key)
+		qp_attr_ex.rx_hash_conf.rx_hash_key =
+			priv->rss_conf.rss_key;
+
+	/* overwrite default if rss hash fields are set */
+	if (priv->rss_conf.rss_hf) {
+		qp_attr_ex.rx_hash_conf.rx_hash_fields_mask = 0;
+
+		if (priv->rss_conf.rss_hf & ETH_RSS_IPV4)
+			qp_attr_ex.rx_hash_conf.rx_hash_fields_mask |=
+				IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_DST_IPV4;
+
+		if (priv->rss_conf.rss_hf & ETH_RSS_IPV6)
+			qp_attr_ex.rx_hash_conf.rx_hash_fields_mask |=
+				IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6;
+
+		if (priv->rss_conf.rss_hf &
+		    (ETH_RSS_NONFRAG_IPV4_TCP | ETH_RSS_NONFRAG_IPV6_TCP))
+			qp_attr_ex.rx_hash_conf.rx_hash_fields_mask |=
+				IBV_RX_HASH_SRC_PORT_TCP |
+				IBV_RX_HASH_DST_PORT_TCP;
+
+		if (priv->rss_conf.rss_hf &
+		    (ETH_RSS_NONFRAG_IPV4_UDP | ETH_RSS_NONFRAG_IPV6_UDP))
+			qp_attr_ex.rx_hash_conf.rx_hash_fields_mask |=
+				IBV_RX_HASH_SRC_PORT_UDP |
+				IBV_RX_HASH_DST_PORT_UDP;
+	}
+
+	priv->rwq_qp = ibv_create_qp_ex(priv->ib_ctx, &qp_attr_ex);
+	if (!priv->rwq_qp) {
+		ret = -errno;
+		DRV_LOG(ERR, "rx ibv_create_qp_ex failed");
+		goto fail;
+	}
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct manadv_obj obj = {};
+		struct manadv_cq dv_cq;
+		struct manadv_rwq dv_wq;
+
+		obj.cq.in = rxq->cq;
+		obj.cq.out = &dv_cq;
+		obj.rwq.in = rxq->wq;
+		obj.rwq.out = &dv_wq;
+		ret = manadv_init_obj(&obj, MANADV_OBJ_CQ | MANADV_OBJ_RWQ);
+		if (ret) {
+			DRV_LOG(ERR, "manadv_init_obj failed ret %d", ret);
+			goto fail;
+		}
+
+		rxq->gdma_cq.buffer = obj.cq.out->buf;
+		rxq->gdma_cq.count = obj.cq.out->count;
+		rxq->gdma_cq.size = rxq->gdma_cq.count * COMP_ENTRY_SIZE;
+		rxq->gdma_cq.id = obj.cq.out->cq_id;
+
+		/* CQ head starts with count */
+		rxq->gdma_cq.head = rxq->gdma_cq.count;
+
+		DRV_LOG(INFO, "rxq cq id %u buf %p count %u size %u",
+			rxq->gdma_cq.id, rxq->gdma_cq.buffer,
+			rxq->gdma_cq.count, rxq->gdma_cq.size);
+
+		priv->db_page = obj.rwq.out->db_page;
+
+		rxq->gdma_rq.buffer = obj.rwq.out->buf;
+		rxq->gdma_rq.count = obj.rwq.out->count;
+		rxq->gdma_rq.size = obj.rwq.out->size;
+		rxq->gdma_rq.id = obj.rwq.out->wq_id;
+
+		DRV_LOG(INFO, "rxq rq id %u buf %p count %u size %u",
+			rxq->gdma_rq.id, rxq->gdma_rq.buffer,
+			rxq->gdma_rq.count, rxq->gdma_rq.size);
+	}
+
+	for (i = 0; i < priv->num_queues; i++) {
+		ret = mana_alloc_and_post_rx_wqes(dev->data->rx_queues[i]);
+		if (ret)
+			goto fail;
+	}
+
+	return 0;
+
+fail:
+	mana_stop_rx_queues(dev);
+	return ret;
+}
-- 
2.17.1
- * [Patch v3 14/17] net/mana: add function to receive packets
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (12 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 13/17] net/mana: add function to start/stop RX queues longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 15/17] net/mana: add function to send packets longli
                   ` (2 subsequent siblings)
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
With all the RX queues created, MANA can use those queues to receive packets.
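For illustration, a hypothetical polling sketch (not part of this patch) of how an application drives mana_rx_burst() through the standard ethdev API; the checksum and RSS results noted in the comments come from the RX completion OOB parsing added here.

/* Hypothetical polling sketch, not part of this patch. */
#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32

static uint64_t poll_rx_once(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *pkts[BURST_SIZE];
	uint64_t bytes = 0;

	/* Dispatches to mana_rx_burst() once the port is started */
	uint16_t nb = rte_eth_rx_burst(port_id, queue_id, pkts, BURST_SIZE);

	for (uint16_t i = 0; i < nb; i++) {
		/* Checksum status is reported via RTE_MBUF_F_RX_IP_CKSUM_GOOD/BAD
		 * and RTE_MBUF_F_RX_L4_CKSUM_GOOD/BAD; the RSS hash is in
		 * hash.rss when RTE_MBUF_F_RX_RSS_HASH is set.
		 */
		bytes += pkts[i]->data_len;
		rte_pktmbuf_free(pkts[i]);
	}

	return bytes;
}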
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Add mana_ to all function names.
Rename a camel-case identifier.
 doc/guides/nics/features/mana.ini |   2 +
 drivers/net/mana/mana.c           |   2 +
 drivers/net/mana/mana.h           |  37 +++++++++++
 drivers/net/mana/mp.c             |   2 +
 drivers/net/mana/rx.c             | 104 ++++++++++++++++++++++++++++++
 5 files changed, 147 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 821443b292..fdbf22d335 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -6,6 +6,8 @@
 [Features]
 Link status          = P
 Linux                = Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
 Multiprocess aware   = Y
 Queue start/stop     = Y
 Removal event        = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 24741197c9..d255f79a87 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,8 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
 				/* fd is not used after mapping doorbell */
 				close(fd);
 
+				eth_dev->rx_pkt_burst = mana_rx_burst;
+
 				rte_spinlock_lock(&mana_shared_data->lock);
 				mana_shared_data->secondary_cnt++;
 				mana_local_data.secondary_cnt++;
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index dc808d363f..bafc4d6082 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -178,6 +178,11 @@ struct gdma_work_request {
 
 enum mana_cqe_type {
 	CQE_INVALID                     = 0,
+
+	CQE_RX_OKAY                     = 1,
+	CQE_RX_COALESCED_4              = 2,
+	CQE_RX_OBJECT_FENCE             = 3,
+	CQE_RX_TRUNCATED                = 4,
 };
 
 struct mana_cqe_header {
@@ -203,6 +208,35 @@ struct mana_cqe_header {
 	(NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 |      \
 	 NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
 
+struct mana_rx_comp_per_packet_info {
+	uint32_t packet_length	: 16;
+	uint32_t reserved0	: 16;
+	uint32_t reserved1;
+	uint32_t packet_hash;
+}; /* HW DATA */
+#define RX_COM_OOB_NUM_PACKETINFO_SEGMENTS 4
+
+struct mana_rx_comp_oob {
+	struct mana_cqe_header cqe_hdr;
+
+	uint32_t rx_vlan_id				: 12;
+	uint32_t rx_vlan_tag_present			: 1;
+	uint32_t rx_outer_ip_header_checksum_succeeded	: 1;
+	uint32_t rx_outer_ip_header_checksum_failed	: 1;
+	uint32_t reserved				: 1;
+	uint32_t rx_hash_type				: 9;
+	uint32_t rx_ip_header_checksum_succeeded	: 1;
+	uint32_t rx_ip_header_checksum_failed		: 1;
+	uint32_t rx_tcp_checksum_succeeded		: 1;
+	uint32_t rx_tcp_checksum_failed			: 1;
+	uint32_t rx_udp_checksum_succeeded		: 1;
+	uint32_t rx_udp_checksum_failed			: 1;
+	uint32_t reserved1				: 1;
+	struct mana_rx_comp_per_packet_info
+		packet_info[RX_COM_OOB_NUM_PACKETINFO_SEGMENTS];
+	uint32_t received_wqe_offset;
+}; /* HW DATA */
+
 struct gdma_wqe_dma_oob {
 	uint32_t reserved:24;
 	uint32_t last_v_bytes:8;
@@ -371,6 +405,9 @@ int gdma_post_work_request(struct mana_gdma_queue *queue,
 			   struct gdma_posted_wqe_info *wqe_info);
 uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue);
 
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
+		       uint16_t pkts_n);
+
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index f4f78d2787..36a88c561a 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,8 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
 	case MANA_MP_REQ_START_RXTX:
 		DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
 
+		dev->rx_pkt_burst = mana_rx_burst;
+
 		rte_mb();
 
 		res->result = 0;
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index f0cab0d0c9..9912f19977 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -343,3 +343,107 @@ int mana_start_rx_queues(struct rte_eth_dev *dev)
 	mana_stop_rx_queues(dev);
 	return ret;
 }
+
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	uint16_t pkt_received = 0, cqe_processed = 0;
+	struct mana_rxq *rxq = dpdk_rxq;
+	struct mana_priv *priv = rxq->priv;
+	struct gdma_comp comp;
+	struct rte_mbuf *mbuf;
+	int ret;
+
+	while (pkt_received < pkts_n &&
+	       gdma_poll_completion_queue(&rxq->gdma_cq, &comp) == 1) {
+		struct mana_rxq_desc *desc;
+		struct mana_rx_comp_oob *oob =
+			(struct mana_rx_comp_oob *)&comp.completion_data[0];
+
+		if (comp.work_queue_number != rxq->gdma_rq.id) {
+			DRV_LOG(ERR, "rxq comp id mismatch wqid=0x%x rcid=0x%x",
+				comp.work_queue_number, rxq->gdma_rq.id);
+			rxq->stats.errors++;
+			break;
+		}
+
+		desc = &rxq->desc_ring[rxq->desc_ring_tail];
+		rxq->gdma_rq.tail += desc->wqe_size_in_bu;
+		mbuf = desc->pkt;
+
+		switch (oob->cqe_hdr.cqe_type) {
+		case CQE_RX_OKAY:
+			/* Proceed to process mbuf */
+			break;
+
+		case CQE_RX_TRUNCATED:
+			DRV_LOG(ERR, "Drop a truncated packet");
+			rxq->stats.errors++;
+			rte_pktmbuf_free(mbuf);
+			goto drop;
+
+		case CQE_RX_COALESCED_4:
+			DRV_LOG(ERR, "RX coalescing is not supported");
+			continue;
+
+		default:
+			DRV_LOG(ERR, "Unknown RX CQE type %d",
+				oob->cqe_hdr.cqe_type);
+			continue;
+		}
+
+		DRV_LOG(DEBUG, "mana_rx_comp_oob CQE_RX_OKAY rxq %p", rxq);
+
+		mbuf->data_off = RTE_PKTMBUF_HEADROOM;
+		mbuf->nb_segs = 1;
+		mbuf->next = NULL;
+		mbuf->pkt_len = oob->packet_info[0].packet_length;
+		mbuf->data_len = oob->packet_info[0].packet_length;
+		mbuf->port = priv->port_id;
+
+		if (oob->rx_ip_header_checksum_succeeded)
+			mbuf->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;
+
+		if (oob->rx_ip_header_checksum_failed)
+			mbuf->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
+
+		if (oob->rx_outer_ip_header_checksum_failed)
+			mbuf->ol_flags |= RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD;
+
+		if (oob->rx_tcp_checksum_succeeded ||
+		    oob->rx_udp_checksum_succeeded)
+			mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
+
+		if (oob->rx_tcp_checksum_failed ||
+		    oob->rx_udp_checksum_failed)
+			mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_BAD;
+
+		if (oob->rx_hash_type == MANA_HASH_L3 ||
+		    oob->rx_hash_type == MANA_HASH_L4) {
+			mbuf->ol_flags |= RTE_MBUF_F_RX_RSS_HASH;
+			mbuf->hash.rss = oob->packet_info[0].packet_hash;
+		}
+
+		pkts[pkt_received++] = mbuf;
+		rxq->stats.packets++;
+		rxq->stats.bytes += mbuf->data_len;
+
+drop:
+		rxq->desc_ring_tail++;
+		if (rxq->desc_ring_tail >= rxq->num_desc)
+			rxq->desc_ring_tail = 0;
+
+		cqe_processed++;
+
+		/* Post another request */
+		ret = mana_alloc_and_post_rx_wqe(rxq);
+		if (ret) {
+			DRV_LOG(ERR, "failed to post rx wqe ret=%d", ret);
+			break;
+		}
+	}
+
+	if (cqe_processed)
+		mana_rq_ring_doorbell(rxq);
+
+	return pkt_received;
+}
-- 
2.17.1
- * [Patch v3 15/17] net/mana: add function to send packets
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (13 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 14/17] net/mana: add function to receive packets longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 16/17] net/mana: add function to start/stop device longli
  2022-07-07 20:30 ` [Patch v3 17/17] net/mana: add function to report queue stats longli
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
With all the TX queues created, MANA can send packets over those queues.
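For illustration, a hypothetical send-side sketch (not part of this patch): the mbuf offload flags and l2_len/l3_len offsets set below are what mana_tx_burst() translates into the transmit short OOB and the partial pseudo-header checksums.

/* Hypothetical send sketch, not part of this patch. */
#include <rte_ethdev.h>
#include <rte_mbuf.h>
#include <rte_ether.h>
#include <rte_ip.h>

static uint16_t send_tcp_ipv4(uint16_t port_id, uint16_t queue_id,
			      struct rte_mbuf *m)
{
	/* Header offsets let the PMD locate the TCP header for the
	 * pseudo-header checksum it fills in before posting the WQE.
	 */
	m->l2_len = sizeof(struct rte_ether_hdr);
	m->l3_len = sizeof(struct rte_ipv4_hdr);
	m->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
		       RTE_MBUF_F_TX_TCP_CKSUM;

	/* Dispatches to mana_tx_burst(); returns the number of packets queued */
	return rte_eth_tx_burst(port_id, queue_id, &m, 1);
}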
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Rename all camel-case identifiers.
 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.c           |   1 +
 drivers/net/mana/mana.h           |  65 ++++++++
 drivers/net/mana/mp.c             |   1 +
 drivers/net/mana/tx.c             | 241 ++++++++++++++++++++++++++++++
 5 files changed, 309 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index fdbf22d335..7922816d66 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Free Tx mbuf on demand = Y
 Link status          = P
 Linux                = Y
 L3 checksum offload  = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index d255f79a87..ca81dce669 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,7 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
 				/* fd is not used after mapping doorbell */
 				close(fd);
 
+				eth_dev->tx_pkt_burst = mana_tx_burst;
 				eth_dev->rx_pkt_burst = mana_rx_burst;
 
 				rte_spinlock_lock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index bafc4d6082..b4056bd50b 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -62,6 +62,47 @@ struct mana_shared_data {
 
 #define NOT_USING_CLIENT_DATA_UNIT 0
 
+enum tx_packet_format_v2 {
+	short_packet_format = 0,
+	long_packet_format = 1
+};
+
+struct transmit_short_oob_v2 {
+	enum tx_packet_format_v2 packet_format : 2;
+	uint32_t tx_is_outer_ipv4 : 1;
+	uint32_t tx_is_outer_ipv6 : 1;
+	uint32_t tx_compute_IP_header_checksum : 1;
+	uint32_t tx_compute_TCP_checksum : 1;
+	uint32_t tx_compute_UDP_checksum : 1;
+	uint32_t suppress_tx_CQE_generation : 1;
+	uint32_t VCQ_number : 24;
+	uint32_t tx_transport_header_offset : 10;
+	uint32_t VSQ_frame_num : 14;
+	uint32_t short_vport_offset : 8;
+};
+
+struct transmit_long_oob_v2 {
+	uint32_t tx_is_encapsulated_packet : 1;
+	uint32_t tx_inner_is_ipv6 : 1;
+	uint32_t tx_inner_TCP_options_present : 1;
+	uint32_t inject_vlan_prior_tag : 1;
+	uint32_t reserved1 : 12;
+	uint32_t priority_code_point : 3;
+	uint32_t drop_eligible_indicator : 1;
+	uint32_t vlan_identifier : 12;
+	uint32_t tx_inner_frame_offset : 10;
+	uint32_t tx_inner_IP_header_relative_offset : 6;
+	uint32_t long_vport_offset : 12;
+	uint32_t reserved3 : 4;
+	uint32_t reserved4 : 32;
+	uint32_t reserved5 : 32;
+};
+
+struct transmit_oob_v2 {
+	struct transmit_short_oob_v2 short_oob;
+	struct transmit_long_oob_v2 long_oob;
+};
+
 enum gdma_queue_types {
 	gdma_queue_type_invalid = 0,
 	gdma_queue_send,
@@ -183,6 +224,17 @@ enum mana_cqe_type {
 	CQE_RX_COALESCED_4              = 2,
 	CQE_RX_OBJECT_FENCE             = 3,
 	CQE_RX_TRUNCATED                = 4,
+
+	CQE_TX_OKAY                     = 32,
+	CQE_TX_SA_DROP                  = 33,
+	CQE_TX_MTU_DROP                 = 34,
+	CQE_TX_INVALID_OOB              = 35,
+	CQE_TX_INVALID_ETH_TYPE         = 36,
+	CQE_TX_HDR_PROCESSING_ERROR     = 37,
+	CQE_TX_VF_DISABLED              = 38,
+	CQE_TX_VPORT_IDX_OUT_OF_RANGE   = 39,
+	CQE_TX_VPORT_DISABLED           = 40,
+	CQE_TX_VLAN_TAGGING_VIOLATION   = 41,
 };
 
 struct mana_cqe_header {
@@ -191,6 +243,17 @@ struct mana_cqe_header {
 	uint32_t vendor_err  : 24;
 }; /* HW DATA */
 
+struct mana_tx_comp_oob {
+	struct mana_cqe_header cqe_hdr;
+
+	uint32_t tx_data_offset;
+
+	uint32_t tx_sgl_offset       : 5;
+	uint32_t tx_wqe_offset       : 27;
+
+	uint32_t reserved[12];
+}; /* HW DATA */
+
 /* NDIS HASH Types */
 #define BIT(nr)		(1 << (nr))
 #define NDIS_HASH_IPV4          BIT(0)
@@ -407,6 +470,8 @@ uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue);
 
 uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
 		       uint16_t pkts_n);
+uint16_t mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts,
+		       uint16_t pkts_n);
 
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 36a88c561a..da9c0f36a1 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,7 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
 	case MANA_MP_REQ_START_RXTX:
 		DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
 
+		dev->tx_pkt_burst = mana_tx_burst;
 		dev->rx_pkt_burst = mana_rx_burst;
 
 		rte_mb();
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
index db7859c8c4..26340311c9 100644
--- a/drivers/net/mana/tx.c
+++ b/drivers/net/mana/tx.c
@@ -155,3 +155,244 @@ static inline uint16_t get_vsq_frame_num(uint32_t vsq)
 	v.gdma_txq_id = vsq;
 	return v.vsq_frame;
 }
+
+uint16_t mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts,
+		       uint16_t nb_pkts)
+{
+	struct mana_txq *txq = dpdk_txq;
+	struct mana_priv *priv = txq->priv;
+	struct gdma_comp comp;
+	int ret;
+	void *db_page;
+
+	/* Process send completions from GDMA */
+	while (gdma_poll_completion_queue(&txq->gdma_cq, &comp) == 1) {
+		struct mana_txq_desc *desc =
+			&txq->desc_ring[txq->desc_ring_tail];
+		struct mana_tx_comp_oob *oob =
+			(struct mana_tx_comp_oob *)&comp.completion_data[0];
+
+		if (oob->cqe_hdr.cqe_type != CQE_TX_OKAY) {
+			DRV_LOG(ERR,
+				"mana_tx_comp_oob cqe_type %u vendor_err %u",
+				oob->cqe_hdr.cqe_type, oob->cqe_hdr.vendor_err);
+			txq->stats.errors++;
+		} else {
+			DRV_LOG(DEBUG, "mana_tx_comp_oob CQE_TX_OKAY");
+			txq->stats.packets++;
+		}
+
+		if (!desc->pkt) {
+			DRV_LOG(ERR, "mana_txq_desc has a NULL pkt");
+		} else {
+			txq->stats.bytes += desc->pkt->data_len;
+			rte_pktmbuf_free(desc->pkt);
+		}
+
+		desc->pkt = NULL;
+		txq->desc_ring_tail = (txq->desc_ring_tail + 1) % txq->num_desc;
+		txq->gdma_sq.tail += desc->wqe_size_in_bu;
+	}
+
+	/* Post send requests to GDMA */
+	uint16_t pkt_idx;
+
+	for (pkt_idx = 0; pkt_idx < nb_pkts; pkt_idx++) {
+		struct rte_mbuf *m_pkt = tx_pkts[pkt_idx];
+		struct rte_mbuf *m_seg = m_pkt;
+		struct transmit_oob_v2 tx_oob = {0};
+		struct one_sgl sgl = {0};
+
+		/* Drop the packet if it exceeds max segments */
+		if (m_pkt->nb_segs > priv->max_send_sge) {
+			DRV_LOG(ERR, "send packet segments %d exceeding max",
+				m_pkt->nb_segs);
+			continue;
+		}
+
+		/* Fill in the oob */
+		tx_oob.short_oob.packet_format = short_packet_format;
+		tx_oob.short_oob.tx_is_outer_ipv4 =
+			m_pkt->ol_flags & RTE_MBUF_F_TX_IPV4 ? 1 : 0;
+		tx_oob.short_oob.tx_is_outer_ipv6 =
+			m_pkt->ol_flags & RTE_MBUF_F_TX_IPV6 ? 1 : 0;
+
+		tx_oob.short_oob.tx_compute_IP_header_checksum =
+			m_pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM ? 1 : 0;
+
+		if ((m_pkt->ol_flags & RTE_MBUF_F_TX_L4_MASK) ==
+				RTE_MBUF_F_TX_TCP_CKSUM) {
+			struct rte_tcp_hdr *tcp_hdr;
+
+			/* HW needs partial TCP checksum */
+
+			tcp_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+					  struct rte_tcp_hdr *,
+					  m_pkt->l2_len + m_pkt->l3_len);
+
+			if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV4) {
+				struct rte_ipv4_hdr *ip_hdr;
+
+				ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+						struct rte_ipv4_hdr *,
+						m_pkt->l2_len);
+				tcp_hdr->cksum = rte_ipv4_phdr_cksum(ip_hdr,
+							m_pkt->ol_flags);
+
+			} else if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV6) {
+				struct rte_ipv6_hdr *ip_hdr;
+
+				ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+						struct rte_ipv6_hdr *,
+						m_pkt->l2_len);
+				tcp_hdr->cksum = rte_ipv6_phdr_cksum(ip_hdr,
+							m_pkt->ol_flags);
+			} else {
+				DRV_LOG(ERR, "Invalid input for TCP CKSUM");
+			}
+
+			tx_oob.short_oob.tx_compute_TCP_checksum = 1;
+			tx_oob.short_oob.tx_transport_header_offset =
+				m_pkt->l2_len + m_pkt->l3_len;
+		}
+
+		if ((m_pkt->ol_flags & RTE_MBUF_F_TX_L4_MASK) ==
+				RTE_MBUF_F_TX_UDP_CKSUM) {
+			struct rte_udp_hdr *udp_hdr;
+
+			/* HW needs partial UDP checksum */
+			udp_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+					struct rte_udp_hdr *,
+					m_pkt->l2_len + m_pkt->l3_len);
+
+			if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV4) {
+				struct rte_ipv4_hdr *ip_hdr;
+
+				ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+						struct rte_ipv4_hdr *,
+						m_pkt->l2_len);
+
+				udp_hdr->dgram_cksum =
+					rte_ipv4_phdr_cksum(ip_hdr,
+							    m_pkt->ol_flags);
+
+			} else if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV6) {
+				struct rte_ipv6_hdr *ip_hdr;
+
+				ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
+						struct rte_ipv6_hdr *,
+						m_pkt->l2_len);
+
+				udp_hdr->dgram_cksum =
+					rte_ipv6_phdr_cksum(ip_hdr,
+							    m_pkt->ol_flags);
+
+			} else {
+				DRV_LOG(ERR, "Invalid input for UDP CKSUM");
+			}
+
+			tx_oob.short_oob.tx_compute_UDP_checksum = 1;
+		}
+
+		tx_oob.short_oob.suppress_tx_CQE_generation = 0;
+		tx_oob.short_oob.VCQ_number = txq->gdma_cq.id;
+
+		tx_oob.short_oob.VSQ_frame_num =
+			get_vsq_frame_num(txq->gdma_sq.id);
+		tx_oob.short_oob.short_vport_offset = txq->tx_vp_offset;
+
+		DRV_LOG(DEBUG, "tx_oob packet_format %u ipv4 %u ipv6 %u",
+			tx_oob.short_oob.packet_format,
+			tx_oob.short_oob.tx_is_outer_ipv4,
+			tx_oob.short_oob.tx_is_outer_ipv6);
+
+		DRV_LOG(DEBUG, "tx_oob checksum ip %u tcp %u udp %u offset %u",
+			tx_oob.short_oob.tx_compute_IP_header_checksum,
+			tx_oob.short_oob.tx_compute_TCP_checksum,
+			tx_oob.short_oob.tx_compute_UDP_checksum,
+			tx_oob.short_oob.tx_transport_header_offset);
+
+		DRV_LOG(DEBUG, "pkt[%d]: buf_addr 0x%p, nb_segs %d, pkt_len %d",
+			pkt_idx, m_pkt->buf_addr, m_pkt->nb_segs,
+			m_pkt->pkt_len);
+
+		/* Create SGL for packet data buffers */
+		for (uint16_t seg_idx = 0; seg_idx < m_pkt->nb_segs; seg_idx++) {
+			struct mana_mr_cache *mr =
+				mana_find_pmd_mr(&txq->mr_btree, priv, m_seg);
+
+			if (!mr) {
+				DRV_LOG(ERR, "failed to get MR, pkt_idx %u",
+					pkt_idx);
+				return pkt_idx;
+			}
+
+			sgl.gdma_sgl[seg_idx].address =
+				rte_cpu_to_le_64(rte_pktmbuf_mtod(m_seg,
+								  uint64_t));
+			sgl.gdma_sgl[seg_idx].size = m_seg->data_len;
+			sgl.gdma_sgl[seg_idx].memory_key = mr->lkey;
+
+			DRV_LOG(DEBUG,
+				"seg idx %u addr 0x%" PRIx64 " size %x key %x",
+				seg_idx, sgl.gdma_sgl[seg_idx].address,
+				sgl.gdma_sgl[seg_idx].size,
+				sgl.gdma_sgl[seg_idx].memory_key);
+
+			m_seg = m_seg->next;
+		}
+
+		struct gdma_work_request work_req = {0};
+		struct gdma_posted_wqe_info wqe_info = {0};
+
+		work_req.gdma_header.struct_size = sizeof(work_req);
+		wqe_info.gdma_header.struct_size = sizeof(wqe_info);
+
+		work_req.sgl = sgl.gdma_sgl;
+		work_req.num_sgl_elements = m_pkt->nb_segs;
+		work_req.inline_oob_size_in_bytes =
+			sizeof(struct transmit_short_oob_v2);
+		work_req.inline_oob_data = &tx_oob;
+		work_req.flags = 0;
+		work_req.client_data_unit = NOT_USING_CLIENT_DATA_UNIT;
+
+		ret = gdma_post_work_request(&txq->gdma_sq, &work_req,
+					     &wqe_info);
+		if (!ret) {
+			struct mana_txq_desc *desc =
+				&txq->desc_ring[txq->desc_ring_head];
+
+			/* Update queue for tracking pending requests */
+			desc->pkt = m_pkt;
+			desc->wqe_size_in_bu = wqe_info.wqe_size_in_bu;
+			txq->desc_ring_head =
+				(txq->desc_ring_head + 1) % txq->num_desc;
+
+			DRV_LOG(DEBUG, "nb_pkts %u pkt[%d] sent",
+				nb_pkts, pkt_idx);
+		} else {
+			DRV_LOG(INFO, "pkt[%d] failed to post send ret %d",
+				pkt_idx, ret);
+			break;
+		}
+	}
+
+	/* Ring hardware door bell */
+	db_page = priv->db_page;
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct rte_eth_dev *dev =
+			&rte_eth_devices[priv->dev_data->port_id];
+		struct mana_process_priv *process_priv = dev->process_private;
+
+		db_page = process_priv->db_page;
+	}
+
+	ret = mana_ring_doorbell(db_page, gdma_queue_send,
+				 txq->gdma_sq.id,
+				 txq->gdma_sq.head *
+					GDMA_WQE_ALIGNMENT_UNIT_SIZE);
+	if (ret)
+		DRV_LOG(ERR, "mana_ring_doorbell failed ret %d", ret);
+
+	return pkt_idx;
+}
-- 
2.17.1
- * [Patch v3 16/17] net/mana: add function to start/stop device
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (14 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 15/17] net/mana: add function to send packets longli
@ 2022-07-07 20:30 ` longli
  2022-07-07 20:30 ` [Patch v3 17/17] net/mana: add function to report queue stats longli
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
Add support for starting/stopping the device.
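For illustration, a hypothetical lifecycle sketch (not part of this patch) of how these new ops are reached through the ethdev API:

/* Hypothetical lifecycle sketch, not part of this patch. */
#include <rte_ethdev.h>

static int restart_port(uint16_t port_id)
{
	int ret;

	/* Invokes mana_dev_stop(): swaps in the _removed burst stubs,
	 * notifies secondary processes, then tears down TX/RX queues.
	 */
	ret = rte_eth_dev_stop(port_id);
	if (ret)
		return ret;

	/* Invokes mana_dev_start(): initializes the MR btree, creates
	 * TX/RX queues, installs the burst functions and tells secondary
	 * processes to start their datapath.
	 */
	return rte_eth_dev_start(port_id);
}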
Signed-off-by: Long Li <longli@microsoft.com>
---
Change log:
v2:
Use spinlock for memory registration cache.
Add prefix mana_ to all function names.
 drivers/net/mana/mana.c | 70 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index ca81dce669..266fcd56d6 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -97,6 +97,74 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
 
 static int mana_intr_uninstall(struct mana_priv *priv);
 
+static int
+mana_dev_start(struct rte_eth_dev *dev)
+{
+	int ret;
+	struct mana_priv *priv = dev->data->dev_private;
+
+	rte_spinlock_init(&priv->mr_btree_lock);
+	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
+				 dev->device->numa_node);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
+		return ret;
+	}
+
+	ret = mana_start_tx_queues(dev);
+	if (ret) {
+		DRV_LOG(ERR, "failed to start tx queues %d", ret);
+		return ret;
+	}
+
+	ret = mana_start_rx_queues(dev);
+	if (ret) {
+		DRV_LOG(ERR, "failed to start rx queues %d", ret);
+		mana_stop_tx_queues(dev);
+		return ret;
+	}
+
+	rte_wmb();
+
+	dev->tx_pkt_burst = mana_tx_burst;
+	dev->rx_pkt_burst = mana_rx_burst;
+
+	DRV_LOG(INFO, "TX/RX queues have started");
+
+	/* Enable datapath for secondary processes */
+	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+
+	return 0;
+}
+
+static int
+mana_dev_stop(struct rte_eth_dev *dev)
+{
+	int ret;
+
+	dev->tx_pkt_burst = mana_tx_burst_removed;
+	dev->rx_pkt_burst = mana_rx_burst_removed;
+
+	/* Stop datapath on secondary processes */
+	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+
+	rte_wmb();
+
+	ret = mana_stop_tx_queues(dev);
+	if (ret) {
+		DRV_LOG(ERR, "failed to stop tx queues");
+		return ret;
+	}
+
+	ret = mana_stop_rx_queues(dev);
+	if (ret) {
+		DRV_LOG(ERR, "failed to stop rx queues");
+		return ret;
+	}
+
+	return 0;
+}
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
@@ -435,6 +503,8 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 
 const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
+	.dev_start		= mana_dev_start,
+	.dev_stop		= mana_dev_stop,
 	.dev_close		= mana_dev_close,
 	.dev_infos_get		= mana_dev_info_get,
 	.txq_info_get		= mana_dev_tx_queue_info,
-- 
2.17.1
- * [Patch v3 17/17] net/mana: add function to report queue stats
  2022-07-07 20:30 [Patch v3 00/17] Introduce Microsoft Azure Network Adatper (MANA) PMD longli
                   ` (15 preceding siblings ...)
  2022-07-07 20:30 ` [Patch v3 16/17] net/mana: add function to start/stop device longli
@ 2022-07-07 20:30 ` longli
  16 siblings, 0 replies; 23+ messages in thread
From: longli @ 2022-07-07 20:30 UTC (permalink / raw)
  To: Ferruh Yigit; +Cc: dev, Ajay Sharma, Stephen Hemminger, Long Li
From: Long Li <longli@microsoft.com>
Report packet statistics.
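For illustration, a hypothetical sketch (not part of this patch) of reading these counters through the ethdev stats API:

/* Hypothetical stats sketch, not part of this patch. */
#include <inttypes.h>
#include <stdio.h>
#include <rte_ethdev.h>

static void dump_port_stats(uint16_t port_id)
{
	struct rte_eth_stats stats;

	if (rte_eth_stats_get(port_id, &stats) == 0) {
		printf("rx: %" PRIu64 " pkts, %" PRIu64 " bytes, nombuf %" PRIu64 "\n",
		       stats.ipackets, stats.ibytes, stats.rx_nombuf);
		printf("tx: %" PRIu64 " pkts, %" PRIu64 " bytes, errors %" PRIu64 "\n",
		       stats.opackets, stats.obytes, stats.oerrors);
		/* Per-queue counters are in q_ipackets[]/q_opackets[] for the
		 * first RTE_ETHDEV_QUEUE_STAT_CNTRS queues.
		 */
	}

	rte_eth_stats_reset(port_id);	/* calls mana_dev_stats_reset() */
}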
Signed-off-by: Long Li <longli@microsoft.com>
---
 doc/guides/nics/features/mana.ini |  2 +
 drivers/net/mana/mana.c           | 77 +++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 7922816d66..b2729aba3a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Basic stats          = Y
 Free Tx mbuf on demand = Y
 Link status          = P
 Linux                = Y
@@ -14,5 +15,6 @@ Queue start/stop     = Y
 Removal event        = Y
 RSS hash             = Y
 Speed capabilities   = P
+Stats per queue      = Y
 Usage doc            = Y
 x86-64               = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 266fcd56d6..bbcd04794d 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -501,6 +501,79 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 	return rte_eth_linkstatus_set(dev, &link);
 }
 
+static int mana_dev_stats_get(struct rte_eth_dev *dev,
+			      struct rte_eth_stats *stats)
+{
+	unsigned int i;
+
+	for (i = 0; i < dev->data->nb_tx_queues; i++) {
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (!txq)
+			continue;
+
+		stats->opackets += txq->stats.packets;
+		stats->obytes += txq->stats.bytes;
+		stats->oerrors += txq->stats.errors;
+
+		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			stats->q_opackets[i] = txq->stats.packets;
+			stats->q_obytes[i] = txq->stats.bytes;
+		}
+	}
+
+	stats->rx_nombuf = 0;
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		if (!rxq)
+			continue;
+
+		stats->ipackets += rxq->stats.packets;
+		stats->ibytes += rxq->stats.bytes;
+		stats->ierrors += rxq->stats.errors;
+
+		/* There is no good way to get stats->imissed, not setting it */
+
+		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+			stats->q_ipackets[i] = rxq->stats.packets;
+			stats->q_ibytes[i] = rxq->stats.bytes;
+		}
+
+		stats->rx_nombuf += rxq->stats.nombuf;
+	}
+
+	return 0;
+}
+
+static int
+mana_dev_stats_reset(struct rte_eth_dev *dev)
+{
+	unsigned int i;
+
+	PMD_INIT_FUNC_TRACE();
+
+	for (i = 0; i < dev->data->nb_tx_queues; i++) {
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (!txq)
+			continue;
+
+		memset(&txq->stats, 0, sizeof(txq->stats));
+	}
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+		if (!rxq)
+			continue;
+
+		memset(&rxq->stats, 0, sizeof(rxq->stats));
+	}
+
+	return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
 	.dev_configure		= mana_dev_configure,
 	.dev_start		= mana_dev_start,
@@ -517,9 +590,13 @@ const struct eth_dev_ops mana_dev_ops = {
 	.rx_queue_setup		= mana_dev_rx_queue_setup,
 	.rx_queue_release	= mana_dev_rx_queue_release,
 	.link_update		= mana_dev_link_update,
+	.stats_get		= mana_dev_stats_get,
+	.stats_reset		= mana_dev_stats_reset,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+	.stats_get = mana_dev_stats_get,
+	.stats_reset = mana_dev_stats_reset,
 	.dev_infos_get = mana_dev_info_get,
 };
 
-- 
2.17.1