DPDK patches and discussions
 help / color / mirror / Atom feed
From: Yongseok Koh <yskoh@mellanox.com>
To: shahafs@mellanox.com
Cc: dev@dpdk.org
Subject: [dpdk-dev] [PATCH v3 3/3] net/mlx4: add secondary process support
Date: Mon,  1 Apr 2019 14:15:53 -0700	[thread overview]
Message-ID: <20190401211553.26063-4-yskoh@mellanox.com> (raw)
Message-ID: <20190401211553.SijbzDAUa8mhmpHtOaiWgxVnirUaRTQiRhONcsz_K1k@z> (raw)
In-Reply-To: <20190401211553.26063-1-yskoh@mellanox.com>

In order to support secondary process, a few features are required.

a) rdma-core library should allocate device resources using DPDK's memory
   allocator.

b) UAR should be remapped for secondary processes. Currently, in order not
   to use different data structure for secondary processes, PMD tries to
   reserve identical virtual address space for both primary and secondary
   processes.

c) IPC channel is necessary, which can be easily set with rte_mp APIs.
   Through the channel, Verbs command FD is delivered to the secondary
   process and the device stop/start event is also broadcast from primary
   process.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |  10 +
 drivers/net/mlx4/Makefile         |   6 +
 drivers/net/mlx4/meson.build      |   3 +
 drivers/net/mlx4/mlx4.c           | 384 ++++++++++++++++++++++++++++++++++++--
 drivers/net/mlx4/mlx4.h           |  61 ++++++
 drivers/net/mlx4/mlx4_mp.c        | 304 ++++++++++++++++++++++++++++++
 drivers/net/mlx4/mlx4_mr.c        |  32 +++-
 drivers/net/mlx4/mlx4_prm.h       |   4 +-
 drivers/net/mlx4/mlx4_rxtx.c      |   2 +
 drivers/net/mlx4/mlx4_rxtx.h      |   1 +
 drivers/net/mlx4/mlx4_txq.c       | 113 +++++++++++
 12 files changed, 898 insertions(+), 23 deletions(-)
 create mode 100644 drivers/net/mlx4/mlx4_mp.c

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index a211aef332..4502aa2a87 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -29,6 +29,7 @@ Packet type parsing  = Y
 Basic stats          = Y
 Stats per queue      = Y
 FW version           = Y
+Multiprocess aware   = Y
 Other kdrv           = Y
 Power8               = Y
 x86-32               = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 4ad361a2c2..cd34838f41 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -145,6 +145,16 @@ below.
 Limitations
 -----------
 
+- For secondary process:
+
+  - Forked secondary process not supported.
+  - All mempools must be initialized before rte_eth_dev_start().
+  - External memory unregistered in EAL memseg list cannot be used for DMA
+    unless such memory has been registered by ``mlx4_mr_update_ext_mp()`` in
+    primary process and remapped to the same virtual address in secondary
+    process. If the external memory is registered by primary process but has
+    different virtual address in secondary process, unexpected error may happen.
+
 - CRC stripping is supported by default and always reported as "true".
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index b527efd625..8126b0dfc6 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -18,6 +18,7 @@ ifneq ($(CONFIG_RTE_IBVERBS_LINK_DLOPEN),y)
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_glue.c
 endif
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_intr.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mp.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_mr.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxq.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += mlx4_rxtx.c
@@ -93,6 +94,11 @@ mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_UAR_MMAP_OFFSET \
+		infiniband/mlx4dv.h \
+		enum MLX4DV_QP_MASK_UAR_MMAP_OFFSET \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_IBV_MLX4_WQE_LSO_SEG \
 		infiniband/mlx4dv.h \
 		type 'struct mlx4_wqe_lso_seg' \
diff --git a/drivers/net/mlx4/meson.build b/drivers/net/mlx4/meson.build
index 650e2c8fbc..de020701d1 100644
--- a/drivers/net/mlx4/meson.build
+++ b/drivers/net/mlx4/meson.build
@@ -33,6 +33,7 @@ if build
 		'mlx4_ethdev.c',
 		'mlx4_flow.c',
 		'mlx4_intr.c',
+		'mlx4_mp.c',
 		'mlx4_mr.c',
 		'mlx4_rxq.c',
 		'mlx4_rxtx.c',
@@ -76,6 +77,8 @@ if build
 	has_sym_args = [
 		[ 'HAVE_IBV_MLX4_BUF_ALLOCATORS', 'infiniband/mlx4dv.h',
 		'MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS' ],
+		[ 'HAVE_IBV_MLX4_UAR_MMAP_OFFSET', 'infiniband/mlx4dv.h',
+		'MLX4DV_QP_MASK_UAR_MMAP_OFFSET' ],
 	]
 	config = configuration_data()
 	foreach arg:has_sym_args
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index 0e0b035df0..315640a6d7 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -17,6 +17,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <unistd.h>
 
 /* Verbs headers do not support -pedantic. */
@@ -48,10 +49,16 @@
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
 
-struct mlx4_dev_list mlx4_mem_event_cb_list =
-	LIST_HEAD_INITIALIZER(mlx4_mem_event_cb_list);
+static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data";
 
-rte_rwlock_t mlx4_mem_event_rwlock = RTE_RWLOCK_INITIALIZER;
+/* Shared memory between primary and secondary processes. */
+struct mlx4_shared_data *mlx4_shared_data;
+
+/* Spinlock for mlx4_shared_data allocation. */
+static rte_spinlock_t mlx4_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx4_local_data mlx4_local_data;
 
 /** Configuration structure for device arguments. */
 struct mlx4_conf {
@@ -69,6 +76,77 @@ const char *pmd_mlx4_init_params[] = {
 
 static void mlx4_dev_stop(struct rte_eth_dev *dev);
 
+/**
+ * Initialize shared data between primary and secondary process.
+ *
+ * A memzone is reserved by primary process and secondary processes attach to
+ * the memzone.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_shared_data(void)
+{
+	const struct rte_memzone *mz;
+	int ret = 0;
+
+	rte_spinlock_lock(&mlx4_shared_data_lock);
+	if (mlx4_shared_data == NULL) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			/* Allocate shared memory. */
+			mz = rte_memzone_reserve(MZ_MLX4_PMD_SHARED_DATA,
+						 sizeof(*mlx4_shared_data),
+						 SOCKET_ID_ANY, 0);
+			if (mz == NULL) {
+				ERROR("Cannot allocate mlx4 shared data\n");
+				ret = -rte_errno;
+				goto error;
+			}
+			mlx4_shared_data = mz->addr;
+			memset(mlx4_shared_data, 0, sizeof(*mlx4_shared_data));
+			rte_spinlock_init(&mlx4_shared_data->lock);
+		} else {
+			/* Lookup allocated shared memory. */
+			mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+			if (mz == NULL) {
+				ERROR("Cannot attach mlx4 shared data\n");
+				ret = -rte_errno;
+				goto error;
+			}
+			mlx4_shared_data = mz->addr;
+			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+		}
+	}
+error:
+	rte_spinlock_unlock(&mlx4_shared_data_lock);
+	return ret;
+}
+
+/**
+ * Uninitialize shared data between primary and secondary process.
+ *
+ * A secondary process only clears its local data whereas the primary process
+ * frees the memzone. The shared data pointer is reset to NULL in either case.
+ */
+static void
+mlx4_uninit_shared_data(void)
+{
+	const struct rte_memzone *mz;
+
+	rte_spinlock_lock(&mlx4_shared_data_lock);
+	if (mlx4_shared_data) {
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+			mz = rte_memzone_lookup(MZ_MLX4_PMD_SHARED_DATA);
+			rte_memzone_free(mz);
+		} else {
+			memset(&mlx4_local_data, 0, sizeof(mlx4_local_data));
+		}
+		mlx4_shared_data = NULL;
+	}
+	rte_spinlock_unlock(&mlx4_shared_data_lock);
+}
+
 #ifdef HAVE_IBV_MLX4_BUF_ALLOCATORS
 /**
  * Verbs callback to allocate a memory. This function should allocate the space
@@ -181,6 +259,11 @@ mlx4_dev_start(struct rte_eth_dev *dev)
 		return 0;
 	DEBUG("%p: attaching configured flows to all RX queues", (void *)dev);
 	priv->started = 1;
+	ret = mlx4_tx_uar_remap(dev, priv->ctx->cmd_fd);
+	if (ret) {
+		ERROR("%p: cannot remap UAR", (void *)dev);
+		goto err;
+	}
 	ret = mlx4_rss_init(priv);
 	if (ret) {
 		ERROR("%p: cannot initialize RSS resources: %s",
@@ -208,6 +291,8 @@ mlx4_dev_start(struct rte_eth_dev *dev)
 	rte_wmb();
 	dev->tx_pkt_burst = mlx4_tx_burst;
 	dev->rx_pkt_burst = mlx4_rx_burst;
+	/* Enable datapath on secondary process. */
+	mlx4_mp_req_start_rxtx(dev);
 	return 0;
 err:
 	mlx4_dev_stop(dev);
@@ -226,6 +311,8 @@ static void
 mlx4_dev_stop(struct rte_eth_dev *dev)
 {
 	struct mlx4_priv *priv = dev->data->dev_private;
+	const size_t page_size = sysconf(_SC_PAGESIZE);
+	int i;
 
 	if (!priv->started)
 		return;
@@ -234,9 +321,20 @@ mlx4_dev_stop(struct rte_eth_dev *dev)
 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
 	rte_wmb();
+	/* Disable datapath on secondary process. */
+	mlx4_mp_req_stop_rxtx(dev);
 	mlx4_flow_sync(priv, NULL);
 	mlx4_rxq_intr_disable(priv);
 	mlx4_rss_deinit(priv);
+	for (i = 0; i != dev->data->nb_tx_queues; ++i) {
+		struct txq *txq;
+
+		txq = dev->data->tx_queues[i];
+		if (!txq)
+			continue;
+		munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->msq.db,
+					       page_size), page_size);
+	}
 }
 
 /**
@@ -259,6 +357,8 @@ mlx4_dev_close(struct rte_eth_dev *dev)
 	dev->rx_pkt_burst = mlx4_rx_burst_removed;
 	dev->tx_pkt_burst = mlx4_tx_burst_removed;
 	rte_wmb();
+	/* Disable datapath on secondary process. */
+	mlx4_mp_req_stop_rxtx(dev);
 	mlx4_flow_clean(priv);
 	mlx4_rss_deinit(priv);
 	for (i = 0; i != dev->data->nb_rx_queues; ++i)
@@ -310,6 +410,14 @@ static const struct eth_dev_ops mlx4_dev_ops = {
 	.is_removed = mlx4_is_removed,
 };
 
+/* Available operations from secondary process. */
+static const struct eth_dev_ops mlx4_dev_sec_ops = {
+	.stats_get = mlx4_stats_get,
+	.stats_reset = mlx4_stats_reset,
+	.fw_version_get = mlx4_fw_version_get,
+	.dev_infos_get = mlx4_dev_infos_get,
+};
+
 /**
  * Get PCI information from struct ibv_device.
  *
@@ -549,6 +657,200 @@ mlx4_hw_rss_sup(struct ibv_context *ctx, struct ibv_pd *pd,
 
 static struct rte_pci_driver mlx4_driver;
 
+static int
+find_lower_va_bound(const struct rte_memseg_list *msl,
+		const struct rte_memseg *ms, void *arg)
+{
+	void **addr = arg;
+
+	if (msl->external)
+		return 0;
+	if (*addr == NULL)
+		*addr = ms->addr;
+	else
+		*addr = RTE_MIN(*addr, ms->addr);
+
+	return 0;
+}
+
+/**
+ * Reserve UAR address space for primary process.
+ *
+ * The reserved base address is stored in shared data so that the range is
+ * reserved only once and secondary processes can reserve the same range. The
+ * space has to be available on both primary and secondary process, TXQ UAR
+ * maps to this area using fixed mmap w/o double check.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_primary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+	void *addr = (void *)0;
+
+	if (sd->uar_base)
+		return 0;
+	/* Find the lowest address among non-external memory segments. */
+	rte_memseg_walk(find_lower_va_bound, &addr);
+	/* Keep distance to hugepages to minimize potential conflicts. */
+	addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX4_UAR_OFFSET + MLX4_UAR_SIZE));
+	/* Anonymous PROT_NONE mmap, only reserves address space. */
+	addr = mmap(addr, MLX4_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("failed to reserve UAR address space, please"
+		      " adjust MLX4_UAR_SIZE or try --base-virtaddr");
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	/* Accept either same addr or a new addr returned from mmap if target
+	 * range occupied.
+	 */
+	INFO("reserved UAR address space: %p", addr);
+	sd->uar_base = addr; /* for primary and secondary UAR re-mmap. */
+	return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for primary process.
+ */
+static void
+mlx4_uar_uninit_primary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+
+	if (!sd->uar_base)
+		return;
+	munmap(sd->uar_base, MLX4_UAR_SIZE);
+	sd->uar_base = NULL;
+}
+
+/**
+ * Reserve UAR address space for secondary process, align with primary process.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_uar_init_secondary(void)
+{
+	struct mlx4_shared_data *sd = mlx4_shared_data;
+	struct mlx4_local_data *ld = &mlx4_local_data;
+	void *addr;
+
+	if (ld->uar_base) { /* Already reserved. */
+		assert(sd->uar_base == ld->uar_base);
+		return 0;
+	}
+	assert(sd->uar_base);
+	/* anonymous mmap, no real memory consumption. */
+	addr = mmap(sd->uar_base, MLX4_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("UAR mmap failed: %p size: %llu",
+		      sd->uar_base, MLX4_UAR_SIZE);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	if (sd->uar_base != addr) {
+		ERROR("UAR address %p size %llu occupied, please"
+		      " adjust MLX4_UAR_OFFSET or try EAL parameter"
+		      " --base-virtaddr",
+		      sd->uar_base, MLX4_UAR_SIZE);
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	ld->uar_base = addr;
+	INFO("reserved UAR address space: %p", addr);
+	return 0;
+}
+
+/**
+ * Unmap UAR address space reserved for secondary process.
+ */
+static void
+mlx4_uar_uninit_secondary(void)
+{
+	struct mlx4_local_data *ld = &mlx4_local_data;
+
+	if (!ld->uar_base)
+		return;
+	munmap(ld->uar_base, MLX4_UAR_SIZE);
+	ld->uar_base = NULL;
+}
+
+/**
+ * PMD global initialization.
+ *
+ * Independent from individual device, this function initializes global
+ * per-PMD data structures distinguishing primary and secondary processes.
+ * Hence, each initialization is called once per a process.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx4_init_once(void)
+{
+	struct mlx4_shared_data *sd;
+	struct mlx4_local_data *ld = &mlx4_local_data;
+	int ret;
+
+	if (mlx4_init_shared_data())
+		return -rte_errno;
+	sd = mlx4_shared_data;
+	assert(sd);
+	rte_spinlock_lock(&sd->lock);
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		if (sd->init_done)
+			break;
+		LIST_INIT(&sd->mem_event_cb_list);
+		rte_rwlock_init(&sd->mem_event_rwlock);
+		rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
+						mlx4_mr_mem_event_cb, NULL);
+		mlx4_mp_init_primary();
+		ret = mlx4_uar_init_primary();
+		if (ret)
+			goto error;
+		sd->init_done = true;
+		break;
+	case RTE_PROC_SECONDARY:
+		if (ld->init_done)
+			break;
+		mlx4_mp_init_secondary();
+		ret = mlx4_uar_init_secondary();
+		if (ret)
+			goto error;
+		++sd->secondary_cnt;
+		ld->init_done = true;
+		break;
+	default:
+		break;
+	}
+	rte_spinlock_unlock(&sd->lock);
+	return 0;
+error:
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		mlx4_uar_uninit_primary();
+		mlx4_mp_uninit_primary();
+		rte_mem_event_callback_unregister("MLX4_MEM_EVENT_CB", NULL);
+		break;
+	case RTE_PROC_SECONDARY:
+		mlx4_uar_uninit_secondary();
+		mlx4_mp_uninit_secondary();
+		break;
+	default:
+		break;
+	}
+	rte_spinlock_unlock(&sd->lock);
+	mlx4_uninit_shared_data();
+	return -rte_errno;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -579,6 +881,12 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 	int i;
 
 	(void)pci_drv;
+	err = mlx4_init_once();
+	if (err) {
+		ERROR("unable to init PMD global data: %s",
+		      strerror(rte_errno));
+		return -rte_errno;
+	}
 	assert(pci_drv == &mlx4_driver);
 	list = mlx4_glue->get_device_list(&i);
 	if (list == NULL) {
@@ -659,6 +967,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		struct mlx4_priv *priv = NULL;
 		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
+		char name[RTE_ETH_NAME_MAX_LEN];
 
 		/* If port is not enabled, skip. */
 		if (!(conf.ports.enabled & (1 << i)))
@@ -669,6 +978,51 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			err = ENODEV;
 			goto port_error;
 		}
+		snprintf(name, sizeof(name), "%s port %u",
+			 mlx4_glue->get_device_name(ibv_dev), port);
+		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+			eth_dev = rte_eth_dev_attach_secondary(name);
+			if (eth_dev == NULL) {
+				ERROR("can not attach rte ethdev");
+				rte_errno = ENOMEM;
+				err = rte_errno;
+				goto error;
+			}
+			priv = eth_dev->data->dev_private;
+			if (!priv->verbs_alloc_ctx.enabled) {
+				ERROR("secondary process is not supported"
+				      " due to lack of external allocator"
+				      " from Verbs");
+				rte_errno = ENOTSUP;
+				err = rte_errno;
+				goto error;
+			}
+			eth_dev->device = &pci_dev->device;
+			eth_dev->dev_ops = &mlx4_dev_sec_ops;
+			/* Receive command fd from primary process. */
+			err = mlx4_mp_req_verbs_cmd_fd(eth_dev);
+			if (err < 0) {
+				err = rte_errno;
+				goto error;
+			}
+			/* Remap UAR for Tx queues. */
+			err = mlx4_tx_uar_remap(eth_dev, err);
+			if (err) {
+				err = rte_errno;
+				goto error;
+			}
+			/*
+			 * Ethdev pointer is still required as input since
+			 * the primary device is not accessible from the
+			 * secondary process.
+			 */
+			eth_dev->tx_pkt_burst = mlx4_tx_burst;
+			eth_dev->rx_pkt_burst = mlx4_rx_burst;
+			claim_zero(mlx4_glue->close_device(ctx));
+			rte_eth_copy_pci_info(eth_dev, pci_dev);
+			rte_eth_dev_probing_finish(eth_dev);
+			continue;
+		}
 		/* Check port status. */
 		err = mlx4_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -774,14 +1128,7 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		/* Get actual MTU if possible. */
 		mlx4_mtu_get(priv, &priv->mtu);
 		DEBUG("port %u MTU is %u", priv->port, priv->mtu);
-		/* from rte_ethdev.c */
-		{
-			char name[RTE_ETH_NAME_MAX_LEN];
-
-			snprintf(name, sizeof(name), "%s port %u",
-				 mlx4_glue->get_device_name(ibv_dev), port);
-			eth_dev = rte_eth_dev_allocate(name);
-		}
+		eth_dev = rte_eth_dev_allocate(name);
 		if (eth_dev == NULL) {
 			err = ENOMEM;
 			ERROR("can not allocate rte ethdev");
@@ -818,9 +1165,13 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			.free = &mlx4_free_verbs_buf,
 			.data = priv,
 		};
-		mlx4_glue->dv_set_context_attr
+		err = mlx4_glue->dv_set_context_attr
 			(ctx, MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS,
 			 (void *)((uintptr_t)&alctr));
+		if (err)
+			WARN("Verbs external allocator is not supported");
+		else
+			priv->verbs_alloc_ctx.enabled = 1;
 #endif
 		/* Bring Ethernet device up. */
 		DEBUG("forcing Ethernet interface up");
@@ -842,9 +1193,10 @@ mlx4_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			goto port_error;
 		}
 		/* Add device to memory callback list. */
-		rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
-		LIST_INSERT_HEAD(&mlx4_mem_event_cb_list, priv, mem_event_cb);
-		rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+		rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
+		LIST_INSERT_HEAD(&mlx4_shared_data->mem_event_cb_list,
+				 priv, mem_event_cb);
+		rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
 		continue;
 port_error:
@@ -1075,8 +1427,6 @@ RTE_INIT(rte_mlx4_pmd_init)
 	}
 	mlx4_glue->fork_init();
 	rte_pci_register(&mlx4_driver);
-	rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
-					mlx4_mr_mem_event_cb, NULL);
 }
 
 RTE_PMD_EXPORT_NAME(net_mlx4, __COUNTER__);
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index d43e05ea74..1a7b1fb541 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -53,6 +53,16 @@
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
+/* Reserved address space for UAR mapping. */
+#define MLX4_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4))
+
+/* Offset of reserved UAR address space to hugepage memory. Offset is used here
+ * to minimize possibility of address next to hugepage being used by other code
+ * in either primary or secondary process, failing to map TX UAR would make TX
+ * packets invisible to HW.
+ */
+#define MLX4_UAR_OFFSET (2ULL << (sizeof(uintptr_t) * 4))
+
 enum {
 	PCI_VENDOR_ID_MELLANOX = 0x15b3,
 };
@@ -63,6 +73,26 @@ enum {
 	PCI_DEVICE_ID_MELLANOX_CONNECTX3PRO = 0x1007,
 };
 
+/* Request types for IPC. */
+enum mlx4_mp_req_type {
+	MLX4_MP_REQ_VERBS_CMD_FD = 1,
+	MLX4_MP_REQ_START_RXTX,
+	MLX4_MP_REQ_STOP_RXTX,
+};
+
+/* Parameters for IPC. */
+struct mlx4_mp_param {
+	enum mlx4_mp_req_type type;
+	int port_id;
+	int result;
+};
+
+/** Request timeout for IPC. */
+#define MLX4_MP_REQ_TIMEOUT_SEC 5
+
+/** Key string for IPC. */
+#define MLX4_MP_NAME "net_mlx4_mp"
+
 /** Driver name reported to lower layers and used in log output. */
 #define MLX4_DRIVER_NAME "net_mlx4"
 
@@ -86,6 +116,7 @@ enum mlx4_verbs_alloc_type {
  * resources it is allocating.
  */
 struct mlx4_verbs_alloc_ctx {
+	int enabled;
 	enum mlx4_verbs_alloc_type type; /* Kind of object being allocated. */
 	const void *obj; /* Pointer to the DPDK object. */
 };
@@ -93,6 +124,27 @@ struct mlx4_verbs_alloc_ctx {
 LIST_HEAD(mlx4_dev_list, mlx4_priv);
 LIST_HEAD(mlx4_mr_list, mlx4_mr);
 
+/* Shared data between primary and secondary processes. */
+struct mlx4_shared_data {
+	rte_spinlock_t lock;
+	/* Global spinlock for primary and secondary processes. */
+	int init_done; /* Whether primary has done initialization. */
+	unsigned int secondary_cnt; /* Number of secondary processes init'd. */
+	void *uar_base;
+	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+	struct mlx4_dev_list mem_event_cb_list;
+	rte_rwlock_t mem_event_rwlock;
+};
+
+/* Per-process data structure, not visible to other processes. */
+struct mlx4_local_data {
+	int init_done; /* Whether a secondary has done initialization. */
+	void *uar_base;
+	/* Reserved UAR address space for TXQ UAR(hw doorbell) mapping. */
+};
+
+extern struct mlx4_shared_data *mlx4_shared_data;
+
 /** Private data structure. */
 struct mlx4_priv {
 	LIST_ENTRY(mlx4_priv) mem_event_cb;
@@ -175,4 +227,13 @@ void mlx4_rxq_intr_disable(struct mlx4_priv *priv);
 int mlx4_rx_intr_disable(struct rte_eth_dev *dev, uint16_t idx);
 int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
 
+/* mlx4_mp.c */
+void mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev);
+void mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev);
+int mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
+void mlx4_mp_init_primary(void);
+void mlx4_mp_uninit_primary(void);
+void mlx4_mp_init_secondary(void);
+void mlx4_mp_uninit_secondary(void);
+
 #endif /* RTE_PMD_MLX4_H_ */
diff --git a/drivers/net/mlx4/mlx4_mp.c b/drivers/net/mlx4/mlx4_mp.c
new file mode 100644
index 0000000000..eaeb257348
--- /dev/null
+++ b/drivers/net/mlx4/mlx4_mp.c
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <rte_ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include "mlx4.h"
+#include "mlx4_rxtx.h"
+#include "mlx4_utils.h"
+
+/**
+ * Initialize IPC message.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param[out] msg
+ *   Pointer to message to fill in.
+ * @param[in] type
+ *   Message type.
+ */
+static inline void
+mp_init_msg(struct rte_eth_dev *dev, struct rte_mp_msg *msg,
+	    enum mlx4_mp_req_type type)
+{
+	struct mlx4_mp_param *param = (struct mlx4_mp_param *)msg->param;
+
+	memset(msg, 0, sizeof(*msg));
+	strlcpy(msg->name, MLX4_MP_NAME, sizeof(msg->name));
+	msg->len_param = sizeof(*param);
+	param->type = type;
+	param->port_id = dev->data->port_id;
+}
+
+/**
+ * IPC message handler of primary process.
+ *
+ * @param[in] mp_msg
+ *   Pointer to the received IPC request message.
+ * @param[in] peer
+ *   Pointer to the peer socket path.
+ *
+ * @return
+ *   0 on success, negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+	struct rte_mp_msg mp_res;
+	struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+	const struct mlx4_mp_param *param =
+		(const struct mlx4_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	struct mlx4_priv *priv;
+	int ret;
+
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		rte_errno = ENODEV;
+		ERROR("port %u invalid port ID", param->port_id);
+		return -rte_errno;
+	}
+	dev = &rte_eth_devices[param->port_id];
+	priv = dev->data->dev_private;
+	switch (param->type) {
+	case MLX4_MP_REQ_VERBS_CMD_FD:
+		mp_init_msg(dev, &mp_res, param->type);
+		mp_res.num_fds = 1;
+		mp_res.fds[0] = priv->ctx->cmd_fd;
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		rte_errno = EINVAL;
+		ERROR("port %u invalid mp request type", dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * @param[in] mp_msg
+ *   Pointer to the received IPC request message.
+ * @param[in] peer
+ *   Pointer to the peer socket path.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+	struct rte_mp_msg mp_res;
+	struct mlx4_mp_param *res = (struct mlx4_mp_param *)mp_res.param;
+	const struct mlx4_mp_param *param =
+		(const struct mlx4_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	int ret;
+
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		rte_errno = ENODEV;
+		ERROR("port %u invalid port ID", param->port_id);
+		return -rte_errno;
+	}
+	dev = &rte_eth_devices[param->port_id];
+	switch (param->type) {
+	case MLX4_MP_REQ_START_RXTX:
+		INFO("port %u starting datapath", dev->data->port_id);
+		rte_mb();
+		dev->tx_pkt_burst = mlx4_tx_burst;
+		dev->rx_pkt_burst = mlx4_rx_burst;
+		mp_init_msg(dev, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX4_MP_REQ_STOP_RXTX:
+		INFO("port %u stopping datapath", dev->data->port_id);
+		dev->tx_pkt_burst = mlx4_tx_burst_removed;
+		dev->rx_pkt_burst = mlx4_rx_burst_removed;
+		rte_mb();
+		mp_init_msg(dev, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		rte_errno = EINVAL;
+		ERROR("port %u invalid mp request type", dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * Broadcast request of stopping/starting data-path to secondary processes.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param[in] type
+ *   Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx4_mp_req_type type)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx4_mp_param *res __rte_unused;
+	struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+	int i;
+
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (!mlx4_shared_data->secondary_cnt)
+		return;
+	if (type != MLX4_MP_REQ_START_RXTX && type != MLX4_MP_REQ_STOP_RXTX) {
+		ERROR("port %u unknown request (req_type %d)",
+		      dev->data->port_id, type);
+		return;
+	}
+	mp_init_msg(dev, &mp_req, type);
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		ERROR("port %u failed to request stop/start Rx/Tx (%d)",
+		      dev->data->port_id, type);
+		goto exit;
+	}
+	if (mp_rep.nb_sent != mp_rep.nb_received) {
+		ERROR("port %u not all secondaries responded (req_type %d)",
+		      dev->data->port_id, type);
+		goto exit;
+	}
+	for (i = 0; i < mp_rep.nb_received; i++) {
+		mp_res = &mp_rep.msgs[i];
+		res = (struct mlx4_mp_param *)mp_res->param;
+		if (res->result) {
+			ERROR("port %u request failed on secondary #%d",
+			      dev->data->port_id, i);
+			goto exit;
+		}
+	}
+exit:
+	free(mp_rep.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_start_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX4_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx4_mp_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX4_MP_REQ_STOP_RXTX);
+}
+
+/**
+ * Request Verbs command file descriptor from the primary process.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ *
+ * @return
+ *   fd on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx4_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx4_mp_param *res;
+	struct timespec ts = {.tv_sec = MLX4_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	int ret;
+
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	mp_init_msg(dev, &mp_req, MLX4_MP_REQ_VERBS_CMD_FD);
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		ERROR("port %u request to primary process failed",
+		      dev->data->port_id);
+		return -rte_errno;
+	}
+	assert(mp_rep.nb_received == 1);
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mlx4_mp_param *)mp_res->param;
+	if (res->result) {
+		rte_errno = -res->result;
+		ERROR("port %u failed to get command FD from primary process",
+		      dev->data->port_id);
+		ret = -rte_errno;
+		goto exit;
+	}
+	assert(mp_res->num_fds == 1);
+	ret = mp_res->fds[0];
+	DEBUG("port %u command FD from primary is %d",
+	      dev->data->port_id, ret);
+exit:
+	free(mp_rep.msgs);
+	return ret;
+}
+
+/**
+ * Initialize by primary process.
+ */
+void
+mlx4_mp_init_primary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	rte_mp_action_register(MLX4_MP_NAME, mp_primary_handle);
+}
+
+/**
+ * Un-initialize by primary process.
+ */
+void
+mlx4_mp_uninit_primary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	rte_mp_action_unregister(MLX4_MP_NAME);
+}
+
+/**
+ * Initialize by secondary process.
+ */
+void
+mlx4_mp_init_secondary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	rte_mp_action_register(MLX4_MP_NAME, mp_secondary_handle);
+}
+
+/**
+ * Un-initialize by secondary process.
+ */
+void
+mlx4_mp_uninit_secondary(void)
+{
+	assert(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	rte_mp_action_unregister(MLX4_MP_NAME);
+}
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index e4be46ab2a..01894faecf 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -489,6 +489,8 @@ mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
 	struct mlx4_mr *mr_next;
 	struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 
+	/* Must be called from the primary process. */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/*
 	 * MR can't be freed with holding the lock because rte_free() could call
 	 * memory free callback function. This will be a deadlock situation.
@@ -561,6 +563,14 @@ mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 
 	DEBUG("port %u creating a MR using address (%p)",
 	      dev->data->port_id, (void *)addr);
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		WARN("port %u using address (%p) of unregistered mempool"
+		     " in secondary process, please create mempool"
+		     " before rte_eth_dev_start()",
+		     dev->data->port_id, (void *)addr);
+		rte_errno = EPERM;
+		goto err_nolock;
+	}
 	/*
 	 * Release detached MRs if any. This can't be called with holding either
 	 * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
@@ -890,14 +900,17 @@ mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 		     size_t len, void *arg __rte_unused)
 {
 	struct mlx4_priv *priv;
+	struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
 
+	/* Must be called from the primary process. */
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	switch (event_type) {
 	case RTE_MEM_EVENT_FREE:
-		rte_rwlock_read_lock(&mlx4_mem_event_rwlock);
+		rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
 		/* Iterate all the existing mlx4 devices. */
-		LIST_FOREACH(priv, &mlx4_mem_event_cb_list, mem_event_cb)
+		LIST_FOREACH(priv, dev_list, mem_event_cb)
 			mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
-		rte_rwlock_read_unlock(&mlx4_mem_event_rwlock);
+		rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
 		break;
 	case RTE_MEM_EVENT_ALLOC:
 	default:
@@ -1130,6 +1143,7 @@ mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
 	struct mlx4_mr_cache entry;
 	uint32_t lkey;
 
+	assert(rte_eal_process_type() == RTE_PROC_PRIMARY);
 	/* If already registered, it should return. */
 	rte_rwlock_read_lock(&priv->mr.rwlock);
 	lkey = mr_lookup_dev(dev, &entry, addr);
@@ -1225,6 +1239,14 @@ mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
 	struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
 	struct mlx4_priv *priv = txq->priv;
 
+	if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+		WARN("port %u using address (%p) from unregistered mempool"
+		     " having externally allocated memory"
+		     " in secondary process, please create mempool"
+		     " prior to rte_eth_dev_start()",
+		     PORT_ID(priv), (void *)addr);
+		return UINT32_MAX;
+	}
 	mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
 	return mlx4_tx_addr2mr_bh(txq, addr);
 }
@@ -1336,9 +1358,9 @@ mlx4_mr_release(struct rte_eth_dev *dev)
 	struct mlx4_mr *mr_next = LIST_FIRST(&priv->mr.mr_list);
 
 	/* Remove from memory callback device list. */
-	rte_rwlock_write_lock(&mlx4_mem_event_rwlock);
+	rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
 	LIST_REMOVE(priv, mem_event_cb);
-	rte_rwlock_write_unlock(&mlx4_mem_event_rwlock);
+	rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
 #ifndef NDEBUG
 	mlx4_mr_dump_dev(dev);
 #endif
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index aef77ba06e..b3e11dde25 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -77,7 +77,9 @@ struct mlx4_sq {
 	uint32_t owner_opcode;
 	/**< Default owner opcode with HW valid owner bit. */
 	uint32_t stamp; /**< Stamp value with an invalid HW owner bit. */
-	volatile uint32_t *db; /**< Pointer to the doorbell. */
+	volatile uint32_t *qp_sdb; /**< Pointer to the doorbell. */
+	volatile uint32_t *db; /**< Pointer to the doorbell remapped. */
+	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
 	uint32_t doorbell_qpn; /**< qp number to write to the doorbell. */
 };
 
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 8c88effcd1..f22f1ba559 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -1365,6 +1365,7 @@ mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	(void)dpdk_txq;
 	(void)pkts;
 	(void)pkts_n;
+	rte_mb();
 	return 0;
 }
 
@@ -1390,5 +1391,6 @@ mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	(void)dpdk_rxq;
 	(void)pkts;
 	(void)pkts_n;
+	rte_mb();
 	return 0;
 }
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 9409602b32..7d7a8988ed 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -152,6 +152,7 @@ uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 
 /* mlx4_txq.c */
 
+int mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd);
 uint64_t mlx4_get_tx_port_offloads(struct mlx4_priv *priv);
 int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			uint16_t desc, unsigned int socket,
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 2dc198e77f..698a648c8d 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -13,7 +13,9 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <inttypes.h>
+#include <unistd.h>
 
 /* Verbs headers do not support -pedantic. */
 #ifdef PEDANTIC
@@ -38,6 +40,100 @@
 #include "mlx4_utils.h"
 
 /**
+ * Mmap Tx UAR (HW doorbell) pages into the reserved UAR address space.
+ * Both primary and secondary processes mmap so that the remapped doorbell
+ * addresses match.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param fd
+ *   Verbs file descriptor to map UAR pages.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev, int fd)
+{
+	unsigned int i, j;
+	const unsigned int txqs_n = dev->data->nb_tx_queues;
+	uintptr_t pages[txqs_n];
+	unsigned int pages_n = 0;
+	uintptr_t uar_va;
+	uintptr_t off;
+	void *addr;
+	void *ret;
+	struct txq *txq;
+	int already_mapped;
+	size_t page_size = sysconf(_SC_PAGESIZE);
+
+	memset(pages, 0, txqs_n * sizeof(uintptr_t));
+	/*
+	 * Like rdma-core, UARs are mapped in units of the OS page size.
+	 * Use the page-aligned address to avoid duplicate mmap.
+	 * Refer to the libmlx4 function mlx4_init_context().
+	 */
+	for (i = 0; i != txqs_n; ++i) {
+		txq = dev->data->tx_queues[i];
+		if (!txq)
+			continue;
+		/* UAR addr from verbs used to find dup and offset in page. */
+		uar_va = (uintptr_t)txq->msq.qp_sdb;
+		off = uar_va & (page_size - 1); /* offset in page. */
+		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
+		already_mapped = 0;
+		for (j = 0; j != pages_n; ++j) {
+			if (pages[j] == uar_va) {
+				already_mapped = 1;
+				break;
+			}
+		}
+		/* New address in the reserved UAR address space. */
+		addr = RTE_PTR_ADD(mlx4_shared_data->uar_base,
+				   uar_va & (uintptr_t)(MLX4_UAR_SIZE - 1));
+		if (!already_mapped) {
+			pages[pages_n++] = uar_va;
+			/* Fixed mmap to the specified address in the reserved
+			 * address space.
+			 */
+			ret = mmap(addr, page_size,
+				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+				   txq->msq.uar_mmap_offset);
+			if (ret != addr) {
+				/* Fixed mmap has to return the same address. */
+				ERROR("port %u call to mmap failed on UAR"
+				      " for txq %u",
+				      dev->data->port_id, i);
+				rte_errno = ENXIO;
+				return -rte_errno;
+			}
+		}
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once. */
+			txq->msq.db = RTE_PTR_ADD((void *)addr, off);
+		else
+			assert(txq->msq.db ==
+			       RTE_PTR_ADD((void *)addr, off));
+	}
+	return 0;
+}
+#else
+int
+mlx4_tx_uar_remap(struct rte_eth_dev *dev __rte_unused, int fd __rte_unused)
+{
+	/*
+	 * Even if rdma-core doesn't support UAR remap, the primary process
+	 * shouldn't be interrupted.
+	 */
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		return 0;
+	ERROR("UAR remap is not supported");
+	rte_errno = ENOTSUP;
+	return -rte_errno;
+}
+#endif
+
+/**
  * Free Tx queue elements.
  *
  * @param txq
@@ -89,7 +185,13 @@ mlx4_txq_fill_dv_obj_info(struct txq *txq, struct mlx4dv_obj *mlxdv)
 	sq->owner_opcode = MLX4_OPCODE_SEND | (0u << MLX4_SQ_OWNER_BIT);
 	sq->stamp = rte_cpu_to_be_32(MLX4_SQ_STAMP_VAL |
 				     (0u << MLX4_SQ_OWNER_BIT));
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	sq->uar_mmap_offset = dqp->uar_mmap_offset;
+	sq->qp_sdb = dqp->sdb;
+#else
+	sq->uar_mmap_offset = -1; /* Make mmap() fail. */
 	sq->db = dqp->sdb;
+#endif
 	sq->doorbell_qpn = dqp->doorbell_qpn;
 	cq->buf = dcq->buf.buf;
 	cq->cqe_cnt = dcq->cqe_cnt;
@@ -307,6 +409,11 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		goto error;
 	}
 	/* Retrieve device queue information. */
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	dv_qp = (struct mlx4dv_qp){
+		.comp_mask = MLX4DV_QP_MASK_UAR_MMAP_OFFSET,
+	};
+#endif
 	mlxdv.cq.in = txq->cq;
 	mlxdv.cq.out = &dv_cq;
 	mlxdv.qp.in = txq->qp;
@@ -318,6 +425,12 @@ mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		      " accessing the device queues", (void *)dev);
 		goto error;
 	}
+#ifdef HAVE_IBV_MLX4_UAR_MMAP_OFFSET
+	if (!(dv_qp.comp_mask & MLX4DV_QP_MASK_UAR_MMAP_OFFSET)) {
+		WARN("%p: failed to obtain UAR mmap offset", (void *)dev);
+		dv_qp.uar_mmap_offset = -1; /* Make mmap() fail. */
+	}
+#endif
 	mlx4_txq_fill_dv_obj_info(txq, &mlxdv);
 	/* Save first wqe pointer in the first element. */
 	(&(*txq->elts)[0])->wqe =
-- 
2.11.0


  parent reply	other threads:[~2019-04-01 21:17 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-03-07  7:39 [dpdk-dev] [PATCH 0/3] " Yongseok Koh
2019-03-07  7:39 ` [dpdk-dev] [PATCH 1/3] net/mlx4: change device reference for secondary process Yongseok Koh
2019-03-07  7:39 ` [dpdk-dev] [PATCH 2/3] net/mlx4: add external allocator for Verbs object Yongseok Koh
2019-03-07  7:39 ` [dpdk-dev] [PATCH 3/3] net/mlx4: add secondary process support Yongseok Koh
2019-03-25 19:17 ` [dpdk-dev] [PATCH v2 0/3] " Yongseok Koh
2019-03-25 19:17   ` Yongseok Koh
2019-03-25 19:17   ` [dpdk-dev] [PATCH v2 1/3] net/mlx4: change device reference for secondary process Yongseok Koh
2019-03-25 19:17     ` Yongseok Koh
2019-03-26 19:16     ` Shahaf Shuler
2019-03-26 19:16       ` Shahaf Shuler
2019-03-25 19:18   ` [dpdk-dev] [PATCH v2 2/3] net/mlx4: add external allocator for Verbs object Yongseok Koh
2019-03-25 19:18     ` Yongseok Koh
2019-03-26 19:21     ` Shahaf Shuler
2019-03-26 19:21       ` Shahaf Shuler
2019-03-25 19:18   ` [dpdk-dev] [PATCH v2 3/3] net/mlx4: add secondary process support Yongseok Koh
2019-03-25 19:18     ` Yongseok Koh
2019-03-26 19:33     ` Shahaf Shuler
2019-03-26 19:33       ` Shahaf Shuler
2019-03-28 19:01       ` Yongseok Koh
2019-03-28 19:01         ` Yongseok Koh
2019-04-01 21:15 ` [dpdk-dev] [PATCH v3 0/3] " Yongseok Koh
2019-04-01 21:15   ` Yongseok Koh
2019-04-01 21:15   ` [dpdk-dev] [PATCH v3 1/3] net/mlx4: change device reference for secondary process Yongseok Koh
2019-04-01 21:15     ` Yongseok Koh
2019-04-01 21:15   ` [dpdk-dev] [PATCH v3 2/3] net/mlx4: add external allocator for Verbs object Yongseok Koh
2019-04-01 21:15     ` Yongseok Koh
2019-04-01 21:15   ` Yongseok Koh [this message]
2019-04-01 21:15     ` [dpdk-dev] [PATCH v3 3/3] net/mlx4: add secondary process support Yongseok Koh
2019-04-02  7:12   ` [dpdk-dev] [PATCH v3 0/3] " Shahaf Shuler
2019-04-02  7:12     ` Shahaf Shuler

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190401211553.26063-4-yskoh@mellanox.com \
    --to=yskoh@mellanox.com \
    --cc=dev@dpdk.org \
    --cc=shahafs@mellanox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).