DPDK patches and discussions
 help / color / mirror / Atom feed
From: Srikanth Kaka <srikanth.k@oneconvergence.com>
To: Matan Azrad <matan@nvidia.com>,
	Viacheslav Ovsiienko <viacheslavo@nvidia.com>
Cc: dev@dpdk.org, Vag Singh <vag.singh@oneconvergence.com>,
	Anand Thulasiram <avelu@juniper.net>,
	Srikanth Kaka <srikanth.k@oneconvergence.com>
Subject: [dpdk-dev] [PATCH 02/19] net/mlx5: stub for FreeBSD
Date: Mon, 27 Sep 2021 19:04:33 +0530	[thread overview]
Message-ID: <20210927133450.10653-3-srikanth.k@oneconvergence.com> (raw)
In-Reply-To: <20210927133450.10653-1-srikanth.k@oneconvergence.com>

These files are a copy of their Linux equivalents.
They will be ported to FreeBSD.

Signed-off-by: Srikanth Kaka <srikanth.k@oneconvergence.com>
Signed-off-by: Vag Singh <vag.singh@oneconvergence.com>
Signed-off-by: Anand Thulasiram <avelu@juniper.net>
---
 drivers/net/mlx5/freebsd/mlx5_ethdev_os.c | 1632 +++++++++++
 drivers/net/mlx5/freebsd/mlx5_flow_os.c   |   38 +
 drivers/net/mlx5/freebsd/mlx5_flow_os.h   |  484 ++++
 drivers/net/mlx5/freebsd/mlx5_mp_os.c     |  305 ++
 drivers/net/mlx5/freebsd/mlx5_os.c        | 3208 +++++++++++++++++++++
 drivers/net/mlx5/freebsd/mlx5_os.h        |   24 +
 drivers/net/mlx5/freebsd/mlx5_socket.c    |  249 ++
 drivers/net/mlx5/freebsd/mlx5_verbs.c     | 1208 ++++++++
 drivers/net/mlx5/freebsd/mlx5_verbs.h     |   18 +
 9 files changed, 7166 insertions(+)
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_flow_os.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_flow_os.h
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_mp_os.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_os.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_os.h
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_socket.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_verbs.c
 create mode 100644 drivers/net/mlx5/freebsd/mlx5_verbs.h

diff --git a/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c b/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
new file mode 100644
index 0000000000..f34133e2c6
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
@@ -0,0 +1,1632 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <fcntl.h>
+#include <stdalign.h>
+#include <sys/un.h>
+#include <time.h>
+
+#include <ethdev_driver.h>
+#include <rte_bus_pci.h>
+#include <rte_mbuf.h>
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+#include <rte_rwlock.h>
+#include <rte_cycles.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+/* Supported speed values found in /usr/include/linux/ethtool.h */
+#ifndef HAVE_SUPPORTED_40000baseKR4_Full
+#define SUPPORTED_40000baseKR4_Full (1 << 23)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseCR4_Full
+#define SUPPORTED_40000baseCR4_Full (1 << 24)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseSR4_Full
+#define SUPPORTED_40000baseSR4_Full (1 << 25)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseLR4_Full
+#define SUPPORTED_40000baseLR4_Full (1 << 26)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseKR4_Full
+#define SUPPORTED_56000baseKR4_Full (1 << 27)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseCR4_Full
+#define SUPPORTED_56000baseCR4_Full (1 << 28)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseSR4_Full
+#define SUPPORTED_56000baseSR4_Full (1 << 29)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseLR4_Full
+#define SUPPORTED_56000baseLR4_Full (1 << 30)
+#endif
+
+/* Add defines in case the running kernel is not the same as user headers. */
+#ifndef ETHTOOL_GLINKSETTINGS
+/*
+ * Local fallback declaration, compiled only when the included headers do
+ * not define ETHTOOL_GLINKSETTINGS.
+ * NOTE(review): presumably mirrors the layout of the kernel's
+ * struct ethtool_link_settings - confirm against the uapi header.
+ */
+struct ethtool_link_settings {
+	uint32_t cmd;
+	uint32_t speed;
+	uint8_t duplex;
+	uint8_t port;
+	uint8_t phy_address;
+	uint8_t autoneg;
+	uint8_t mdio_support;
+	uint8_t eth_to_mdix;
+	uint8_t eth_tp_mdix_ctrl;
+	int8_t link_mode_masks_nwords; /* Signed: the query code negates it. */
+	uint32_t reserved[8];
+	uint32_t link_mode_masks[]; /* Flexible array of 32-bit mask words. */
+};
+
+/* The kernel values can be found in /include/uapi/linux/ethtool.h */
+#define ETHTOOL_GLINKSETTINGS 0x0000004c
+#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
+#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
+#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
+#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
+#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
+#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
+#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
+#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
+#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
+#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
+#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
+#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
+#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
+#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
+#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
+#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_25G
+#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
+#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
+#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_50G
+#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
+#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_100G
+#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
+#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
+#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
+#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_200G
+#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
+#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
+#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
+#endif
+
+/**
+ * Get interface index from SubFunction device name.
+ *
+ * @param[in] sf_name
+ *   SubFunction device name.
+ *
+ * @return
+ *   The value returned by if_nametoindex() for the "/net" child of the
+ *   device, or a negative rte_errno value when the child name lookup fails.
+ */
+int
+mlx5_auxiliary_get_ifindex(const char *sf_name)
+{
+	char netdev[IF_NAMESIZE] = { 0 };
+	int rc;
+
+	rc = mlx5_auxiliary_get_child_name(sf_name, "/net",
+					   netdev, sizeof(netdev));
+	if (rc != 0)
+		return -rte_errno;
+	return if_nametoindex(netdev);
+}
+
+/**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_ifname_sysfs().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] ifname
+ *   Interface name output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	unsigned int idx;
+
+	MLX5_ASSERT(priv);
+	MLX5_ASSERT(priv->sh);
+	/* A master port with a bond ifindex set reports the bond name. */
+	if (priv->master && priv->sh->bond.ifindex > 0) {
+		memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
+		return 0;
+	}
+	idx = mlx5_ifindex(dev);
+	if (idx != 0) {
+		if (if_indextoname(idx, *ifname) != NULL)
+			return 0;
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/* No interface index: only non-representors fall back to sysfs. */
+	if (priv->representor) {
+		rte_errno = ENXIO;
+		return -rte_errno;
+	}
+	return mlx5_get_ifname_sysfs(priv->sh->ibdev_path, *ifname);
+}
+
+/**
+ * Perform ifreq ioctl() on associated netdev ifname.
+ *
+ * A temporary datagram socket is opened for the request and closed again
+ * before returning, whether the ioctl() succeeded or not.
+ *
+ * @param[in] ifname
+ *   Pointer to netdev name.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[out] ifr
+ *   Interface request structure output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
+{
+	int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	int status = 0;
+
+	if (fd == -1) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
+	if (ioctl(fd, req, ifr) == -1) {
+		/* Record errno before close() gets a chance to modify it. */
+		rte_errno = errno;
+		status = -rte_errno;
+	}
+	close(fd);
+	return status;
+}
+
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * The interface name is resolved with mlx5_get_ifname() and the request
+ * is then issued through mlx5_ifreq_by_ifname().
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param req
+ *   Request number to pass to ioctl().
+ * @param[out] ifr
+ *   Interface request structure output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+{
+	char netdev[sizeof(ifr->ifr_name)];
+
+	if (mlx5_get_ifname(dev, &netdev) != 0)
+		return -rte_errno;
+	return mlx5_ifreq_by_ifname(netdev, req, ifr);
+}
+
+/**
+ * Get device MTU.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] mtu
+ *   MTU value output buffer, written only on success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
+{
+	struct ifreq ifr;
+	int rc;
+
+	rc = mlx5_ifreq(dev, SIOCGIFMTU, &ifr);
+	if (rc == 0)
+		*mtu = ifr.ifr_mtu;
+	return rc;
+}
+
+/**
+ * Set device MTU.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param mtu
+ *   MTU value to set.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct ifreq ifr = { .ifr_mtu = mtu, };
+	int rc;
+
+	rc = mlx5_ifreq(dev, SIOCSIFMTU, &ifr);
+	return rc;
+}
+
+/**
+ * Set device flags.
+ *
+ * The current interface flags are read first; bits selected by @p keep
+ * are preserved and the remaining bits are taken from @p flags.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param keep
+ *   Bitmask for flags that must remain untouched.
+ * @param flags
+ *   Bitmask for flags to modify.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
+{
+	struct ifreq ifr;
+	int rc;
+
+	rc = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	if (rc != 0)
+		return rc;
+	/* Drop every bit outside of 'keep', then merge the requested ones. */
+	ifr.ifr_flags &= keep;
+	ifr.ifr_flags |= flags & ~keep;
+	return mlx5_ifreq(dev, SIOCSIFFLAGS, &ifr);
+}
+
+/**
+ * Get device current raw clock counter
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] clock
+ *   Current raw clock counter of the device, written only on success.
+ *
+ * @return
+ *   0 if the clock has correctly been read, otherwise the non-zero
+ *   value returned by the query_rt_values_ex() glue call.
+ */
+int
+mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ibv_values_ex rt_values;
+	int rc;
+
+	rt_values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
+	rc = mlx5_glue->query_rt_values_ex(priv->sh->ctx, &rt_values);
+	if (rc == 0) {
+		/* The counter is taken from the tv_nsec member. */
+		*clock = rt_values.raw_clock.tv_nsec;
+		return 0;
+	}
+	DRV_LOG(WARNING, "Could not query the clock !");
+	return rc;
+}
+
+/**
+ * Retrieve the master device for representor in the same switch domain.
+ *
+ * A port matches when it is flagged as master and shares both the
+ * domain identifier and the shared device context with @p dev.
+ *
+ * @param dev
+ *   Pointer to representor Ethernet device structure.
+ *
+ * @return
+ *   Master device structure on success, NULL otherwise.
+ */
+static struct rte_eth_dev *
+mlx5_find_master_dev(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	uint16_t domain_id = priv->domain_id;
+	uint16_t port_id;
+
+	MLX5_ASSERT(priv->representor);
+	MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
+		struct rte_eth_dev *candidate = &rte_eth_devices[port_id];
+		struct mlx5_priv *opriv = candidate->data->dev_private;
+
+		if (opriv != NULL && opriv->master &&
+		    opriv->domain_id == domain_id && opriv->sh == priv->sh)
+			return candidate;
+	}
+	return NULL;
+}
+
+/**
+ * Retrieve physical link information (unlocked version using the legacy
+ * ETHTOOL_GSET ioctl).
+ *
+ * Besides filling @p link, the speed capabilities cached in the private
+ * structure are recomputed from the reported "supported" mask.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] link
+ *   Storage for current link status.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
+			       struct rte_eth_link *link)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ethtool_cmd edata = {
+		.cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
+	};
+	struct rte_eth_link cur;
+	struct ifreq ifr;
+	int speed;
+	int ret;
+
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	if (ret != 0) {
+		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		return ret;
+	}
+	/* The link counts as up only with both IFF_UP and IFF_RUNNING set. */
+	cur = (struct rte_eth_link) {
+		.link_status = ((ifr.ifr_flags & IFF_UP) &&
+				(ifr.ifr_flags & IFF_RUNNING)),
+	};
+	ifr = (struct ifreq) {
+		.ifr_data = (void *)&edata,
+	};
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (ret == -ENOTSUP && priv->representor) {
+		/*
+		 * For representors we can try to inherit link
+		 * settings from the master device. Actually
+		 * link settings do not make a lot of sense
+		 * for representors due to missing physical
+		 * link. The old kernel drivers supported
+		 * emulated settings query for representors,
+		 * the new ones do not, so we have to add
+		 * this code for compatibility issues.
+		 */
+		struct rte_eth_dev *master = mlx5_find_master_dev(dev);
+
+		if (master != NULL) {
+			ifr = (struct ifreq) {
+				.ifr_data = (void *)&edata,
+			};
+			ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+		}
+	}
+	if (ret != 0) {
+		DRV_LOG(WARNING,
+			"port %u ioctl(SIOCETHTOOL,"
+			" ETHTOOL_GSET) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		return ret;
+	}
+	speed = ethtool_cmd_speed(&edata);
+	cur.link_speed = (speed == -1) ? ETH_SPEED_NUM_UNKNOWN : speed;
+	/* Rebuild the cached speed capabilities from the supported mask. */
+	priv->link_speed_capa = 0;
+	if (edata.supported & (SUPPORTED_1000baseT_Full |
+			       SUPPORTED_1000baseKX_Full))
+		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+	if (edata.supported & SUPPORTED_10000baseKR_Full)
+		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+	if (edata.supported & (SUPPORTED_40000baseKR4_Full |
+			       SUPPORTED_40000baseCR4_Full |
+			       SUPPORTED_40000baseSR4_Full |
+			       SUPPORTED_40000baseLR4_Full))
+		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+	if (edata.duplex == DUPLEX_HALF)
+		cur.link_duplex = ETH_LINK_HALF_DUPLEX;
+	else
+		cur.link_duplex = ETH_LINK_FULL_DUPLEX;
+	/* Autonegotiation is reported unless a fixed speed was configured. */
+	cur.link_autoneg = !(dev->data->dev_conf.link_speeds &
+			     ETH_LINK_SPEED_FIXED);
+	*link = cur;
+	return 0;
+}
+
+/**
+ * Retrieve physical link information (unlocked version using new ioctl).
+ *
+ * ETHTOOL_GLINKSETTINGS is issued twice. The first request carries a zero
+ * mask word count; the count it brings back is negated and used to size
+ * the buffer for the second request, which returns the actual settings.
+ * The speed capabilities cached in the private structure are recomputed
+ * from the first (supported) mask.
+ *
+ * If the first request does not yield a usable word count, the query is
+ * reported as unsupported (-ENOTSUP) instead of sizing a buffer from it.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] link
+ *   Storage for current link status.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
+			     struct rte_eth_link *link)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
+	struct ifreq ifr;
+	struct rte_eth_link dev_link;
+	struct rte_eth_dev *master = NULL;
+	uint64_t sc;
+	int nwords;
+	int ret;
+
+	ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+	if (ret) {
+		DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		return ret;
+	}
+	dev_link = (struct rte_eth_link) {
+		.link_status = ((ifr.ifr_flags & IFF_UP) &&
+				(ifr.ifr_flags & IFF_RUNNING)),
+	};
+	ifr = (struct ifreq) {
+		.ifr_data = (void *)&gcmd,
+	};
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (ret) {
+		if (ret == -ENOTSUP && priv->representor) {
+			/*
+			 * For representors we can try to inherit link
+			 * settings from the master device. Actually
+			 * link settings do not make a lot of sense
+			 * for representors due to missing physical
+			 * link. The old kernel drivers supported
+			 * emulated settings query for representors,
+			 * the new ones do not, so we have to add
+			 * this code for compatibility issues.
+			 */
+			master = mlx5_find_master_dev(dev);
+			if (master) {
+				ifr = (struct ifreq) {
+					.ifr_data = (void *)&gcmd,
+				};
+				ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+			}
+		}
+		if (ret) {
+			DRV_LOG(DEBUG,
+				"port %u ioctl(SIOCETHTOOL,"
+				" ETHTOOL_GLINKSETTINGS) failed: %s",
+				dev->data->port_id, strerror(rte_errno));
+			return ret;
+		}
+	}
+	/*
+	 * The word count comes back negated. Validate it before it is used
+	 * as an array size: a non-positive or unrepresentable value would
+	 * make the variable length array below undefined behaviour.
+	 */
+	nwords = -(int)gcmd.link_mode_masks_nwords;
+	if (nwords <= 0 || nwords > INT8_MAX) {
+		DRV_LOG(DEBUG,
+			"port %u ETHTOOL_GLINKSETTINGS returned unusable"
+			" link mode mask size (%d)",
+			dev->data->port_id, nwords);
+		rte_errno = ENOTSUP;
+		return -rte_errno;
+	}
+	gcmd.link_mode_masks_nwords = (int8_t)nwords;
+
+	/* Room for the header followed by three masks of nwords each. */
+	alignas(struct ethtool_link_settings)
+	uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
+		     sizeof(uint32_t) * nwords * 3];
+	struct ethtool_link_settings *ecmd = (void *)data;
+
+	*ecmd = gcmd;
+	ifr.ifr_data = (void *)ecmd;
+	ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
+	if (ret) {
+		DRV_LOG(DEBUG,
+			"port %u ioctl(SIOCETHTOOL,"
+			" ETHTOOL_GLINKSETTINGS) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		return ret;
+	}
+	dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
+				ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
+	/*
+	 * Only the first nwords words belong to the supported mask; words
+	 * past that are part of the following masks and must not be
+	 * interpreted as capability bits.
+	 */
+	sc = ecmd->link_mode_masks[0];
+	if (nwords > 1)
+		sc |= (uint64_t)ecmd->link_mode_masks[1] << 32;
+	priv->link_speed_capa = 0;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_20G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_56G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_25G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_50G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_100G;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+
+	/* Link mode bits 64 and above, when the supported mask has them. */
+	sc = 0;
+	if (nwords > 2)
+		sc = ecmd->link_mode_masks[2];
+	if (nwords > 3)
+		sc |= (uint64_t)ecmd->link_mode_masks[3] << 32;
+	if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
+		  MLX5_BITSHIFT
+		       (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
+		  MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
+		priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+	dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
+				ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+	dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+				  ETH_LINK_SPEED_FIXED);
+	*link = dev_link;
+	return 0;
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * The new-style query is tried first and the legacy one is used when the
+ * former reports -ENOTSUP. An -EAGAIN result is retried until either the
+ * retry budget or MLX5_LINK_STATUS_TIMEOUT is exhausted.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ *   Wait for request completion.
+ *
+ * @return
+ *   0 if link status was not updated, positive if it was, a negative errno
+ *   value otherwise and rte_errno is set.
+ */
+int
+mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+	struct rte_eth_link fresh;
+	const time_t begin = time(NULL);
+	int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
+	int changed;
+	int ret;
+
+	do {
+		int elapsed;
+
+		ret = mlx5_link_update_unlocked_gs(dev, &fresh);
+		if (ret == -ENOTSUP)
+			ret = mlx5_link_update_unlocked_gset(dev, &fresh);
+		if (ret == 0)
+			break;
+		if (ret != -EAGAIN || !(wait_to_complete || retry)) {
+			/* Not retriable (any more): report failures as is. */
+			if (ret < 0)
+				return ret;
+			continue;
+		}
+		/* Handle wait to complete situation. */
+		elapsed = abs((int)difftime(time(NULL), begin));
+		if (elapsed >= MLX5_LINK_STATUS_TIMEOUT) {
+			rte_errno = EBUSY;
+			return -rte_errno;
+		}
+		usleep(0);
+	} while (wait_to_complete || retry-- > 0);
+	/* Tell the caller whether the stored link state actually changed. */
+	changed = memcmp(&dev->data->dev_link, &fresh,
+			 sizeof(struct rte_eth_link)) != 0;
+	dev->data->dev_link = fresh;
+	return changed;
+}
+
+/**
+ * DPDK callback to get flow control status.
+ *
+ * The pause parameters are queried with ETHTOOL_GPAUSEPARAM and the
+ * rx/tx pause pair is translated into the matching RTE_FC_* mode.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ *   Flow control output buffer.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct ethtool_pauseparam pause = {
+		.cmd = ETHTOOL_GPAUSEPARAM
+	};
+	struct ifreq ifr;
+	int rc;
+
+	ifr.ifr_data = (void *)&pause;
+	rc = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (rc != 0) {
+		DRV_LOG(WARNING,
+			"port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
+			" %s",
+			dev->data->port_id, strerror(rte_errno));
+		return rc;
+	}
+	fc_conf->autoneg = pause.autoneg;
+	if (pause.rx_pause)
+		fc_conf->mode = pause.tx_pause ? RTE_FC_FULL :
+						 RTE_FC_RX_PAUSE;
+	else
+		fc_conf->mode = pause.tx_pause ? RTE_FC_TX_PAUSE :
+						 RTE_FC_NONE;
+	return 0;
+}
+
+/**
+ * DPDK callback to modify flow control parameters.
+ *
+ * The requested RTE_FC_* mode is translated into an rx/tx pause pair and
+ * applied with ETHTOOL_SPAUSEPARAM.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ *   Flow control parameters.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+	struct ethtool_pauseparam pause = {
+		.cmd = ETHTOOL_SPAUSEPARAM
+	};
+	struct ifreq ifr;
+	bool full;
+	int rc;
+
+	ifr.ifr_data = (void *)&pause;
+	pause.autoneg = fc_conf->autoneg;
+	/* Full mode implies pause frames in both directions. */
+	full = (fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL;
+	pause.rx_pause = (full || (fc_conf->mode & RTE_FC_RX_PAUSE)) ? 1 : 0;
+	pause.tx_pause = (full || (fc_conf->mode & RTE_FC_TX_PAUSE)) ? 1 : 0;
+	rc = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (rc != 0)
+		DRV_LOG(WARNING,
+			"port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+			" failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+	return rc;
+}
+
+/**
+ * Handle asynchronous removal event for entire multiport device.
+ *
+ * Every port of the shared context that has a valid handler port id and
+ * has removal interrupts enabled gets an RTE_ETH_EVENT_INTR_RMV callback.
+ *
+ * @param sh
+ *   Infiniband device shared context.
+ */
+static void
+mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
+{
+	uint32_t idx;
+
+	for (idx = 0; idx < sh->max_port; ++idx) {
+		/*
+		 * An out-of-range id means the port either does not
+		 * exist or has no handler installed: nothing to notify.
+		 */
+		if (sh->port[idx].ih_port_id < RTE_MAX_ETHPORTS) {
+			struct rte_eth_dev *dev =
+				&rte_eth_devices[sh->port[idx].ih_port_id];
+
+			MLX5_ASSERT(dev);
+			if (dev->data->dev_conf.intr_conf.rmv)
+				rte_eth_dev_callback_process
+					(dev, RTE_ETH_EVENT_INTR_RMV, NULL);
+		}
+	}
+}
+
+/**
+ * Handle shared asynchronous events the NIC (removal event
+ * and link status change). Supports multiport IB device.
+ *
+ * Events are drained until get_async_event() reports that none is left.
+ * Every retrieved event is acknowledged exactly once, whichever way it
+ * is handled.
+ *
+ * @param cb_arg
+ *   Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler(void *cb_arg)
+{
+	struct mlx5_dev_ctx_shared *sh = cb_arg;
+	struct ibv_async_event event;
+
+	/* Read all message from the IB device and acknowledge them. */
+	for (;;) {
+		struct rte_eth_dev *dev;
+		uint32_t tmp;
+
+		if (mlx5_glue->get_async_event(sh->ctx, &event))
+			break;
+		/* Retrieve and check IB port index. */
+		tmp = (uint32_t)event.element.port_num;
+		if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
+			/*
+			 * The DEVICE_FATAL event is called once for
+			 * entire device without port specifying.
+			 * We should notify all existing ports.
+			 */
+			mlx5_glue->ack_async_event(&event);
+			mlx5_dev_interrupt_device_fatal(sh);
+			continue;
+		}
+		MLX5_ASSERT(tmp && (tmp <= sh->max_port));
+		if (!tmp) {
+			/* Unsupported device level event. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"unsupported common event (type %d)",
+				event.event_type);
+			continue;
+		}
+		if (tmp > sh->max_port) {
+			/* Invalid IB port index. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"cannot handle an event (type %d)"
+				" due to invalid IB port index (%u)",
+				event.event_type, tmp);
+			continue;
+		}
+		if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+			/* No handler installed. */
+			mlx5_glue->ack_async_event(&event);
+			DRV_LOG(DEBUG,
+				"cannot handle an event (type %d)"
+				" due to no handler installed for port %u",
+				event.event_type, tmp);
+			continue;
+		}
+		/* Retrieve ethernet device descriptor. */
+		tmp = sh->port[tmp - 1].ih_port_id;
+		dev = &rte_eth_devices[tmp];
+		MLX5_ASSERT(dev);
+		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+		     event.event_type == IBV_EVENT_PORT_ERR) &&
+			dev->data->dev_conf.intr_conf.lsc) {
+			mlx5_glue->ack_async_event(&event);
+			/* Skip the LSC callback when the link query is busy. */
+			if (mlx5_link_update(dev, 0) == -EAGAIN) {
+				usleep(0);
+				continue;
+			}
+			rte_eth_dev_callback_process
+				(dev, RTE_ETH_EVENT_INTR_LSC, NULL);
+			continue;
+		}
+		DRV_LOG(DEBUG,
+			"port %u cannot handle an unknown event (type %d)",
+			dev->data->port_id, event.event_type);
+		mlx5_glue->ack_async_event(&event);
+	}
+}
+
+/**
+ * Unregister callback handler safely. The handler may be active
+ * while we are trying to unregister it, in this case code -EAGAIN
+ * is returned by rte_intr_callback_unregister(). This routine checks
+ * the return code and tries to unregister handler again.
+ *
+ * @param handle
+ *   interrupt handle
+ * @param cb_fn
+ *   pointer to callback routine
+ * @param cb_arg
+ *   opaque callback parameter
+ */
+void
+mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+			      rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+	/*
+	 * Try to reduce timeout management overhead by not calling
+	 * the timer related routines on the first iteration. If the
+	 * unregistering succeeds on first call there will be no
+	 * timer calls at all.
+	 */
+	uint64_t one_second = 0;
+	uint64_t since = 0;
+
+	for (;;) {
+		int rc = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
+
+		if (rc >= 0)
+			return;
+		if (rc != -EAGAIN) {
+			DRV_LOG(INFO, "failed to unregister interrupt"
+				      " handler (error: %d)", rc);
+			MLX5_ASSERT(false);
+			return;
+		}
+		if (one_second == 0) {
+			/*
+			 * We get the amount of timer ticks for one second.
+			 * If this amount elapsed it means we spent one
+			 * second in waiting. This branch is executed once
+			 * on first iteration.
+			 */
+			one_second = rte_get_timer_hz();
+			MLX5_ASSERT(one_second);
+		} else {
+			struct timespec onems;
+
+			/* Wait one millisecond and try again. */
+			onems.tv_sec = 0;
+			onems.tv_nsec = NS_PER_S / MS_PER_S;
+			nanosleep(&onems, 0);
+			/* Check whether one second elapsed. */
+			if ((rte_get_timer_cycles() - since) <= one_second)
+				continue;
+		}
+		/*
+		 * Timeout elapsed, show message (once a second) and retry.
+		 * We have no other acceptable option here, if we ignore
+		 * the unregistering return code the handler will not
+		 * be unregistered, fd will be closed and we may get the
+		 * crash. Hanging and messaging in the loop seems not to be
+		 * the worst choice.
+		 */
+		DRV_LOG(INFO, "Retrying to unregister interrupt handler");
+		since = rte_get_timer_cycles();
+	}
+}
+
+/**
+ * Handle DEVX interrupts from the NIC.
+ * This function is probably called from the DPDK host thread.
+ *
+ * Without HAVE_IBV_DEVX_ASYNC this is a no-op. Otherwise completions are
+ * fetched until devx_get_async_cmd_comp() returns non-zero, and each one
+ * is handed to mlx5_flow_async_pool_query_handle().
+ *
+ * @param cb_arg
+ *   Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler_devx(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_ASYNC
+	(void)cb_arg;
+	return;
+#else
+	struct mlx5_dev_ctx_shared *sh = cb_arg;
+	union {
+		struct mlx5dv_devx_async_cmd_hdr cmd_resp;
+		uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+			    MLX5_ST_SZ_BYTES(traffic_counter) +
+			    sizeof(struct mlx5dv_devx_async_cmd_hdr)];
+	} out;
+	/* The command output starts right after the completion header. */
+	uint8_t *payload = out.buf + sizeof(out.cmd_resp);
+
+	for (;;) {
+		if (mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
+						       &out.cmd_resp,
+						       sizeof(out.buf)) != 0)
+			break;
+		mlx5_flow_async_pool_query_handle
+			(sh, (uint64_t)out.cmd_resp.wr_id,
+			 mlx5_devx_get_out_command_status(payload));
+	}
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * Clears IFF_UP on the interface and leaves its other flags as they are.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_down(struct rte_eth_dev *dev)
+{
+	/* Everything but IFF_UP is kept; no bit outside of 'keep' is set. */
+	const unsigned int all_but_up = ~IFF_UP;
+
+	return mlx5_set_flags(dev, all_but_up, all_but_up);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * Sets IFF_UP on the interface and leaves its other flags as they are.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_up(struct rte_eth_dev *dev)
+{
+	/* Everything but IFF_UP is kept; IFF_UP itself is forced on. */
+	const unsigned int all_but_up = ~IFF_UP;
+
+	return mlx5_set_flags(dev, all_but_up, IFF_UP);
+}
+
+/**
+ * Check if mlx5 device was removed.
+ *
+ * The device is considered removed when the query_device() glue call
+ * returns EIO; any other result is treated as "still present".
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ *
+ * @return
+ *   1 when device is removed, otherwise 0.
+ */
+int
+mlx5_is_removed(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ibv_device_attr attr;
+	int rc;
+
+	rc = mlx5_glue->query_device(priv->sh->ctx, &attr);
+	return (rc == EIO) ? 1 : 0;
+}
+
+/**
+ * Analyze gathered port parameters via sysfs to recognize master
+ * and representor devices for E-Switch configuration.
+ *
+ * @param[in] device_dir
+ *   flag of presence of "device" directory under port device key.
+ * @param[inout] switch_info
+ *   Port information, including port name as a number and port name
+ *   type if recognized
+ *
+ * @return
+ *   master and representor flags are set in switch_info according to
+ *   recognized parameters (if any).
+ */
+static void
+mlx5_sysfs_check_switch_info(bool device_dir,
+			     struct mlx5_switch_info *switch_info)
+{
+	switch (switch_info->name_type) {
+	case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+		/* New uplink naming schema recognized. */
+		switch_info->master = 1;
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+		/* Legacy representors naming schema. */
+		switch_info->representor = !device_dir;
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+	case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+	case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+		/* New representors naming schema. */
+		switch_info->representor = 1;
+		break;
+	case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+	case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+	default:
+		/*
+		 * Name is not recognized or not set (legacy naming
+		 * schema for master): assume the master when the
+		 * device directory is present.
+		 */
+		switch_info->master = device_dir;
+		break;
+	}
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * @param ifindex
+ *   Network interface index.
+ * @param[out] info
+ *   Switch information object, populated in case of success.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
+{
+	char ifname[IF_NAMESIZE];
+	/* "%<IF_NAMESIZE>s" stores up to IF_NAMESIZE chars plus the NUL. */
+	char port_name[IF_NAMESIZE + 1];
+	FILE *file;
+	struct mlx5_switch_info data = {
+		.master = 0,
+		.representor = 0,
+		.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
+		.port_name = 0,
+		.switch_id = 0,
+	};
+	DIR *dir;
+	bool port_switch_id_set = false;
+	bool device_dir = false;
+	char c;
+	int ret;
+
+	if (!if_indextoname(ifindex, ifname)) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
+	      ifname);
+	MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
+	      ifname);
+	MKSTR(pci_device, "/sys/class/net/%s/device",
+	      ifname);
+	/* The port name is optional: without it name_type stays NOTSET. */
+	file = fopen(phys_port_name, "rb");
+	if (file != NULL) {
+		ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
+		fclose(file);
+		if (ret == 1)
+			mlx5_translate_port_name(port_name, &data);
+	}
+	file = fopen(phys_switch_id, "rb");
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	port_switch_id_set =
+		fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
+		c == '\n';
+	fclose(file);
+	dir = opendir(pci_device);
+	if (dir != NULL) {
+		closedir(dir);
+		device_dir = true;
+	}
+	if (port_switch_id_set) {
+		/* We have some E-Switch configuration. */
+		mlx5_sysfs_check_switch_info(device_dir, &data);
+	}
+	*info = data;
+	MLX5_ASSERT(!(data.master && data.representor));
+	if (data.master && data.representor) {
+		DRV_LOG(ERR, "ifindex %u device is recognized as master"
+			     " and as representor", ifindex);
+		rte_errno = ENODEV;
+		return -rte_errno;
+	}
+	return 0;
+}
+
+/**
+ * Get bond information associated with network interface.
+ *
+ * @param pf_ifindex
+ *   Network interface index of bond slave interface
+ * @param[out] ifindex
+ *   Pointer to bond ifindex, may be NULL when not needed.
+ * @param[out] ifname
+ *   Pointer to bond ifname, may be NULL when not needed.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
+		     char *ifname)
+{
+	char name[IF_NAMESIZE];
+	FILE *file;
+	unsigned int index;
+	int ret;
+
+	if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
+		/* errno may be 0 for an empty name: never report success. */
+		rte_errno = errno ? errno : ENODEV;
+		return -rte_errno;
+	}
+	MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
+	/* read bond ifindex */
+	file = fopen(bond_if, "rb");
+	if (file == NULL) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	ret = fscanf(file, "%u", &index);
+	fclose(file);
+	if (ret <= 0) {
+		/* A matching failure (ret == 0) leaves errno untouched. */
+		rte_errno = (ret < 0 && errno) ? errno : EINVAL;
+		return -rte_errno;
+	}
+	if (ifindex)
+		*ifindex = index;
+
+	/* Resolve the bond device name from its interface index. */
+	if (ifname && !if_indextoname(index, ifname)) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	return 0;
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM information (type and size).
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[out] modinfo
+ *   Storage for the module type and EEPROM length; written on success only.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_module_info(struct rte_eth_dev *dev,
+		     struct rte_eth_dev_module_info *modinfo)
+{
+	struct ethtool_modinfo info = {
+		.cmd = ETHTOOL_GMODULEINFO,
+	};
+	struct ifreq ifr = (struct ifreq) {
+		.ifr_data = (void *)&info,
+	};
+	int ret = 0;
+
+	if (!dev || !modinfo) {
+		DRV_LOG(WARNING, "missing argument, cannot get module info");
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (ret) {
+		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+		return ret;
+	}
+	modinfo->type = info.type;
+	modinfo->eeprom_len = info.eeprom_len;
+	return ret;
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM data.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param[inout] info
+ *   Request descriptor: offset and length select the range, data receives it.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
+			   struct rte_dev_eeprom_info *info)
+{
+	struct ethtool_eeprom *eeprom;
+	struct ifreq ifr;
+	int ret = 0;
+
+	if (!dev || !info) {
+		DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	eeprom = mlx5_malloc(MLX5_MEM_ZERO,
+			     (sizeof(struct ethtool_eeprom) + info->length), 0,
+			     SOCKET_ID_ANY);
+	if (!eeprom) {
+		DRV_LOG(WARNING, "port %u cannot allocate memory for "
+			"eeprom data", dev->data->port_id);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	eeprom->cmd = ETHTOOL_GMODULEEEPROM;
+	eeprom->offset = info->offset;
+	eeprom->len = info->length;
+	ifr = (struct ifreq) {
+		.ifr_data = (void *)eeprom,
+	};
+	ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (ret)
+		DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+			dev->data->port_id, strerror(rte_errno));
+	else
+		rte_memcpy(info->data, eeprom->data, info->length);
+	mlx5_free(eeprom);
+	return ret;
+}
+
+/**
+ * Read the ETHTOOL counters of one netdevice and add them to the table.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[in] pf
+ *   Bonding member port index to query by name; any negative value
+ *   (callers pass -1) queries the netdevice of dev itself.
+ * @param[inout] stats
+ *   Counters table; values are accumulated into it, not overwritten.
+ *
+ * @return
+ *   0 on success, the failing ioctl helper's error code otherwise.
+ */
+static int
+_mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_xstats_ctrl *ctrl = &priv->xstats_ctrl;
+	unsigned int values_sz = ctrl->stats_n * sizeof(uint64_t);
+	/* NOTE(review): stack VLA sized by the device-reported stats_n. */
+	unsigned char raw[sizeof(struct ethtool_stats) + values_sz];
+	struct ethtool_stats *reply = (struct ethtool_stats *)raw;
+	struct ifreq ifr;
+	unsigned int idx;
+	int ret;
+
+	reply->cmd = ETHTOOL_GSTATS;
+	reply->n_stats = ctrl->stats_n;
+	ifr.ifr_data = (caddr_t)reply;
+	ret = pf < 0 ?
+	      mlx5_ifreq(dev, SIOCETHTOOL, &ifr) :
+	      mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
+				   SIOCETHTOOL, &ifr);
+	if (ret != 0) {
+		DRV_LOG(WARNING,
+			"port %u unable to read statistic values from device",
+			dev->data->port_id);
+		return ret;
+	}
+	for (idx = 0; idx < ctrl->mlx5_stats_n; idx++) {
+		if (!ctrl->info[idx].dev)
+			stats[idx] += (uint64_t)
+				reply->data[ctrl->dev_table_idx[idx]];
+	}
+	return 0;
+}
+
+/**
+ * Read device counters.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] stats
+ *   Counters table output buffer.
+ *
+ * @return
+ *   0 on success and stats is filled, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+	int ret = 0, i;
+
+	memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
+	/* Read ifreq counters. */
+	if (priv->master && priv->pf_bond >= 0) {
+		/* Sum xstats from bonding device member ports. */
+		for (i = 0; i < priv->sh->bond.n_port; i++) {
+			ret = _mlx5_os_read_dev_counters(dev, i, stats);
+			if (ret)
+				return ret;
+		}
+	} else {
+		ret = _mlx5_os_read_dev_counters(dev, -1, stats);
+	}
+	/* Read IB counters. */
+	for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+		if (!xstats_ctrl->info[i].dev)
+			continue;
+		ret = mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
+					    &stats[i]);
+		/* Cache a fresh value; return the last one if fail to read. */
+		if (ret == 0)
+			xstats_ctrl->xstats[i] = stats[i];
+		else
+			stats[i] = xstats_ctrl->xstats[i];
+	}
+	return ret;
+}
+
+/**
+ * Query the number of statistics provided by ETHTOOL, as reported in the
+ * n_stats field of the ETHTOOL_GDRVINFO reply.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   Number of statistics on success, negative errno value otherwise and
+ *   rte_errno is set.
+ */
+int
+mlx5_os_get_stats_n(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	const bool bonding_pf = priv->master && priv->pf_bond >= 0;
+	struct ethtool_drvinfo info;
+	struct ifreq req;
+	int rc;
+
+	info.cmd = ETHTOOL_GDRVINFO;
+	req.ifr_data = (caddr_t)&info;
+	/* A bonding PF is queried through its first member port. */
+	rc = bonding_pf ?
+	     mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+				  SIOCETHTOOL, &req) :
+	     mlx5_ifreq(dev, SIOCETHTOOL, &req);
+	if (rc == 0)
+		return info.n_stats;
+	DRV_LOG(WARNING, "port %u unable to query number of statistics",
+		dev->data->port_id);
+	return rc;
+}
+
+static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
+	{
+		.dpdk_name = "rx_unicast_bytes",
+		.ctr_name = "rx_vport_unicast_bytes",
+	},
+	{
+		.dpdk_name = "rx_multicast_bytes",
+		.ctr_name = "rx_vport_multicast_bytes",
+	},
+	{
+		.dpdk_name = "rx_broadcast_bytes",
+		.ctr_name = "rx_vport_broadcast_bytes",
+	},
+	{
+		.dpdk_name = "rx_unicast_packets",
+		.ctr_name = "rx_vport_unicast_packets",
+	},
+	{
+		.dpdk_name = "rx_multicast_packets",
+		.ctr_name = "rx_vport_multicast_packets",
+	},
+	{
+		.dpdk_name = "rx_broadcast_packets",
+		.ctr_name = "rx_vport_broadcast_packets",
+	},
+	{
+		.dpdk_name = "tx_unicast_bytes",
+		.ctr_name = "tx_vport_unicast_bytes",
+	},
+	{
+		.dpdk_name = "tx_multicast_bytes",
+		.ctr_name = "tx_vport_multicast_bytes",
+	},
+	{
+		.dpdk_name = "tx_broadcast_bytes",
+		.ctr_name = "tx_vport_broadcast_bytes",
+	},
+	{
+		.dpdk_name = "tx_unicast_packets",
+		.ctr_name = "tx_vport_unicast_packets",
+	},
+	{
+		.dpdk_name = "tx_multicast_packets",
+		.ctr_name = "tx_vport_multicast_packets",
+	},
+	{
+		.dpdk_name = "tx_broadcast_packets",
+		.ctr_name = "tx_vport_broadcast_packets",
+	},
+	{
+		.dpdk_name = "rx_wqe_errors",
+		.ctr_name = "rx_wqe_err",
+	},
+	{
+		.dpdk_name = "rx_phy_crc_errors",
+		.ctr_name = "rx_crc_errors_phy",
+	},
+	{
+		.dpdk_name = "rx_phy_in_range_len_errors",
+		.ctr_name = "rx_in_range_len_errors_phy",
+	},
+	{
+		.dpdk_name = "rx_phy_symbol_errors",
+		.ctr_name = "rx_symbol_err_phy",
+	},
+	{
+		.dpdk_name = "tx_phy_errors",
+		.ctr_name = "tx_errors_phy",
+	},
+	{
+		.dpdk_name = "rx_out_of_buffer",
+		.ctr_name = "out_of_buffer",
+		.dev = 1, /* read via mlx5_os_read_dev_stat(), not ETHTOOL */
+	},
+	{
+		.dpdk_name = "tx_phy_packets",
+		.ctr_name = "tx_packets_phy",
+	},
+	{
+		.dpdk_name = "rx_phy_packets",
+		.ctr_name = "rx_packets_phy",
+	},
+	{
+		.dpdk_name = "tx_phy_discard_packets",
+		.ctr_name = "tx_discards_phy",
+	},
+	{
+		.dpdk_name = "rx_phy_discard_packets",
+		.ctr_name = "rx_discards_phy",
+	},
+	{
+		.dpdk_name = "tx_phy_bytes",
+		.ctr_name = "tx_bytes_phy",
+	},
+	{
+		.dpdk_name = "rx_phy_bytes",
+		.ctr_name = "rx_bytes_phy",
+	},
+	/* Representor only */
+	{
+		.dpdk_name = "rx_vport_packets",
+		.ctr_name = "vport_rx_packets",
+	},
+	{
+		.dpdk_name = "rx_vport_bytes",
+		.ctr_name = "vport_rx_bytes",
+	},
+	{
+		.dpdk_name = "tx_vport_packets",
+		.ctr_name = "vport_tx_packets",
+	},
+	{
+		.dpdk_name = "tx_vport_bytes",
+		.ctr_name = "vport_tx_bytes",
+	},
+};
+/* Number of entries in the mlx5_counters_init table above. */
+static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
+
+/**
+ * Init the structures to read device counters (failures are only logged).
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_os_stats_init(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+	struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+	unsigned int i;
+	unsigned int j;
+	struct ifreq ifr;
+	struct ethtool_gstrings *strings = NULL;
+	unsigned int dev_stats_n;
+	unsigned int str_sz;
+	int ret;
+
+	/* Restart from zero so that repeated inits do not pile up entries. */
+	xstats_ctrl->mlx5_stats_n = 0;
+	ret = mlx5_os_get_stats_n(dev);
+	if (ret < 0) {
+		DRV_LOG(WARNING, "port %u no extended statistics available",
+			dev->data->port_id);
+		return;
+	}
+	dev_stats_n = ret;
+	/* Allocate memory to grab stat names, ETH_GSTRING_LEN bytes each. */
+	str_sz = dev_stats_n * ETH_GSTRING_LEN;
+	strings = (struct ethtool_gstrings *)
+		  mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
+			      SOCKET_ID_ANY);
+	if (!strings) {
+		DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
+		     dev->data->port_id);
+		return;
+	}
+	strings->cmd = ETHTOOL_GSTRINGS;
+	strings->string_set = ETH_SS_STATS;
+	strings->len = dev_stats_n;
+	ifr.ifr_data = (caddr_t)strings;
+	if (priv->master && priv->pf_bond >= 0)
+		/* Bonding master: names come from the first member port. */
+		ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+					   SIOCETHTOOL, &ifr);
+	else
+		ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+	if (ret) {
+		DRV_LOG(WARNING, "port %u unable to get statistic names",
+			dev->data->port_id);
+		goto free;
+	}
+	for (i = 0; i != dev_stats_n; ++i) {
+		const char *curr_string = (const char *)
+			&strings->data[i * ETH_GSTRING_LEN];
+
+		for (j = 0; j != xstats_n; ++j) {
+			if (!strcmp(mlx5_counters_init[j].ctr_name,
+				    curr_string)) {
+				unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+				xstats_ctrl->dev_table_idx[idx] = i;
+				xstats_ctrl->info[idx] = mlx5_counters_init[j];
+				break;
+			}
+		}
+	}
+	/* Append the counters flagged .dev, which get no ETHTOOL index. */
+	for (i = 0; i != xstats_n; ++i) {
+		if (mlx5_counters_init[i].dev) {
+			unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+			xstats_ctrl->info[idx] = mlx5_counters_init[i];
+			xstats_ctrl->hw_stats[idx] = 0;
+		}
+	}
+	MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
+	xstats_ctrl->stats_n = dev_stats_n;
+	/* Take the current counter values as the base. */
+	ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
+	if (ret)
+		DRV_LOG(ERR, "port %u cannot read device counters: %s",
+			dev->data->port_id, strerror(rte_errno));
+	mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
+	stats_ctrl->imissed = 0;
+free:
+	mlx5_free(strings);
+}
+
+/**
+ * Get MAC address by querying the netdevice with SIOCGIFHWADDR.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] mac
+ *   MAC address output buffer; RTE_ETHER_ADDR_LEN bytes are written on
+ *   success and nothing is written on failure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
+{
+	struct ifreq req;
+	const int rc = mlx5_ifreq(dev, SIOCGIFHWADDR, &req);
+
+	/* The hardware address is taken from the ifr_hwaddr field. */
+	if (rc == 0)
+		memcpy(mac, req.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
+	return rc;
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_flow_os.c b/drivers/net/mlx5/freebsd/mlx5_flow_os.c
new file mode 100644
index 0000000000..893f00b824
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_flow_os.c
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include "mlx5_flow_os.h"
+
+#include <rte_thread.h>
+
+/* Key of the thread specific flow workspace data, shared by this file. */
+static rte_thread_key key_workspace;
+
+int
+mlx5_flow_os_init_workspace_once(void)
+{
+	if (!rte_thread_key_create(&key_workspace, flow_release_workspace))
+		return 0;
+	/* Any key creation failure is reported to the caller as -ENOMEM. */
+	DRV_LOG(ERR, "Can't create flow workspace data thread key.");
+	return -ENOMEM;
+}
+
+void *
+mlx5_flow_os_get_specific_workspace(void)
+{
+	return rte_thread_value_get(key_workspace); /* per-thread value */
+}
+
+int
+mlx5_flow_os_set_specific_workspace(struct mlx5_flow_workspace *data)
+{
+	return rte_thread_value_set(key_workspace, data); /* result passed on */
+}
+
+void
+mlx5_flow_os_release_workspace(void)
+{
+	rte_thread_key_delete(key_workspace); /* drops the key made at init */
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_flow_os.h b/drivers/net/mlx5/freebsd/mlx5_flow_os.h
new file mode 100644
index 0000000000..1926d26410
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_flow_os.h
@@ -0,0 +1,484 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_FLOW_OS_H_
+#define RTE_PMD_MLX5_FLOW_OS_H_
+
+#include "mlx5_flow.h"
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops;
+#endif
+
+/**
+ * Get OS enforced flow type. MLX5_FLOW_TYPE_MAX means "non enforced type".
+ *
+ * @return
+ *   Always MLX5_FLOW_TYPE_MAX, i.e. no flow type is enforced here.
+ */
+static inline enum mlx5_flow_drv_type
+mlx5_flow_os_get_type(void)
+{
+	return MLX5_FLOW_TYPE_MAX;
+}
+
+/**
+ * Check if item type is supported.
+ *
+ * @param item
+ *   Item type to check (unused by this implementation).
+ *
+ * @return
+ *   Always true: every item type is reported as supported.
+ */
+static inline bool
+mlx5_flow_os_item_supported(int item __rte_unused)
+{
+	return true;
+}
+
+/**
+ * Check if action type is supported.
+ *
+ * @param action
+ *   Action type to check (unused by this implementation).
+ *
+ * @return
+ *   Always true: every action type is reported as supported.
+ */
+static inline bool
+mlx5_flow_os_action_supported(int action __rte_unused)
+{
+	return true;
+}
+
+/**
+ * Create flow rule via mlx5_glue->dv_create_flow().
+ *
+ * @param[in] matcher
+ *   Pointer to match mask structure.
+ * @param[in] match_value
+ *   Pointer to match value structure.
+ * @param[in] num_actions
+ *   Number of actions in flow rule.
+ * @param[in] actions
+ *   Pointer to array of flow rule actions.
+ * @param[out] flow
+ *   Receives the glue call result: a flow rule object, or NULL on failure.
+ *
+ * @return
+ *   0 when a flow rule object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow(void *matcher, void *match_value,
+			 size_t num_actions, void *actions[], void **flow)
+{
+	*flow = mlx5_glue->dv_create_flow(matcher, match_value,
+					  num_actions, actions);
+	return *flow != NULL ? 0 : -1;
+}
+
+/**
+ * Destroy flow rule via mlx5_glue->dv_destroy_flow().
+ *
+ * @param[in] drv_flow_ptr
+ *   Pointer to the flow rule object to destroy.
+ *
+ * @return
+ *   The glue call result, unchanged: 0 on success, errno value on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow(void *drv_flow_ptr)
+{
+	return mlx5_glue->dv_destroy_flow(drv_flow_ptr);
+}
+
+/**
+ * Create flow table via mlx5_glue->dr_create_flow_tbl().
+ *
+ * @param[in] domain
+ *   Pointer to relevant domain.
+ * @param[in] table_id
+ *   Table ID.
+ * @param[out] table
+ *   Receives the glue call result: a flow table object, or NULL on failure.
+ *
+ * @return
+ *   0 when a flow table object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_tbl(void *domain, uint32_t table_id, void **table)
+{
+	*table = mlx5_glue->dr_create_flow_tbl(domain, table_id);
+	return *table != NULL ? 0 : -1;
+}
+
+/**
+ * Destroy flow table via mlx5_glue->dr_destroy_flow_tbl().
+ *
+ * @param[in] table
+ *   Pointer to table object to destroy.
+ *
+ * @return
+ *   The glue call result, unchanged: 0 on success, errno value on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_tbl(void *table)
+{
+	return mlx5_glue->dr_destroy_flow_tbl(table);
+}
+
+/**
+ * Create flow matcher in a flow table via the dv_create_flow_matcher glue.
+ *
+ * @param[in] ctx
+ *   Pointer to relevant device context.
+ * @param[in] attr
+ *   Pointer to relevant attributes.
+ * @param[in] table
+ *   Pointer to table object.
+ * @param[out] matcher
+ *   Receives the glue call result: a matcher object, or NULL on failure.
+ *
+ * @return
+ *   0 when a matcher object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_matcher(void *ctx, void *attr, void *table,
+				 void **matcher)
+{
+	*matcher = mlx5_glue->dv_create_flow_matcher(ctx, attr, table);
+	return *matcher != NULL ? 0 : -1;
+}
+
+/**
+ * Destroy flow matcher via mlx5_glue->dv_destroy_flow_matcher().
+ *
+ * @param[in] matcher
+ *   Pointer to matcher object to destroy.
+ *
+ * @return
+ *   The glue call result, unchanged: 0 on success, errno value on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_matcher(void *matcher)
+{
+	return mlx5_glue->dv_destroy_flow_matcher(matcher);
+}
+
+/**
+ * Create flow action: packet reformat.
+ *
+ * @param[in] ctx
+ *   Pointer to relevant device context.
+ * @param[in] domain
+ *   Pointer to domain handler.
+ * @param[in] resource
+ *   Pointer to action data resource, used as an encap/decap resource.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_packet_reformat(void *ctx, void *domain,
+						void *resource, void **action)
+{
+	struct mlx5_flow_dv_encap_decap_resource *encap = resource;
+
+	/* A zero-sized resource is passed on without a data buffer. */
+	*action = mlx5_glue->dv_create_flow_action_packet_reformat
+					(ctx, encap->reformat_type,
+					 encap->ft_type, domain, encap->flags,
+					 encap->size,
+					 encap->size ? encap->buf : NULL);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: modify header.
+ *
+ * @param[in] ctx
+ *   Pointer to relevant device context.
+ * @param[in] domain
+ *   Pointer to domain handler.
+ * @param[in] resource
+ *   Pointer to action data resource, used as a modify header resource.
+ * @param[in] actions_len
+ *   Total length of actions data in resource.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_modify_header(void *ctx, void *domain,
+					      void *resource,
+					      uint32_t actions_len,
+					      void **action)
+{
+	struct mlx5_flow_dv_modify_hdr_resource *mhdr = resource;
+	/* Only a resource marked as root carries the ROOT_LEVEL flag. */
+	uint64_t flags = mhdr->root ? MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL : 0;
+
+	*action = mlx5_glue->dv_create_flow_action_modify_header
+					(ctx, mhdr->ft_type, domain, flags,
+					 actions_len,
+					 (uint64_t *)mhdr->actions);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: destination flow table.
+ *
+ * @param[in] tbl_obj
+ *   Pointer to destination table object.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_flow_tbl(void *tbl_obj, void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_dest_flow_tbl(tbl_obj);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: destination port.
+ *
+ * @param[in] domain
+ *   Pointer to domain handler.
+ * @param[in] port_id
+ *   Destination port ID.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_port(void *domain, uint32_t port_id,
+					  void **action)
+{
+	/*
+	 * The glue routine hides the rdma_core version difference: it ends
+	 * up in mlx5dv_dr_action_create_dest_ib_port(domain, dev_port) or
+	 * in mlx5dv_dr_action_create_dest_vport(domain, vport_id).
+	 */
+	*action = mlx5_glue->dr_create_flow_action_dest_port(domain, port_id);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: push vlan.
+ *
+ * @param[in] domain
+ *   Pointer to domain handler.
+ * @param[in] vlan_tag
+ *   VLAN tag value.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_push_vlan(void *domain, rte_be32_t vlan_tag,
+					  void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_push_vlan(domain, vlan_tag);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: count.
+ *
+ * @param[in] cnt_obj
+ *   Pointer to DevX counter object.
+ * @param[in] offset
+ *   Offset of counter in array.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_count(void *cnt_obj, uint16_t offset,
+				      void **action)
+{
+	*action = mlx5_glue->dv_create_flow_action_counter(cnt_obj, offset);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: tag.
+ *
+ * @param[in] tag
+ *   Tag value.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_tag(uint32_t tag, void **action)
+{
+	*action = mlx5_glue->dv_create_flow_action_tag(tag);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: drop.
+ *
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_drop(void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_drop();
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: default miss.
+ *
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_flow_os_create_flow_action_default_miss(void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_default_miss();
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: dest_devx_tir
+ *
+ * @param[in] tir
+ *   Pointer to DevX tir object
+ * @param[out] action
+ *   Receives the action object; NULL on failure or without DV support.
+ *
+ * @return
+ *   0 on success (always without DV support), -1 on failure.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_devx_tir(struct mlx5_devx_obj *tir,
+					      void **action)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	*action = mlx5_glue->dv_create_flow_action_dest_devx_tir(tir->obj);
+	return *action != NULL ? 0 : -1;
+#else
+	/* Without DV support nothing is created, yet success is reported. */
+	RTE_SET_USED(tir);
+	*action = NULL;
+	return 0;
+#endif
+}
+
+/**
+ * Create flow action: sampler
+ *
+ * @param[in] attr
+ *   Pointer to sampler attribute
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_os_flow_dr_create_flow_action_sampler
+			(struct mlx5dv_dr_flow_sampler_attr *attr,
+			void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_sampler(attr);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Create flow action: dest_array
+ *
+ * @param[in] domain
+ *   Pointer to relevant domain.
+ * @param[in] num_dest
+ *   Number of entries in the destinations array.
+ * @param[in] dests
+ *   Array of destination attributes.
+ * @param[out] action
+ *   Receives the glue call result: an action object, or NULL on failure.
+ *
+ * @return
+ *   0 when an action object was obtained, -1 otherwise (errno is set).
+ */
+static inline int
+mlx5_os_flow_dr_create_flow_action_dest_array
+			(void *domain,
+			 size_t num_dest,
+			 struct mlx5dv_dr_action_dest_attr *dests[],
+			 void **action)
+{
+	*action = mlx5_glue->dr_create_flow_action_dest_array(domain,
+							      num_dest, dests);
+	return *action != NULL ? 0 : -1;
+}
+
+/**
+ * Destroy flow action via mlx5_glue->destroy_flow_action().
+ *
+ * @param[in] action
+ *   Pointer to action object to destroy.
+ *
+ * @return
+ *   The glue call result, unchanged: 0 on success, errno value on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_action(void *action)
+{
+	return mlx5_glue->destroy_flow_action(action);
+}
+
+/**
+ * OS wrapper over Verbs API: forwards to mlx5_flow_adjust_priority().
+ * Adjust flow priority based on the highest layer and the request priority.
+ *
+ * @param[in] dev
+ *    Pointer to the Ethernet device structure.
+ * @param[in] priority
+ *    The rule base priority.
+ * @param[in] subpriority
+ *    The priority based on the items.
+ *
+ * @return
+ *    The new priority, exactly as computed by the wrapped function.
+ */
+static inline uint32_t
+mlx5_os_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
+			  uint32_t subpriority)
+{
+	return mlx5_flow_adjust_priority(dev, priority, subpriority);
+}
+
+static inline int
+mlx5_os_flow_dr_sync_domain(void *domain, uint32_t flags)
+{
+	return mlx5_glue->dr_sync_domain(domain, flags); /* passed as is */
+}
+#endif /* RTE_PMD_MLX5_FLOW_OS_H_ */
diff --git a/drivers/net/mlx5/freebsd/mlx5_mp_os.c b/drivers/net/mlx5/freebsd/mlx5_mp_os.c
new file mode 100644
index 0000000000..3a4aa766f8
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_mp_os.c
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rx.h"
+#include "mlx5_tx.h"
+#include "mlx5_utils.h"
+
+/**
+ * IPC message handler of the primary process.
+ *
+ * Validates the port ID carried in the request, then dispatches on the
+ * request type. Every recognized request is answered through rte_mp_reply()
+ * with a message prepared by mp_init_msg().
+ *
+ * @param[in] mp_msg
+ *   Received request; its param area is read as struct mlx5_mp_param.
+ * @param[in] peer
+ *   Peer handle passed through to rte_mp_reply().
+ *
+ * @return
+ *   The value returned by rte_mp_reply() for recognized requests,
+ *   -ENODEV for an invalid port ID and -EINVAL for an unknown request type
+ *   (rte_errno is set in both error cases).
+ */
+int
+mlx5_mp_os_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+	struct rte_mp_msg mp_res;
+	/* The reply payload lives inside mp_res.param. */
+	struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+	const struct mlx5_mp_param *param =
+		(const struct mlx5_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	struct mlx5_priv *priv;
+	struct mr_cache_entry entry;
+	uint32_t lkey;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		rte_errno = ENODEV;
+		DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+		return -rte_errno;
+	}
+	dev = &rte_eth_devices[param->port_id];
+	priv = dev->data->dev_private;
+	switch (param->type) {
+	case MLX5_MP_REQ_CREATE_MR:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		lkey = mlx5_mr_create_primary(priv->sh->pd,
+					      &priv->sh->share_cache,
+					      &entry, param->args.addr,
+					      priv->config.mr_ext_memseg_en);
+		/*
+		 * Only the failure result is written here.
+		 * NOTE(review): the success value presumably comes from
+		 * mp_init_msg() initializing the reply - confirm.
+		 */
+		if (lkey == UINT32_MAX)
+			res->result = -rte_errno;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_VERBS_CMD_FD:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		/* Attach the Verbs command fd of the shared context. */
+		mp_res.num_fds = 1;
+		mp_res.fds[0] = ((struct ibv_context *)priv->sh->ctx)->cmd_fd;
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_STATE_MODIFY:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_queue_state_modify_primary
+					(dev, &param->args.state_modify);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_RX_STOP:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_rx_queue_stop_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_RX_START:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_rx_queue_start_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_TX_STOP:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_tx_queue_stop_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_QUEUE_TX_START:
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = mlx5_tx_queue_start_primary
+					(dev, param->args.queue_id.queue_id);
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		/* Unknown request type: no reply is sent. */
+		rte_errno = EINVAL;
+		DRV_LOG(ERR, "port %u invalid mp request type",
+			dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * Handles the start/stop data-path requests: swaps the Rx/Tx burst
+ * callbacks of the port and answers through rte_mp_reply().
+ *
+ * @param[in] mp_msg
+ *   Received request; its param area is read as struct mlx5_mp_param and,
+ *   for the start request, fds[0] is used to re-initialize the Tx UAR.
+ * @param[in] peer
+ *   Pointer to the peer socket path.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_mp_os_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+struct rte_mp_msg mp_res;
+	/* The reply payload lives inside mp_res.param. */
+	struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+	const struct mlx5_mp_param *param =
+		(const struct mlx5_mp_param *)mp_msg->param;
+	struct rte_eth_dev *dev;
+	struct mlx5_proc_priv *ppriv;
+	struct mlx5_priv *priv;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	if (!rte_eth_dev_is_valid_port(param->port_id)) {
+		rte_errno = ENODEV;
+		DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+		return -rte_errno;
+	}
+	dev = &rte_eth_devices[param->port_id];
+	priv = dev->data->dev_private;
+	switch (param->type) {
+	case MLX5_MP_REQ_START_RXTX:
+		DRV_LOG(INFO, "port %u starting datapath", dev->data->port_id);
+		dev->rx_pkt_burst = mlx5_select_rx_function(dev);
+		dev->tx_pkt_burst = mlx5_select_tx_function(dev);
+		ppriv = (struct mlx5_proc_priv *)dev->process_private;
+		/* If Tx queue number changes, re-initialize UAR. */
+		if (ppriv->uar_table_sz != priv->txqs_n) {
+			mlx5_tx_uar_uninit_secondary(dev);
+			mlx5_proc_priv_uninit(dev);
+			ret = mlx5_proc_priv_init(dev);
+			/* Failure paths below return without replying. */
+			if (ret)
+				return -rte_errno;
+			ret = mlx5_tx_uar_init_secondary(dev, mp_msg->fds[0]);
+			if (ret) {
+				mlx5_proc_priv_uninit(dev);
+				return -rte_errno;
+			}
+		}
+		/* Barrier is issued before the reply is sent. */
+		rte_mb();
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	case MLX5_MP_REQ_STOP_RXTX:
+		DRV_LOG(INFO, "port %u stopping datapath", dev->data->port_id);
+		/* Install the "removed" burst callbacks on both directions. */
+		dev->rx_pkt_burst = removed_rx_burst;
+		dev->tx_pkt_burst = removed_tx_burst;
+		rte_mb();
+		mp_init_msg(&priv->mp_id, &mp_res, param->type);
+		res->result = 0;
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+	default:
+		/* Unknown request type: no reply is sent. */
+		rte_errno = EINVAL;
+		DRV_LOG(ERR, "port %u invalid mp request type",
+			dev->data->port_id);
+		return -rte_errno;
+	}
+	return ret;
+}
+
+/**
+ * Ask every secondary process to start or stop its data-path and wait for
+ * the answers.
+ *
+ * Nothing is sent when no secondary process is registered or when the
+ * request type is neither the start nor the stop request. Failures are only
+ * logged; the reply array is released before returning.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param[in] type
+ *   Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx5_mp_req_type type)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct timespec timeout = {
+		.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC,
+		.tv_nsec = 0,
+	};
+	struct rte_mp_reply reply;
+	struct rte_mp_msg request;
+	int idx;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (!mlx5_shared_data->secondary_cnt)
+		return;
+	if (!(type == MLX5_MP_REQ_START_RXTX ||
+	      type == MLX5_MP_REQ_STOP_RXTX)) {
+		DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+			dev->data->port_id, type);
+		return;
+	}
+	mp_init_msg(&priv->mp_id, &request, type);
+	if (type == MLX5_MP_REQ_START_RXTX) {
+		/* The start request carries the Verbs command fd. */
+		request.num_fds = 1;
+		request.fds[0] = ((struct ibv_context *)priv->sh->ctx)->cmd_fd;
+	}
+	if (rte_mp_request_sync(&request, &reply, &timeout)) {
+		if (rte_errno != ENOTSUP)
+			DRV_LOG(ERR, "port %u failed to request stop/start Rx/Tx (%d)",
+				dev->data->port_id, type);
+	} else if (reply.nb_sent != reply.nb_received) {
+		DRV_LOG(ERR,
+			"port %u not all secondaries responded (req_type %d)",
+			dev->data->port_id, type);
+	} else {
+		/* Stop at the first secondary reporting a failure. */
+		for (idx = 0; idx < reply.nb_received; idx++) {
+			const struct mlx5_mp_param *answer =
+				(const struct mlx5_mp_param *)
+				reply.msgs[idx].param;
+
+			if (answer->result) {
+				DRV_LOG(ERR, "port %u request failed on secondary #%d",
+					dev->data->port_id, idx);
+				break;
+			}
+		}
+	}
+	mlx5_free(reply.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * Delegates to mp_req_on_rxtx() with MLX5_MP_REQ_START_RXTX; no status is
+ * reported back to the caller.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_os_req_start_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX5_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * Delegates to mp_req_on_rxtx() with MLX5_MP_REQ_STOP_RXTX; no status is
+ * reported back to the caller.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_os_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+	mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX);
+}
+
+/**
+ * Request Verbs Rx/Tx queue stop or start to the primary process.
+ *
+ * The request is sent synchronously and the result field of the single
+ * expected reply is returned. A request that completes without exactly one
+ * stored reply is reported as an I/O error instead of being dereferenced.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet structure.
+ * @param queue_id
+ *   Queue ID to control.
+ * @param req_type
+ *   request type
+ *     MLX5_MP_REQ_QUEUE_RX_START - start Rx queue
+ *     MLX5_MP_REQ_QUEUE_TX_START - start Tx queue
+ *     MLX5_MP_REQ_QUEUE_RX_STOP - stop Rx queue
+ *     MLX5_MP_REQ_QUEUE_TX_STOP - stop Tx queue
+ * @return
+ *   0 on success, a negative errno value otherwise and
+ *     rte_errno is set.
+ */
+int
+mlx5_mp_os_req_queue_control(struct rte_eth_dev *dev, uint16_t queue_id,
+			  enum mlx5_mp_req_type req_type)
+{
+	struct rte_mp_msg mp_req;
+	struct rte_mp_msg *mp_res;
+	struct rte_mp_reply mp_rep;
+	struct mlx5_mp_param *req = (struct mlx5_mp_param *)mp_req.param;
+	struct mlx5_mp_param *res;
+	struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+	struct mlx5_priv *priv;
+	int ret;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+	priv = dev->data->dev_private;
+	mp_init_msg(&priv->mp_id, &mp_req, req_type);
+	req->args.queue_id.queue_id = queue_id;
+	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+	if (ret) {
+		DRV_LOG(ERR, "port %u request to primary process failed",
+			dev->data->port_id);
+		return -rte_errno;
+	}
+	/*
+	 * The request can succeed without a stored reply; checking at run
+	 * time keeps release builds from dereferencing a NULL array.
+	 */
+	if (mp_rep.nb_received != 1 || mp_rep.msgs == NULL) {
+		DRV_LOG(ERR, "port %u no reply from primary process",
+			dev->data->port_id);
+		free(mp_rep.msgs);
+		rte_errno = EIO;
+		return -rte_errno;
+	}
+	mp_res = &mp_rep.msgs[0];
+	res = (struct mlx5_mp_param *)mp_res->param;
+	ret = res->result;
+	/* The reply array is owned by the caller of rte_mp_request_sync(). */
+	free(mp_rep.msgs);
+	return ret;
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_os.c b/drivers/net/mlx5/freebsd/mlx5_os.c
new file mode 100644
index 0000000000..3746057673
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_os.c
@@ -0,0 +1,3208 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <net/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/sockios.h>
+#include <linux/ethtool.h>
+#include <fcntl.h>
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_bus_auxiliary.h>
+#include <rte_common.h>
+#include <rte_kvargs.h>
+#include <rte_rwlock.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+#include <rte_alarm.h>
+#include <rte_eal_paging.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_common_os.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rx.h"
+#include "mlx5_tx.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_mr.h"
+#include "mlx5_flow.h"
+#include "rte_pmd_mlx5.h"
+#include "mlx5_verbs.h"
+#include "mlx5_nl.h"
+#include "mlx5_devx.h"
+
+/*
+ * Fallback definitions of the MPW context flags, used only when the build
+ * configuration did not define HAVE_IBV_MLX5_MOD_MPW.
+ */
+#ifndef HAVE_IBV_MLX5_MOD_MPW
+#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
+#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
+#endif
+
+/*
+ * Fallback definition of the 128B CQE compression context flag, used only
+ * when HAVE_IBV_MLX5_MOD_CQE_128B_COMP is not defined.
+ */
+#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
+#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
+#endif
+
+/* Name of the memzone holding the data shared between processes. */
+static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
+
+/* Spinlock for mlx5_shared_data allocation. */
+static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx5_local_data mlx5_local_data;
+
+/*
+ * rte flow indexed pool configuration.
+ * Three pools of struct rte_flow entries are described; they differ in
+ * their type name, trunk growth settings and per-core cache size.
+ */
+static struct mlx5_indexed_pool_config icfg[] = {
+	{
+		/* No grow_trunk/grow_shift given: both stay zero. */
+		.size = sizeof(struct rte_flow),
+		.trunk_size = 64,
+		.need_lock = 1,
+		.release_mem_en = 0,
+		.malloc = mlx5_malloc,
+		.free = mlx5_free,
+		.per_core_cache = 0,
+		.type = "ctl_flow_ipool",
+	},
+	{
+		/* The only entry with a per-core cache configured. */
+		.size = sizeof(struct rte_flow),
+		.trunk_size = 64,
+		.grow_trunk = 3,
+		.grow_shift = 2,
+		.need_lock = 1,
+		.release_mem_en = 0,
+		.malloc = mlx5_malloc,
+		.free = mlx5_free,
+		.per_core_cache = 1 << 14,
+		.type = "rte_flow_ipool",
+	},
+	{
+		.size = sizeof(struct rte_flow),
+		.trunk_size = 64,
+		.grow_trunk = 3,
+		.grow_shift = 2,
+		.need_lock = 1,
+		.release_mem_en = 0,
+		.malloc = mlx5_malloc,
+		.free = mlx5_free,
+		.per_core_cache = 0,
+		.type = "mcp_flow_ipool",
+	},
+};
+
+/**
+ * Set the completion channel file descriptor interrupt as non-blocking.
+ *
+ * The current status flags are read first and O_NONBLOCK is added to them,
+ * so the other flags of the descriptor are preserved.
+ *
+ * @param[in] fd
+ *   The file descriptor (representing the interrupt) used in this channel.
+ *
+ * @return
+ *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
+ */
+int
+mlx5_os_set_nonblock_channel_fd(int fd)
+{
+	int flags;
+
+	flags = fcntl(fd, F_GETFL);
+	/* Reading failed: never feed the -1 error value back as flags. */
+	if (flags == -1)
+		return -1;
+	return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+/**
+ * Get mlx5 device attributes. The glue function query_device_ex() is called
+ * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
+ * device attributes from the glue out parameter. A second glue query,
+ * dv_query_device(), supplies the mlx5-specific fields.
+ *
+ * @param ctx
+ *   Pointer to ibv context.
+ *
+ * @param device_attr
+ *   Pointer to mlx5 device attributes. It is zeroed before any query, so a
+ *   failed first query leaves it cleared.
+ *
+ * @return
+ *   0 on success, non zero error number otherwise
+ */
+int
+mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
+{
+	int err;
+	struct ibv_device_attr_ex attr_ex;
+	memset(device_attr, 0, sizeof(*device_attr));
+	err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
+	if (err)
+		return err;
+
+	/* Generic Verbs attributes. */
+	device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
+	device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
+	device_attr->max_sge = attr_ex.orig_attr.max_sge;
+	device_attr->max_cq = attr_ex.orig_attr.max_cq;
+	device_attr->max_cqe = attr_ex.orig_attr.max_cqe;
+	device_attr->max_mr = attr_ex.orig_attr.max_mr;
+	device_attr->max_pd = attr_ex.orig_attr.max_pd;
+	device_attr->max_qp = attr_ex.orig_attr.max_qp;
+	device_attr->max_srq = attr_ex.orig_attr.max_srq;
+	device_attr->max_srq_wr = attr_ex.orig_attr.max_srq_wr;
+	device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
+	device_attr->max_rwq_indirection_table_size =
+		attr_ex.rss_caps.max_rwq_indirection_table_size;
+	device_attr->max_tso = attr_ex.tso_caps.max_tso;
+	device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;
+
+	/* mlx5 direct-verbs attributes; queried with an empty comp_mask. */
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	err = mlx5_glue->dv_query_device(ctx, &dv_attr);
+	if (err)
+		return err;
+
+	device_attr->flags = dv_attr.flags;
+	device_attr->comp_mask = dv_attr.comp_mask;
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+	device_attr->sw_parsing_offloads =
+		dv_attr.sw_parsing_caps.sw_parsing_offloads;
+#endif
+	device_attr->min_single_stride_log_num_of_bytes =
+		dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
+	device_attr->max_single_stride_log_num_of_bytes =
+		dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
+	device_attr->min_single_wqe_log_num_of_strides =
+		dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
+	device_attr->max_single_wqe_log_num_of_strides =
+		dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
+	device_attr->stride_supported_qpts =
+		dv_attr.striding_rq_caps.supported_qpts;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
+#endif
+	/* Bounded copy of the firmware version string. */
+	strlcpy(device_attr->fw_ver, attr_ex.orig_attr.fw_ver,
+		sizeof(device_attr->fw_ver));
+
+	/* err is 0 here: both queries succeeded. */
+	return err;
+}
+
+/**
+ * Verbs callback allocating a buffer.
+ *
+ * The buffer is obtained from mlx5_malloc() with an alignment equal to the
+ * memory page size reported by rte_mem_page_size(), on the NUMA node stored
+ * in the shared device context passed as callback data.
+ *
+ * @param[in] size
+ *   The size in bytes of the memory to allocate.
+ * @param[in] data
+ *   A pointer to the callback data (shared device context).
+ *
+ * @return
+ *   Allocated buffer, NULL otherwise and rte_errno is set.
+ */
+static void *
+mlx5_alloc_verbs_buf(size_t size, void *data)
+{
+	const size_t page_sz = rte_mem_page_size();
+	struct mlx5_dev_ctx_shared *shared;
+	void *buf;
+
+	if (page_sz == (size_t)-1) {
+		DRV_LOG(ERR, "Failed to get mem page size");
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	MLX5_ASSERT(data != NULL);
+	shared = data;
+	buf = mlx5_malloc(0, size, page_sz, shared->numa_node);
+	/* A NULL result for a zero-sized request is not reported as error. */
+	if (buf == NULL && size != 0)
+		rte_errno = ENOMEM;
+	return buf;
+}
+
+/**
+ * Detect misc5 support or not
+ *
+ * A throw-away matcher using the misc5 tunnel header field is created on a
+ * temporary table of the Rx domain; when the creation succeeds the misc5_cap
+ * flag of the shared context is set. Both temporary objects are destroyed
+ * before returning.
+ *
+ * @param[in] priv
+ *   Device private data pointer
+ */
+#ifdef HAVE_MLX5DV_DR
+static void
+__mlx5_discovery_misc5_cap(struct mlx5_priv *priv)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	/* Dummy VxLAN matcher to detect rdma-core misc5 cap
+	 * Case: IPv4--->UDP--->VxLAN--->vni
+	 */
+	void *tbl;
+	struct mlx5_flow_dv_match_params matcher_mask;
+	void *match_m;
+	void *matcher;
+	void *headers_m;
+	void *misc5_m;
+	uint32_t *tunnel_header_m;
+	struct mlx5dv_flow_matcher_attr dv_attr;
+
+	memset(&matcher_mask, 0, sizeof(matcher_mask));
+	/* Do not hand uninitialized attribute fields to the glue layer. */
+	memset(&dv_attr, 0, sizeof(dv_attr));
+	matcher_mask.size = sizeof(matcher_mask.buf);
+	match_m = matcher_mask.buf;
+	headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers);
+	misc5_m = MLX5_ADDR_OF(fte_match_param,
+			       match_m, misc_parameters_5);
+	tunnel_header_m = (uint32_t *)
+				MLX5_ADDR_OF(fte_match_set_misc5,
+				misc5_m, tunnel_header_1);
+	MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+	MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 4);
+	MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff);
+	*tunnel_header_m = 0xffffff;
+
+	tbl = mlx5_glue->dr_create_flow_tbl(priv->sh->rx_domain, 1);
+	if (!tbl) {
+		DRV_LOG(INFO, "No SW steering support");
+		return;
+	}
+	dv_attr.type = IBV_FLOW_ATTR_NORMAL;
+	dv_attr.match_mask = (void *)&matcher_mask;
+	dv_attr.match_criteria_enable =
+			(1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT) |
+			(1 << MLX5_MATCH_CRITERIA_ENABLE_MISC5_BIT);
+	dv_attr.priority = 3;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	void *misc2_m;
+	if (priv->config.dv_esw_en) {
+		/* FDB enabled reg_c_0 */
+		dv_attr.match_criteria_enable |=
+				(1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT);
+		misc2_m = MLX5_ADDR_OF(fte_match_param,
+				       match_m, misc_parameters_2);
+		MLX5_SET(fte_match_set_misc2, misc2_m,
+			 metadata_reg_c_0, 0xffff);
+	}
+#endif
+	matcher = mlx5_glue->dv_create_flow_matcher(priv->sh->ctx,
+						    &dv_attr, tbl);
+	/* A successfully created matcher is the only evidence needed. */
+	if (matcher) {
+		priv->sh->misc5_cap = 1;
+		mlx5_glue->dv_destroy_flow_matcher(matcher);
+	}
+	mlx5_glue->dr_destroy_flow_tbl(tbl);
+#else
+	RTE_SET_USED(priv);
+#endif
+}
+#endif
+
+/**
+ * Verbs callback to free a memory.
+ *
+ * The buffer is released with mlx5_free(). The callback data is only
+ * referenced by the assertion, hence the unused marker on the parameter.
+ *
+ * @param[in] ptr
+ *   A pointer to the memory to free.
+ * @param[in] data
+ *   A pointer to the callback data.
+ */
+static void
+mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+	MLX5_ASSERT(data != NULL);
+	mlx5_free(ptr);
+}
+
+/**
+ * Initialize DR related data within private structure.
+ * Routine checks the reference counter and does actual
+ * resources creation/initialization only for the first reference
+ * (it returns immediately when the counter is greater than one).
+ *
+ * On any failure every object created so far is rolled back and a non-zero
+ * error code is returned; in particular a failed action list creation is
+ * reported as ENOMEM.
+ *
+ * @param[in] priv
+ *   Pointer to the private device data structure.
+ *
+ * @return
+ *   Zero on success, positive error code otherwise.
+ */
+static int
+mlx5_alloc_shared_dr(struct mlx5_priv *priv)
+{
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	char s[MLX5_NAME_SIZE] __rte_unused;
+	int err;
+
+	MLX5_ASSERT(sh && sh->refcnt);
+	if (sh->refcnt > 1)
+		return 0;
+	err = mlx5_alloc_table_hash_list(priv);
+	if (err)
+		goto error;
+	/* The resources below are only valid with DV support. */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	/* Init port id action list. */
+	snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name);
+	sh->port_id_action_list = mlx5_list_create(s, sh, true,
+						   flow_dv_port_id_create_cb,
+						   flow_dv_port_id_match_cb,
+						   flow_dv_port_id_remove_cb,
+						   flow_dv_port_id_clone_cb,
+						 flow_dv_port_id_clone_free_cb);
+	if (!sh->port_id_action_list) {
+		/* err is still 0 here; do not report success. */
+		err = ENOMEM;
+		goto error;
+	}
+	/* Init push vlan action list. */
+	snprintf(s, sizeof(s), "%s_push_vlan_action_list", sh->ibdev_name);
+	sh->push_vlan_action_list = mlx5_list_create(s, sh, true,
+						    flow_dv_push_vlan_create_cb,
+						    flow_dv_push_vlan_match_cb,
+						    flow_dv_push_vlan_remove_cb,
+						    flow_dv_push_vlan_clone_cb,
+					       flow_dv_push_vlan_clone_free_cb);
+	if (!sh->push_vlan_action_list) {
+		err = ENOMEM;
+		goto error;
+	}
+	/* Init sample action list. */
+	snprintf(s, sizeof(s), "%s_sample_action_list", sh->ibdev_name);
+	sh->sample_action_list = mlx5_list_create(s, sh, true,
+						  flow_dv_sample_create_cb,
+						  flow_dv_sample_match_cb,
+						  flow_dv_sample_remove_cb,
+						  flow_dv_sample_clone_cb,
+						  flow_dv_sample_clone_free_cb);
+	if (!sh->sample_action_list) {
+		err = ENOMEM;
+		goto error;
+	}
+	/* Init dest array action list. */
+	snprintf(s, sizeof(s), "%s_dest_array_list", sh->ibdev_name);
+	sh->dest_array_list = mlx5_list_create(s, sh, true,
+					       flow_dv_dest_array_create_cb,
+					       flow_dv_dest_array_match_cb,
+					       flow_dv_dest_array_remove_cb,
+					       flow_dv_dest_array_clone_cb,
+					      flow_dv_dest_array_clone_free_cb);
+	if (!sh->dest_array_list) {
+		err = ENOMEM;
+		goto error;
+	}
+#endif
+#ifdef HAVE_MLX5DV_DR
+	void *domain;
+
+	/* First reference to the shared context: create the domains. */
+	domain = mlx5_glue->dr_create_domain(sh->ctx,
+					     MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+	if (!domain) {
+		/* Save errno first: logging may overwrite it. */
+		err = errno;
+		DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
+		goto error;
+	}
+	sh->rx_domain = domain;
+	domain = mlx5_glue->dr_create_domain(sh->ctx,
+					     MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
+	if (!domain) {
+		err = errno;
+		DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
+		goto error;
+	}
+	sh->tx_domain = domain;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (priv->config.dv_esw_en) {
+		domain  = mlx5_glue->dr_create_domain
+			(sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
+		if (!domain) {
+			err = errno;
+			DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
+			goto error;
+		}
+		sh->fdb_domain = domain;
+	}
+	/*
+	 * The drop action is just some dummy placeholder in rdma-core. It
+	 * does not belong to domains and has no any attributes, and, can be
+	 * shared by the entire device.
+	 */
+	sh->dr_drop_action = mlx5_glue->dr_create_flow_action_drop();
+	if (!sh->dr_drop_action) {
+		err = errno;
+		DRV_LOG(ERR, "FDB mlx5dv_dr_create_flow_action_drop");
+		goto error;
+	}
+#endif
+	if (!sh->tunnel_hub && priv->config.dv_miss_info)
+		err = mlx5_alloc_tunnel_hub(sh);
+	if (err) {
+		DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
+		goto error;
+	}
+	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
+		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
+		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
+		if (sh->fdb_domain)
+			mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
+	}
+	/* The result of the pop VLAN action creation is not checked here. */
+	sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
+	if (!priv->config.allow_duplicate_pattern) {
+#ifndef HAVE_MLX5_DR_ALLOW_DUPLICATE
+		DRV_LOG(WARNING, "Disallow duplicate pattern is not supported - maybe old rdma-core version?");
+#endif
+		mlx5_glue->dr_allow_duplicate_rules(sh->rx_domain, 0);
+		mlx5_glue->dr_allow_duplicate_rules(sh->tx_domain, 0);
+		if (sh->fdb_domain)
+			mlx5_glue->dr_allow_duplicate_rules(sh->fdb_domain, 0);
+	}
+
+	__mlx5_discovery_misc5_cap(priv);
+#endif /* HAVE_MLX5DV_DR */
+	/* A missing default miss action is tolerated with a warning only. */
+	sh->default_miss_action =
+			mlx5_glue->dr_create_flow_action_default_miss();
+	if (!sh->default_miss_action)
+		DRV_LOG(WARNING, "Default miss action is not supported.");
+	return 0;
+error:
+	/* Rollback the created objects. */
+	if (sh->rx_domain) {
+		mlx5_glue->dr_destroy_domain(sh->rx_domain);
+		sh->rx_domain = NULL;
+	}
+	if (sh->tx_domain) {
+		mlx5_glue->dr_destroy_domain(sh->tx_domain);
+		sh->tx_domain = NULL;
+	}
+	if (sh->fdb_domain) {
+		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+		sh->fdb_domain = NULL;
+	}
+	if (sh->dr_drop_action) {
+		mlx5_glue->destroy_flow_action(sh->dr_drop_action);
+		sh->dr_drop_action = NULL;
+	}
+	if (sh->pop_vlan_action) {
+		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+		sh->pop_vlan_action = NULL;
+	}
+	if (sh->encaps_decaps) {
+		mlx5_hlist_destroy(sh->encaps_decaps);
+		sh->encaps_decaps = NULL;
+	}
+	if (sh->modify_cmds) {
+		mlx5_hlist_destroy(sh->modify_cmds);
+		sh->modify_cmds = NULL;
+	}
+	if (sh->tag_table) {
+		/* tags should be destroyed with flow before. */
+		mlx5_hlist_destroy(sh->tag_table);
+		sh->tag_table = NULL;
+	}
+	if (sh->tunnel_hub) {
+		mlx5_release_tunnel_hub(sh, priv->dev_port);
+		sh->tunnel_hub = NULL;
+	}
+	mlx5_free_table_hash_list(priv);
+	if (sh->port_id_action_list) {
+		mlx5_list_destroy(sh->port_id_action_list);
+		sh->port_id_action_list = NULL;
+	}
+	if (sh->push_vlan_action_list) {
+		mlx5_list_destroy(sh->push_vlan_action_list);
+		sh->push_vlan_action_list = NULL;
+	}
+	if (sh->sample_action_list) {
+		mlx5_list_destroy(sh->sample_action_list);
+		sh->sample_action_list = NULL;
+	}
+	if (sh->dest_array_list) {
+		mlx5_list_destroy(sh->dest_array_list);
+		sh->dest_array_list = NULL;
+	}
+	return err;
+}
+
+/**
+ * Destroy DR related data within private structure.
+ *
+ * Nothing is released while other references to the shared context remain
+ * (reference counter greater than one). Every handle that is destroyed is
+ * reset to NULL afterwards.
+ *
+ * @param[in] priv
+ *   Pointer to the private device data structure.
+ */
+void
+mlx5_os_free_shared_dr(struct mlx5_priv *priv)
+{
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+
+	MLX5_ASSERT(sh && sh->refcnt);
+	if (sh->refcnt > 1)
+		return;
+#ifdef HAVE_MLX5DV_DR
+	if (sh->rx_domain) {
+		mlx5_glue->dr_destroy_domain(sh->rx_domain);
+		sh->rx_domain = NULL;
+	}
+	if (sh->tx_domain) {
+		mlx5_glue->dr_destroy_domain(sh->tx_domain);
+		sh->tx_domain = NULL;
+	}
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (sh->fdb_domain) {
+		mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+		sh->fdb_domain = NULL;
+	}
+	if (sh->dr_drop_action) {
+		mlx5_glue->destroy_flow_action(sh->dr_drop_action);
+		sh->dr_drop_action = NULL;
+	}
+#endif
+	if (sh->pop_vlan_action) {
+		mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+		sh->pop_vlan_action = NULL;
+	}
+#endif /* HAVE_MLX5DV_DR */
+	if (sh->default_miss_action) {
+		mlx5_glue->destroy_flow_action
+				(sh->default_miss_action);
+		/* Reset like every other handle to avoid a stale pointer. */
+		sh->default_miss_action = NULL;
+	}
+	if (sh->encaps_decaps) {
+		mlx5_hlist_destroy(sh->encaps_decaps);
+		sh->encaps_decaps = NULL;
+	}
+	if (sh->modify_cmds) {
+		mlx5_hlist_destroy(sh->modify_cmds);
+		sh->modify_cmds = NULL;
+	}
+	if (sh->tag_table) {
+		/* tags should be destroyed with flow before. */
+		mlx5_hlist_destroy(sh->tag_table);
+		sh->tag_table = NULL;
+	}
+	if (sh->tunnel_hub) {
+		mlx5_release_tunnel_hub(sh, priv->dev_port);
+		sh->tunnel_hub = NULL;
+	}
+	mlx5_free_table_hash_list(priv);
+	if (sh->port_id_action_list) {
+		mlx5_list_destroy(sh->port_id_action_list);
+		sh->port_id_action_list = NULL;
+	}
+	if (sh->push_vlan_action_list) {
+		mlx5_list_destroy(sh->push_vlan_action_list);
+		sh->push_vlan_action_list = NULL;
+	}
+	if (sh->sample_action_list) {
+		mlx5_list_destroy(sh->sample_action_list);
+		sh->sample_action_list = NULL;
+	}
+	if (sh->dest_array_list) {
+		mlx5_list_destroy(sh->dest_array_list);
+		sh->dest_array_list = NULL;
+	}
+}
+
+/**
+ * Set up the data shared between the primary and secondary processes.
+ *
+ * The primary process reserves and zeroes the memzone and initializes the
+ * lock it contains; a secondary process looks the memzone up and zeroes its
+ * process-local data. Nothing is done when the shared data pointer is
+ * already set. The whole sequence runs under the allocation spinlock.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_shared_data(void)
+{
+	const struct rte_memzone *zone;
+	int rc = 0;
+
+	rte_spinlock_lock(&mlx5_shared_data_lock);
+	if (mlx5_shared_data != NULL)
+		goto unlock;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		/* Allocate shared memory. */
+		zone = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
+					   sizeof(*mlx5_shared_data),
+					   SOCKET_ID_ANY, 0);
+		if (zone == NULL) {
+			DRV_LOG(ERR,
+				"Cannot allocate mlx5 shared data");
+			rc = -rte_errno;
+			goto unlock;
+		}
+		mlx5_shared_data = zone->addr;
+		memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
+		rte_spinlock_init(&mlx5_shared_data->lock);
+	} else {
+		/* Lookup allocated shared memory. */
+		zone = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
+		if (zone == NULL) {
+			DRV_LOG(ERR,
+				"Cannot attach mlx5 shared data");
+			rc = -rte_errno;
+			goto unlock;
+		}
+		mlx5_shared_data = zone->addr;
+		memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
+	}
+unlock:
+	rte_spinlock_unlock(&mlx5_shared_data_lock);
+	return rc;
+}
+
+/**
+ * PMD global initialization.
+ *
+ * Independent from individual device, this function initializes global
+ * per-PMD data structures distinguishing primary and secondary processes.
+ * Hence, each initialization is called once per a process.
+ *
+ * The per-process work runs under the lock stored in the shared data and is
+ * skipped when the matching init_done flag is already set.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_once(void)
+{
+	struct mlx5_shared_data *sd;
+	struct mlx5_local_data *ld = &mlx5_local_data;
+	int ret = 0;
+
+	if (mlx5_init_shared_data())
+		return -rte_errno;
+	sd = mlx5_shared_data;
+	MLX5_ASSERT(sd);
+	rte_spinlock_lock(&sd->lock);
+	switch (rte_eal_process_type()) {
+	case RTE_PROC_PRIMARY:
+		if (sd->init_done)
+			break;
+		LIST_INIT(&sd->mem_event_cb_list);
+		rte_rwlock_init(&sd->mem_event_rwlock);
+		/*
+		 * NOTE(review): the registration result is ignored and the
+		 * callback stays registered if the IPC setup below fails -
+		 * confirm this is intended.
+		 */
+		rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
+						mlx5_mr_mem_event_cb, NULL);
+		ret = mlx5_mp_init_primary(MLX5_MP_NAME,
+					   mlx5_mp_os_primary_handle);
+		if (ret)
+			goto out;
+		sd->init_done = true;
+		break;
+	case RTE_PROC_SECONDARY:
+		/* Secondary state is tracked in process-local data. */
+		if (ld->init_done)
+			break;
+		ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
+					     mlx5_mp_os_secondary_handle);
+		if (ret)
+			goto out;
+		/* Count this secondary in the shared data. */
+		++sd->secondary_cnt;
+		ld->init_done = true;
+		break;
+	default:
+		break;
+	}
+out:
+	rte_spinlock_unlock(&sd->lock);
+	return ret;
+}
+
+/**
+ * Create the Tx queue object, choosing between the DevX and the Verbs
+ * implementation.
+ *
+ * Hairpin queues always use DevX. When HAVE_MLX5DV_DEVX_UAR_OFFSET is
+ * defined, DevX is also used whenever the dv_esw_en configuration flag is
+ * off. Every other case uses Verbs.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Tx queue array.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_ctrl *ctrl =
+		container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl, txq);
+	bool use_devx = (ctrl->type == MLX5_TXQ_TYPE_HAIRPIN);
+
+#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
+	if (!use_devx)
+		use_devx = !priv->config.dv_esw_en;
+#endif
+	if (use_devx)
+		return mlx5_txq_devx_obj_new(dev, idx);
+	return mlx5_txq_ibv_obj_new(dev, idx);
+}
+
+/**
+ * Release a Tx queue object through the implementation that created it.
+ *
+ * The DevX release is used for hairpin queues and, when
+ * HAVE_MLX5DV_DEVX_UAR_OFFSET is defined, for queues of a port whose
+ * dv_esw_en configuration flag is off; the Verbs release is used otherwise.
+ *
+ * @param txq_obj
+ *   DevX/Verbs Tx queue object.
+ */
+static void
+mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
+{
+	const struct mlx5_txq_ctrl *ctrl = txq_obj->txq_ctrl;
+	bool devx_owned = (ctrl->type == MLX5_TXQ_TYPE_HAIRPIN);
+
+#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
+	if (!devx_owned)
+		devx_owned = !ctrl->priv->config.dv_esw_en;
+#endif
+	if (devx_owned)
+		mlx5_txq_devx_obj_release(txq_obj);
+	else
+		mlx5_txq_ibv_obj_release(txq_obj);
+}
+
+/**
+ * DV flow counter mode detect and config.
+ *
+ * Decides whether the fall-back counter management must be used and stores
+ * the decision in the shared context. The body is compiled only when
+ * HAVE_IBV_FLOW_DV_SUPPORT is defined; otherwise the function does nothing.
+ *
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	bool fallback;
+
+#ifndef HAVE_IBV_DEVX_ASYNC
+	/* Without HAVE_IBV_DEVX_ASYNC the fall-back mode is unconditional. */
+	fallback = true;
+#else
+	fallback = false;
+	/* Any single unmet condition below selects the fall-back mode. */
+	if (!priv->config.devx || !priv->config.dv_flow_en ||
+	    !priv->config.hca_attr.flow_counters_dump ||
+	    !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
+	    (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
+		fallback = true;
+#endif
+	if (fallback)
+		DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
+			"counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
+			priv->config.hca_attr.flow_counters_dump,
+			priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
+	/* Initialize fallback mode only on the port initializes sh. */
+	if (sh->refcnt == 1)
+		sh->cmng.counter_fallback = fallback;
+	else if (fallback != sh->cmng.counter_fallback)
+		/* Later ports only warn; the stored mode is left untouched. */
+		DRV_LOG(WARNING, "Port %d in sh has different fallback mode "
+			"with others:%d.", PORT_ID(priv), fallback);
+#endif
+}
+
+/**
+ * DR flow drop action support detect.
+ *
+ * Selects the root drop action of the port. The body is compiled only when
+ * HAVE_MLX5DV_DR is defined, and nothing is changed when DV flow is disabled
+ * or the shared context has no DR drop action.
+ *
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_drop_action_config(struct rte_eth_dev *dev __rte_unused)
+{
+#ifdef HAVE_MLX5DV_DR
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (!priv->config.dv_flow_en || !priv->sh->dr_drop_action)
+		return;
+	/**
+	 * DR supports drop action placeholder when it is supported;
+	 * otherwise, use the queue drop action.
+	 * A non-zero discovery result selects the drop queue action.
+	 */
+	if (mlx5_flow_discover_dr_action_support(dev))
+		priv->root_drop_action = priv->drop_queue.hrxq->action;
+	else
+		priv->root_drop_action = priv->sh->dr_drop_action;
+#endif
+}
+
+/**
+ * Prepare the queue counter set ID of a port.
+ *
+ * A queue counter object is requested through DevX first. When that fails,
+ * a temporary CQ and WQ are created through the glue layer, the WQ is moved
+ * to the ready state and queried for its counter set ID, and both objects
+ * are destroyed again.
+ *
+ * @param dev
+ *   Pointer to rte_eth_dev structure.
+ */
+static void
+mlx5_queue_counter_id_prepare(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	void *ctx = priv->sh->ctx;
+
+	priv->q_counters = mlx5_devx_cmd_queue_counter_alloc(ctx);
+	if (priv->q_counters) {
+		priv->counter_set_id = priv->q_counters->id;
+	} else {
+		struct ibv_cq *cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
+		struct ibv_wq *wq = NULL;
+
+		DRV_LOG(DEBUG, "Port %d queue counter object cannot be created "
+			"by DevX - fall-back to use the kernel driver global "
+			"queue counter.", dev->data->port_id);
+		/* Create WQ by kernel and query its queue counter ID. */
+		if (cq) {
+			struct ibv_wq_init_attr wq_init = {
+				.wq_type = IBV_WQT_RQ,
+				.max_wr = 1,
+				.max_sge = 1,
+				.pd = priv->sh->pd,
+				.cq = cq,
+			};
+
+			wq = mlx5_glue->create_wq(ctx, &wq_init);
+		}
+		if (wq) {
+			/* Counter is assigned only on RDY state. */
+			struct ibv_wq_attr wq_ready = {
+				.attr_mask = IBV_WQ_ATTR_STATE,
+				.wq_state = IBV_WQS_RDY,
+			};
+
+			if (mlx5_glue->modify_wq(wq, &wq_ready) == 0)
+				mlx5_devx_cmd_wq_query(wq,
+						       &priv->counter_set_id);
+			claim_zero(mlx5_glue->destroy_wq(wq));
+		}
+		/* The CQ is released after the WQ that referenced it. */
+		if (cq)
+			claim_zero(mlx5_glue->destroy_cq(cq));
+	}
+	if (priv->counter_set_id == 0)
+		DRV_LOG(INFO, "Part of the port %d statistics will not be "
+			"available.", dev->data->port_id);
+}
+
+/**
+ * Check whether the representor described by the spawn data is requested
+ * by the device arguments.
+ *
+ * On a mismatch rte_errno is set to EBUSY, or to ENOTSUP when the
+ * representor type of the devargs is not handled. For non-bonding spawn
+ * data (pf_bond < 0) the PF number stored in the spawn switch info is
+ * overwritten with each devargs port while searching.
+ *
+ * @param spawn
+ *   Verbs device parameters (name, port, switch_info) to spawn.
+ * @param eth_da
+ *   Device devargs to probe.
+ *
+ * @return
+ *   Match result.
+ */
+static bool
+mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
+		       struct rte_eth_devargs *eth_da)
+{
+	struct mlx5_switch_info *switch_info = &spawn->info;
+	uint16_t repr_id = mlx5_representor_id_encode(switch_info,
+						      eth_da->type);
+	unsigned int port_idx;
+	unsigned int repr_idx;
+	bool is_hpf;
+	bool type_ok;
+
+	/* HPF representor index -1 is accepted as an exception. */
+	is_hpf = switch_info->port_name == -1 &&
+		 switch_info->name_type == MLX5_PHYS_PORT_NAME_TYPE_PFHPF;
+	switch (eth_da->type) {
+	case RTE_ETH_REPRESENTOR_SF:
+		type_ok = is_hpf ||
+			  switch_info->name_type ==
+			  MLX5_PHYS_PORT_NAME_TYPE_PFSF;
+		break;
+	case RTE_ETH_REPRESENTOR_VF:
+		type_ok = is_hpf ||
+			  switch_info->name_type ==
+			  MLX5_PHYS_PORT_NAME_TYPE_PFVF;
+		break;
+	case RTE_ETH_REPRESENTOR_NONE:
+		rte_errno = EBUSY;
+		return false;
+	default:
+		rte_errno = ENOTSUP;
+		DRV_LOG(ERR, "unsupported representor type");
+		return false;
+	}
+	if (!type_ok) {
+		rte_errno = EBUSY;
+		return false;
+	}
+	/* Check representor ID: */
+	for (port_idx = 0; port_idx < eth_da->nb_ports; ++port_idx) {
+		if (spawn->pf_bond < 0) {
+			/* For non-LAG mode, allow and ignore pf. */
+			switch_info->pf_num = eth_da->ports[port_idx];
+			repr_id = mlx5_representor_id_encode(switch_info,
+							     eth_da->type);
+		}
+		for (repr_idx = 0; repr_idx < eth_da->nb_representor_ports;
+		     ++repr_idx) {
+			/* Narrowed to 16 bits before the comparison. */
+			const uint16_t wanted = MLX5_REPRESENTOR_ID
+				(eth_da->ports[port_idx], eth_da->type,
+				 eth_da->representor_ports[repr_idx]);
+
+			if (wanted == repr_id)
+				return true;
+		}
+	}
+	rte_errno = EBUSY;
+	return false;
+}
+
+
+/**
+ * Spawn an Ethernet device from Verbs information.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param spawn
+ *   Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ *   Device configuration parameters.
+ * @param eth_da
+ *   Device arguments.
+ *
+ * @return
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set. The following errors are defined:
+ *
+ *   EBUSY: device is not supposed to be spawned.
+ *   EEXIST: device is already spawned
+ */
+static struct rte_eth_dev *
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct mlx5_dev_spawn_data *spawn,
+	       struct mlx5_dev_config *config,
+	       struct rte_eth_devargs *eth_da)
+{
+	const struct mlx5_switch_info *switch_info = &spawn->info;
+	struct mlx5_dev_ctx_shared *sh = NULL;
+	struct ibv_port_attr port_attr;
+	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
+	struct mlx5_priv *priv = NULL;
+	int err = 0;
+	unsigned int hw_padding = 0;
+	unsigned int mps;
+	unsigned int tunnel_en = 0;
+	unsigned int mpls_en = 0;
+	unsigned int swp = 0;
+	unsigned int mprq = 0;
+	unsigned int mprq_min_stride_size_n = 0;
+	unsigned int mprq_max_stride_size_n = 0;
+	unsigned int mprq_min_stride_num_n = 0;
+	unsigned int mprq_max_stride_num_n = 0;
+	struct rte_ether_addr mac;
+	char name[RTE_ETH_NAME_MAX_LEN];
+	int own_domain_id = 0;
+	uint16_t port_id;
+	struct mlx5_port_info vport_info = { .query_flags = 0 };
+	int i;
+
+	/* Determine if this port representor is supposed to be spawned. */
+	if (switch_info->representor && dpdk_dev->devargs &&
+	    !mlx5_representor_match(spawn, eth_da))
+		return NULL;
+	/* Build device name. */
+	if (spawn->pf_bond < 0) {
+		/* Single device. */
+		if (!switch_info->representor)
+			strlcpy(name, dpdk_dev->name, sizeof(name));
+		else
+			err = snprintf(name, sizeof(name), "%s_representor_%s%u",
+				 dpdk_dev->name,
+				 switch_info->name_type ==
+				 MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
+				 switch_info->port_name);
+	} else {
+		/* Bonding device. */
+		if (!switch_info->representor) {
+			err = snprintf(name, sizeof(name), "%s_%s",
+				 dpdk_dev->name,
+				 mlx5_os_get_dev_device_name(spawn->phys_dev));
+		} else {
+			err = snprintf(name, sizeof(name), "%s_%s_representor_c%dpf%d%s%u",
+				dpdk_dev->name,
+				mlx5_os_get_dev_device_name(spawn->phys_dev),
+				switch_info->ctrl_num,
+				switch_info->pf_num,
+				switch_info->name_type ==
+				MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
+				switch_info->port_name);
+		}
+	}
+	if (err >= (int)sizeof(name))
+		DRV_LOG(WARNING, "device name overflow %s", name);
+	/* check if the device is already spawned */
+	if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
+		rte_errno = EEXIST;
+		return NULL;
+	}
+	DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct mlx5_mp_id mp_id;
+
+		eth_dev = rte_eth_dev_attach_secondary(name);
+		if (eth_dev == NULL) {
+			DRV_LOG(ERR, "can not attach rte ethdev");
+			rte_errno = ENOMEM;
+			return NULL;
+		}
+		eth_dev->device = dpdk_dev;
+		eth_dev->dev_ops = &mlx5_dev_sec_ops;
+		eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
+		eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
+		err = mlx5_proc_priv_init(eth_dev);
+		if (err)
+			return NULL;
+		mp_id.port_id = eth_dev->data->port_id;
+		strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+		/* Receive command fd from primary process */
+		err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
+		if (err < 0)
+			goto err_secondary;
+		/* Remap UAR for Tx queues. */
+		err = mlx5_tx_uar_init_secondary(eth_dev, err);
+		if (err)
+			goto err_secondary;
+		/*
+		 * Ethdev pointer is still required as input since
+		 * the primary device is not accessible from the
+		 * secondary process.
+		 */
+		eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+		eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+		return eth_dev;
+err_secondary:
+		mlx5_dev_close(eth_dev);
+		return NULL;
+	}
+	/*
+	 * Some parameters ("tx_db_nc" in particularly) are needed in
+	 * advance to create dv/verbs device context. We proceed the
+	 * devargs here to get ones, and later proceed devargs again
+	 * to override some hardware settings.
+	 */
+	err = mlx5_args(config, dpdk_dev->devargs);
+	if (err) {
+		err = rte_errno;
+		DRV_LOG(ERR, "failed to process device arguments: %s",
+			strerror(rte_errno));
+		goto error;
+	}
+	if (config->dv_miss_info) {
+		if (switch_info->master || switch_info->representor)
+			config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
+	}
+	mlx5_malloc_mem_select(config->sys_mem_en);
+	sh = mlx5_alloc_shared_dev_ctx(spawn, config);
+	if (!sh)
+		return NULL;
+	config->devx = sh->devx;
+#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
+	config->dest_tir = 1;
+#endif
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+#endif
+	/*
+	 * Multi-packet send is supported by ConnectX-4 Lx PF as well
+	 * as all ConnectX-5 devices.
+	 */
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+#endif
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+#endif
+	mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
+	if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+		if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+			DRV_LOG(DEBUG, "enhanced MPW is supported");
+			mps = MLX5_MPW_ENHANCED;
+		} else {
+			DRV_LOG(DEBUG, "MPW is supported");
+			mps = MLX5_MPW;
+		}
+	} else {
+		DRV_LOG(DEBUG, "MPW isn't supported");
+		mps = MLX5_MPW_DISABLED;
+	}
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+		swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
+	DRV_LOG(DEBUG, "SWP support: %u", swp);
+#endif
+	config->swp = !!swp;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+		struct mlx5dv_striding_rq_caps mprq_caps =
+			dv_attr.striding_rq_caps;
+
+		DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
+			mprq_caps.min_single_stride_log_num_of_bytes);
+		DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
+			mprq_caps.max_single_stride_log_num_of_bytes);
+		DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
+			mprq_caps.min_single_wqe_log_num_of_strides);
+		DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
+			mprq_caps.max_single_wqe_log_num_of_strides);
+		DRV_LOG(DEBUG, "\tsupported_qpts: %d",
+			mprq_caps.supported_qpts);
+		DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
+		mprq = 1;
+		mprq_min_stride_size_n =
+			mprq_caps.min_single_stride_log_num_of_bytes;
+		mprq_max_stride_size_n =
+			mprq_caps.max_single_stride_log_num_of_bytes;
+		mprq_min_stride_num_n =
+			mprq_caps.min_single_wqe_log_num_of_strides;
+		mprq_max_stride_num_n =
+			mprq_caps.max_single_wqe_log_num_of_strides;
+	}
+#endif
+	/* Rx CQE compression is enabled by default. */
+	config->cqe_comp = 1;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+		tunnel_en = ((dv_attr.tunnel_offloads_caps &
+			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
+			     (dv_attr.tunnel_offloads_caps &
+			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
+			     (dv_attr.tunnel_offloads_caps &
+			      MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
+	}
+	DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
+		tunnel_en ? "" : "not ");
+#else
+	DRV_LOG(WARNING,
+		"tunnel offloading disabled due to old OFED/rdma-core version");
+#endif
+	config->tunnel_en = tunnel_en;
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+	mpls_en = ((dv_attr.tunnel_offloads_caps &
+		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
+		   (dv_attr.tunnel_offloads_caps &
+		    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
+	DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
+		mpls_en ? "" : "not ");
+#else
+	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
+		" old OFED/rdma-core version or firmware configuration");
+#endif
+	config->mpls_en = mpls_en;
+	/* Check port status. */
+	err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
+	if (err) {
+		DRV_LOG(ERR, "port query failed: %s", strerror(err));
+		goto error;
+	}
+	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+		DRV_LOG(ERR, "port is not configured in Ethernet mode");
+		err = EINVAL;
+		goto error;
+	}
+	if (port_attr.state != IBV_PORT_ACTIVE)
+		DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+			mlx5_glue->port_state_str(port_attr.state),
+			port_attr.state);
+	/* Allocate private eth device data. */
+	priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
+			   sizeof(*priv),
+			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+	if (priv == NULL) {
+		DRV_LOG(ERR, "priv allocation failure");
+		err = ENOMEM;
+		goto error;
+	}
+	priv->sh = sh;
+	priv->dev_port = spawn->phys_port;
+	priv->pci_dev = spawn->pci_dev;
+	priv->mtu = RTE_ETHER_MTU;
+	/* Some internal functions rely on Netlink sockets, open them now. */
+	priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+	priv->nl_socket_route =	mlx5_nl_init(NETLINK_ROUTE);
+	priv->representor = !!switch_info->representor;
+	priv->master = !!switch_info->master;
+	priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+	priv->vport_meta_tag = 0;
+	priv->vport_meta_mask = 0;
+	priv->pf_bond = spawn->pf_bond;
+
+	DRV_LOG(DEBUG,
+		"dev_port=%u bus=%s pci=%s master=%d representor=%d pf_bond=%d\n",
+		priv->dev_port, dpdk_dev->bus->name,
+		priv->pci_dev ? priv->pci_dev->name : "NONE",
+		priv->master, priv->representor, priv->pf_bond);
+
+	/*
+	 * If we have E-Switch we should determine the vport attributes.
+	 * E-Switch may use either source vport field or reg_c[0] metadata
+	 * register to match on vport index. The engaged part of metadata
+	 * register is defined by mask.
+	 */
+	if (switch_info->representor || switch_info->master) {
+		err = mlx5_glue->devx_port_query(sh->ctx,
+						 spawn->phys_port,
+						 &vport_info);
+		if (err) {
+			DRV_LOG(WARNING,
+				"can't query devx port %d on device %s",
+				spawn->phys_port,
+				mlx5_os_get_dev_device_name(spawn->phys_dev));
+			vport_info.query_flags = 0;
+		}
+	}
+	if (vport_info.query_flags & MLX5_PORT_QUERY_REG_C0) {
+		priv->vport_meta_tag = vport_info.vport_meta_tag;
+		priv->vport_meta_mask = vport_info.vport_meta_mask;
+		if (!priv->vport_meta_mask) {
+			DRV_LOG(ERR, "vport zero mask for port %d"
+				     " on bonding device %s",
+				     spawn->phys_port,
+				     mlx5_os_get_dev_device_name
+							(spawn->phys_dev));
+			err = ENOTSUP;
+			goto error;
+		}
+		if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
+			DRV_LOG(ERR, "invalid vport tag for port %d"
+				     " on bonding device %s",
+				     spawn->phys_port,
+				     mlx5_os_get_dev_device_name
+							(spawn->phys_dev));
+			err = ENOTSUP;
+			goto error;
+		}
+	}
+	if (vport_info.query_flags & MLX5_PORT_QUERY_VPORT) {
+		priv->vport_id = vport_info.vport_id;
+	} else if (spawn->pf_bond >= 0 &&
+		   (switch_info->representor || switch_info->master)) {
+		DRV_LOG(ERR, "can't deduce vport index for port %d"
+			     " on bonding device %s",
+			     spawn->phys_port,
+			     mlx5_os_get_dev_device_name(spawn->phys_dev));
+		err = ENOTSUP;
+		goto error;
+	} else {
+		/*
+		 * Suppose vport index in compatible way. Kernel/rdma_core
+		 * support single E-Switch per PF configurations only and
+		 * vport_id field contains the vport index for associated VF,
+		 * which is deduced from representor port name.
+		 * For example, let's have the IB device port 10, it has
+		 * attached network device eth0, which has port name attribute
+		 * pf0vf2, we can deduce the VF number as 2, and set vport index
+		 * as 3 (2+1). This assigning schema should be changed if the
+		 * multiple E-Switch instances per PF configurations or/and PCI
+		 * subfunctions are added.
+		 */
+		priv->vport_id = switch_info->representor ?
+				 switch_info->port_name + 1 : -1;
+	}
+	priv->representor_id = mlx5_representor_id_encode(switch_info,
+							  eth_da->type);
+	/*
+	 * Look for sibling devices in order to reuse their switch domain
+	 * if any, otherwise allocate one.
+	 */
+	MLX5_ETH_FOREACH_DEV(port_id, dpdk_dev) {
+		const struct mlx5_priv *opriv =
+			rte_eth_devices[port_id].data->dev_private;
+
+		if (!opriv ||
+		    opriv->sh != priv->sh ||
+			opriv->domain_id ==
+			RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+			continue;
+		priv->domain_id = opriv->domain_id;
+		DRV_LOG(DEBUG, "dev_port-%u inherit domain_id=%u\n",
+			priv->dev_port, priv->domain_id);
+		break;
+	}
+	if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+		err = rte_eth_switch_domain_alloc(&priv->domain_id);
+		if (err) {
+			err = rte_errno;
+			DRV_LOG(ERR, "unable to allocate switch domain: %s",
+				strerror(rte_errno));
+			goto error;
+		}
+		own_domain_id = 1;
+		DRV_LOG(DEBUG, "dev_port-%u new domain_id=%u\n",
+			priv->dev_port, priv->domain_id);
+	}
+	/* Override some values set by hardware configuration. */
+	mlx5_args(config, dpdk_dev->devargs);
+	err = mlx5_dev_check_sibling_config(priv, config, dpdk_dev);
+	if (err)
+		goto error;
+	config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
+			    IBV_DEVICE_RAW_IP_CSUM);
+	DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+		(config->hw_csum ? "" : "not "));
+#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
+	!defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+	DRV_LOG(DEBUG, "counters are not supported");
+#endif
+#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
+	if (config->dv_flow_en) {
+		DRV_LOG(WARNING, "DV flow is not supported");
+		config->dv_flow_en = 0;
+	}
+#endif
+	if (spawn->max_port > UINT8_MAX) {
+		/* Verbs can't support ports larger than 255 by design. */
+		DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
+		err = EINVAL;
+		goto error;
+	}
+	config->ind_table_max_size =
+		sh->device_attr.max_rwq_indirection_table_size;
+	/*
+	 * Remove this check once DPDK supports larger/variable
+	 * indirection tables.
+	 */
+	if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+		config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+	DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+		config->ind_table_max_size);
+	config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
+				  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+	DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+		(config->hw_vlan_strip ? "" : "not "));
+	config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
+				 IBV_RAW_PACKET_CAP_SCATTER_FCS);
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+	hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+	hw_padding = !!(sh->device_attr.device_cap_flags_ex &
+			IBV_DEVICE_PCI_WRITE_END_PADDING);
+#endif
+	if (config->hw_padding && !hw_padding) {
+		DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
+		config->hw_padding = 0;
+	} else if (config->hw_padding) {
+		DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
+	}
+	config->tso = (sh->device_attr.max_tso > 0 &&
+		      (sh->device_attr.tso_supported_qpts &
+		       (1 << IBV_QPT_RAW_PACKET)));
+	if (config->tso)
+		config->tso_max_payload_sz = sh->device_attr.max_tso;
+	/*
+	 * MPW is disabled by default, while the Enhanced MPW is enabled
+	 * by default.
+	 */
+	if (config->mps == MLX5_ARG_UNSET)
+		config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
+							  MLX5_MPW_DISABLED;
+	else
+		config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
+	DRV_LOG(INFO, "%sMPS is %s",
+		config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
+		config->mps == MLX5_MPW ? "legacy " : "",
+		config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+	if (config->devx) {
+		err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
+		if (err) {
+			err = -err;
+			goto error;
+		}
+		/* Check relax ordering support. */
+		if (!haswell_broadwell_cpu) {
+			sh->cmng.relaxed_ordering_write =
+				config->hca_attr.relaxed_ordering_write;
+			sh->cmng.relaxed_ordering_read =
+				config->hca_attr.relaxed_ordering_read;
+		} else {
+			sh->cmng.relaxed_ordering_read = 0;
+			sh->cmng.relaxed_ordering_write = 0;
+		}
+		sh->rq_ts_format = config->hca_attr.rq_ts_format;
+		sh->sq_ts_format = config->hca_attr.sq_ts_format;
+		sh->steering_format_version =
+			config->hca_attr.steering_format_version;
+		sh->qp_ts_format = config->hca_attr.qp_ts_format;
+		/* Check for LRO support. */
+		if (config->dest_tir && config->hca_attr.lro_cap &&
+		    config->dv_flow_en) {
+			/* TBD check tunnel lro caps. */
+			config->lro.supported = config->hca_attr.lro_cap;
+			DRV_LOG(DEBUG, "Device supports LRO");
+			/*
+			 * If LRO timeout is not configured by application,
+			 * use the minimal supported value.
+			 */
+			if (!config->lro.timeout)
+				config->lro.timeout =
+				config->hca_attr.lro_timer_supported_periods[0];
+			DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
+				config->lro.timeout);
+			DRV_LOG(DEBUG, "LRO minimal size of TCP segment "
+				"required for coalescing is %d bytes",
+				config->hca_attr.lro_min_mss_size);
+		}
+#if defined(HAVE_MLX5DV_DR) && \
+	(defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) || \
+	 defined(HAVE_MLX5_DR_CREATE_ACTION_ASO))
+		if (config->hca_attr.qos.sup &&
+		    config->hca_attr.qos.flow_meter_old &&
+		    config->dv_flow_en) {
+			uint8_t reg_c_mask =
+				config->hca_attr.qos.flow_meter_reg_c_ids;
+			/*
+			 * Meter needs two REG_C's for color match and pre-sfx
+			 * flow match. Here get the REG_C for color match.
+			 * REG_C_0 and REG_C_1 is reserved for metadata feature.
+			 */
+			reg_c_mask &= 0xfc;
+			if (__builtin_popcount(reg_c_mask) < 1) {
+				priv->mtr_en = 0;
+				DRV_LOG(WARNING, "No available register for"
+					" meter.");
+			} else {
+				/*
+				 * The meter color register is used by the
+				 * flow-hit feature as well.
+				 * The flow-hit feature must use REG_C_3
+				 * Prefer REG_C_3 if it is available.
+				 */
+				if (reg_c_mask & (1 << (REG_C_3 - REG_C_0)))
+					priv->mtr_color_reg = REG_C_3;
+				else
+					priv->mtr_color_reg = ffs(reg_c_mask)
+							      - 1 + REG_C_0;
+				priv->mtr_en = 1;
+				priv->mtr_reg_share =
+				      config->hca_attr.qos.flow_meter;
+				DRV_LOG(DEBUG, "The REG_C meter uses is %d",
+					priv->mtr_color_reg);
+			}
+		}
+		if (config->hca_attr.qos.sup &&
+			config->hca_attr.qos.flow_meter_aso_sup) {
+			uint32_t log_obj_size =
+				rte_log2_u32(MLX5_ASO_MTRS_PER_POOL >> 1);
+			if (log_obj_size >=
+			config->hca_attr.qos.log_meter_aso_granularity &&
+			log_obj_size <=
+			config->hca_attr.qos.log_meter_aso_max_alloc)
+				sh->meter_aso_en = 1;
+		}
+		if (priv->mtr_en) {
+			err = mlx5_aso_flow_mtrs_mng_init(priv->sh);
+			if (err) {
+				err = -err;
+				goto error;
+			}
+		}
+		if (config->hca_attr.flow.tunnel_header_0_1)
+			sh->tunnel_header_0_1 = 1;
+#endif
+#ifdef HAVE_MLX5_DR_CREATE_ACTION_ASO
+		if (config->hca_attr.flow_hit_aso &&
+		    priv->mtr_color_reg == REG_C_3) {
+			sh->flow_hit_aso_en = 1;
+			err = mlx5_flow_aso_age_mng_init(sh);
+			if (err) {
+				err = -err;
+				goto error;
+			}
+			DRV_LOG(DEBUG, "Flow Hit ASO is supported.");
+		}
+#endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO */
+#if defined(HAVE_MLX5_DR_CREATE_ACTION_ASO) && \
+	defined(HAVE_MLX5_DR_ACTION_ASO_CT)
+		if (config->hca_attr.ct_offload &&
+		    priv->mtr_color_reg == REG_C_3) {
+			err = mlx5_flow_aso_ct_mng_init(sh);
+			if (err) {
+				err = -err;
+				goto error;
+			}
+			DRV_LOG(DEBUG, "CT ASO is supported.");
+			sh->ct_aso_en = 1;
+		}
+#endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO && HAVE_MLX5_DR_ACTION_ASO_CT */
+#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE)
+		if (config->hca_attr.log_max_ft_sampler_num > 0  &&
+		    config->dv_flow_en) {
+			priv->sampler_en = 1;
+			DRV_LOG(DEBUG, "Sampler enabled!");
+		} else {
+			priv->sampler_en = 0;
+			if (!config->hca_attr.log_max_ft_sampler_num)
+				DRV_LOG(WARNING,
+					"No available register for sampler.");
+			else
+				DRV_LOG(DEBUG, "DV flow is not supported!");
+		}
+#endif
+	}
+	if (config->cqe_comp && RTE_CACHE_LINE_SIZE == 128 &&
+	    !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) {
+		DRV_LOG(WARNING, "Rx CQE 128B compression is not supported");
+		config->cqe_comp = 0;
+	}
+	if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX &&
+	    (!config->devx || !config->hca_attr.mini_cqe_resp_flow_tag)) {
+		DRV_LOG(WARNING, "Flow Tag CQE compression"
+				 " format isn't supported.");
+		config->cqe_comp = 0;
+	}
+	if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_L34H_STRIDX &&
+	    (!config->devx || !config->hca_attr.mini_cqe_resp_l3_l4_tag)) {
+		DRV_LOG(WARNING, "L3/L4 Header CQE compression"
+				 " format isn't supported.");
+		config->cqe_comp = 0;
+	}
+	DRV_LOG(DEBUG, "Rx CQE compression is %ssupported",
+			config->cqe_comp ? "" : "not ");
+	if (config->tx_pp) {
+		DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
+			config->hca_attr.dev_freq_khz);
+		DRV_LOG(DEBUG, "Packet pacing is %ssupported",
+			config->hca_attr.qos.packet_pacing ? "" : "not ");
+		DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
+			config->hca_attr.cross_channel ? "" : "not ");
+		DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
+			config->hca_attr.wqe_index_ignore ? "" : "not ");
+		DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
+			config->hca_attr.non_wire_sq ? "" : "not ");
+		DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
+			config->hca_attr.log_max_static_sq_wq ? "" : "not ",
+			config->hca_attr.log_max_static_sq_wq);
+		DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
+			config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
+		if (!config->devx) {
+			DRV_LOG(ERR, "DevX is required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.qos.packet_pacing) {
+			DRV_LOG(ERR, "Packet pacing is not supported");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.cross_channel) {
+			DRV_LOG(ERR, "Cross channel operations are"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.wqe_index_ignore) {
+			DRV_LOG(ERR, "WQE index ignore feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.non_wire_sq) {
+			DRV_LOG(ERR, "Non-wire SQ feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.log_max_static_sq_wq) {
+			DRV_LOG(ERR, "Static WQE SQ feature is"
+				     " required for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+		if (!config->hca_attr.qos.wqe_rate_pp) {
+			DRV_LOG(ERR, "WQE rate mode is required"
+				     " for packet pacing");
+			err = ENODEV;
+			goto error;
+		}
+#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
+		DRV_LOG(ERR, "DevX does not provide UAR offset,"
+			     " can't create queues for packet pacing");
+		err = ENODEV;
+		goto error;
+#endif
+	}
+	if (config->devx) {
+		uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];
+
+		err = config->hca_attr.access_register_user ?
+			mlx5_devx_cmd_register_read
+				(sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
+				reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
+		if (!err) {
+			uint32_t ts_mode;
+
+			/* MTUTC register is read successfully. */
+			ts_mode = MLX5_GET(register_mtutc, reg,
+					   time_stamp_mode);
+			if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
+				config->rt_timestamp = 1;
+		} else {
+			/* Kernel does not support register reading. */
+			if (config->hca_attr.dev_freq_khz ==
+						 (NS_PER_S / MS_PER_S))
+				config->rt_timestamp = 1;
+		}
+	}
+	/*
+	 * If HW has bug working with tunnel packet decapsulation and
+	 * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip
+	 * bit. Then DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
+	 */
+	if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
+		config->hw_fcs_strip = 0;
+	DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+		(config->hw_fcs_strip ? "" : "not "));
+	if (config->mprq.enabled && mprq) {
+		if (config->mprq.stride_num_n &&
+		    (config->mprq.stride_num_n > mprq_max_stride_num_n ||
+		     config->mprq.stride_num_n < mprq_min_stride_num_n)) {
+			config->mprq.stride_num_n =
+				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+						mprq_min_stride_num_n),
+					mprq_max_stride_num_n);
+			DRV_LOG(WARNING,
+				"the number of strides"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config->mprq.stride_num_n);
+		}
+		if (config->mprq.stride_size_n &&
+		    (config->mprq.stride_size_n > mprq_max_stride_size_n ||
+		     config->mprq.stride_size_n < mprq_min_stride_size_n)) {
+			config->mprq.stride_size_n =
+				RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
+						mprq_min_stride_size_n),
+					mprq_max_stride_size_n);
+			DRV_LOG(WARNING,
+				"the size of a stride"
+				" for Multi-Packet RQ is out of range,"
+				" setting default value (%u)",
+				1 << config->mprq.stride_size_n);
+		}
+		config->mprq.min_stride_size_n = mprq_min_stride_size_n;
+		config->mprq.max_stride_size_n = mprq_max_stride_size_n;
+	} else if (config->mprq.enabled && !mprq) {
+		DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
+		config->mprq.enabled = 0;
+	}
+	if (config->max_dump_files_num == 0)
+		config->max_dump_files_num = 128;
+	eth_dev = rte_eth_dev_allocate(name);
+	if (eth_dev == NULL) {
+		DRV_LOG(ERR, "can not allocate rte ethdev");
+		err = ENOMEM;
+		goto error;
+	}
+	if (priv->representor) {
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
+		eth_dev->data->representor_id = priv->representor_id;
+	}
+	priv->mp_id.port_id = eth_dev->data->port_id;
+	strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+	/*
+	 * Store associated network device interface index. This index
+	 * is permanent throughout the lifetime of device. So, we may store
+	 * the ifindex here and use the cached value further.
+	 */
+	MLX5_ASSERT(spawn->ifindex);
+	priv->if_index = spawn->ifindex;
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	eth_dev->data->mac_addrs = priv->mac;
+	eth_dev->device = dpdk_dev;
+	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+	/* Configure the first MAC address by default. */
+	if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+		DRV_LOG(ERR,
+			"port %u cannot get MAC address, is mlx5_en"
+			" loaded? (errno: %s)",
+			eth_dev->data->port_id, strerror(rte_errno));
+		err = ENODEV;
+		goto error;
+	}
+	DRV_LOG(INFO,
+		"port %u MAC address is " RTE_ETHER_ADDR_PRT_FMT,
+		eth_dev->data->port_id, RTE_ETHER_ADDR_BYTES(&mac));
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+	{
+		char ifname[MLX5_NAMESIZE];
+
+		if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+			DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+				eth_dev->data->port_id, ifname);
+		else
+			DRV_LOG(DEBUG, "port %u ifname is unknown",
+				eth_dev->data->port_id);
+	}
+#endif
+	/* Get actual MTU if possible. */
+	err = mlx5_get_mtu(eth_dev, &priv->mtu);
+	if (err) {
+		err = rte_errno;
+		goto error;
+	}
+	DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+		priv->mtu);
+	/* Initialize burst functions to prevent crashes before link-up. */
+	eth_dev->rx_pkt_burst = removed_rx_burst;
+	eth_dev->tx_pkt_burst = removed_tx_burst;
+	eth_dev->dev_ops = &mlx5_dev_ops;
+	eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
+	eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
+	eth_dev->rx_queue_count = mlx5_rx_queue_count;
+	/* Register MAC address. */
+	claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+	if (config->vf && config->vf_nl_en)
+		mlx5_nl_mac_addr_sync(priv->nl_socket_route,
+				      mlx5_ifindex(eth_dev),
+				      eth_dev->data->mac_addrs,
+				      MLX5_MAX_MAC_ADDRESSES);
+	priv->ctrl_flows = 0;
+	rte_spinlock_init(&priv->flow_list_lock);
+	TAILQ_INIT(&priv->flow_meters);
+	priv->mtr_profile_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_PTR);
+	if (!priv->mtr_profile_tbl)
+		goto error;
+	/* Hint libmlx5 to use PMD allocator for data plane resources */
+	mlx5_glue->dv_set_context_attr(sh->ctx,
+			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
+				.alloc = &mlx5_alloc_verbs_buf,
+				.free = &mlx5_free_verbs_buf,
+				.data = sh,
+			}));
+	/* Bring Ethernet device up. */
+	DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+		eth_dev->data->port_id);
+	mlx5_set_link_up(eth_dev);
+	/*
+	 * Even though the interrupt handler is not installed yet,
+	 * interrupts will still trigger on the async_fd from
+	 * Verbs context returned by ibv_open_device().
+	 */
+	mlx5_link_update(eth_dev, 0);
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+	if (!(config->hca_attr.eswitch_manager && config->dv_flow_en &&
+	      (switch_info->representor || switch_info->master)))
+		config->dv_esw_en = 0;
+#else
+	config->dv_esw_en = 0;
+#endif
+	/* Detect minimal data bytes to inline. */
+	mlx5_set_min_inline(spawn, config);
+	/* Store device configuration on private structure. */
+	priv->config = *config;
+	for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) {
+		icfg[i].release_mem_en = !!config->reclaim_mode;
+		if (config->reclaim_mode)
+			icfg[i].per_core_cache = 0;
+		priv->flows[i] = mlx5_ipool_create(&icfg[i]);
+		if (!priv->flows[i])
+			goto error;
+	}
+	/* Create context for virtual machine VLAN workaround. */
+	priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
+	if (config->dv_flow_en) {
+		err = mlx5_alloc_shared_dr(priv);
+		if (err)
+			goto error;
+	}
+	if (config->devx && config->dv_flow_en && config->dest_tir) {
+		priv->obj_ops = devx_obj_ops;
+		priv->obj_ops.drop_action_create =
+						ibv_obj_ops.drop_action_create;
+		priv->obj_ops.drop_action_destroy =
+						ibv_obj_ops.drop_action_destroy;
+#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
+		priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
+#else
+		if (config->dv_esw_en)
+			priv->obj_ops.txq_obj_modify =
+						ibv_obj_ops.txq_obj_modify;
+#endif
+		/* Use specific wrappers for Tx object. */
+		priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
+		priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
+		mlx5_queue_counter_id_prepare(eth_dev);
+		priv->obj_ops.lb_dummy_queue_create =
+					mlx5_rxq_ibv_obj_dummy_lb_create;
+		priv->obj_ops.lb_dummy_queue_release =
+					mlx5_rxq_ibv_obj_dummy_lb_release;
+	} else {
+		priv->obj_ops = ibv_obj_ops;
+	}
+	if (config->tx_pp &&
+	    (priv->config.dv_esw_en ||
+	     priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+		/*
+		 * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
+		 * packet pacing and already checked above.
+		 * Hence, we should only make sure the SQs will be created
+		 * with DevX, not with Verbs.
+		 * Verbs allocates the SQ UAR on its own and it can't be shared
+		 * with Clock Queue UAR as required for Tx scheduling.
+		 */
+		DRV_LOG(ERR, "Verbs SQs, UAR can't be shared as required for packet pacing");
+		err = ENODEV;
+		goto error;
+	}
+	priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
+	if (!priv->drop_queue.hrxq)
+		goto error;
+	/* Supported Verbs flow priority number detection. */
+	err = mlx5_flow_discover_priorities(eth_dev);
+	if (err < 0) {
+		err = -err;
+		goto error;
+	}
+	priv->config.flow_prio = err;
+	if (!priv->config.dv_esw_en &&
+	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+		DRV_LOG(WARNING, "metadata mode %u is not supported "
+				 "(no E-Switch)", priv->config.dv_xmeta_en);
+		priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
+	}
+	mlx5_set_metadata_mask(eth_dev);
+	if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+	    !priv->sh->dv_regc0_mask) {
+		DRV_LOG(ERR, "metadata mode %u is not supported "
+			     "(no metadata reg_c[0] is available)",
+			     priv->config.dv_xmeta_en);
+			err = ENOTSUP;
+			goto error;
+	}
+	priv->hrxqs = mlx5_list_create("hrxq", eth_dev, true,
+				       mlx5_hrxq_create_cb,
+				       mlx5_hrxq_match_cb,
+				       mlx5_hrxq_remove_cb,
+				       mlx5_hrxq_clone_cb,
+				       mlx5_hrxq_clone_free_cb);
+	if (!priv->hrxqs)
+		goto error;
+	rte_rwlock_init(&priv->ind_tbls_lock);
+	/* Query availability of metadata reg_c's. */
+	err = mlx5_flow_discover_mreg_c(eth_dev);
+	if (err < 0) {
+		err = -err;
+		goto error;
+	}
+	if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
+		DRV_LOG(DEBUG,
+			"port %u extensive metadata register is not supported",
+			eth_dev->data->port_id);
+		if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+			DRV_LOG(ERR, "metadata mode %u is not supported "
+				     "(no metadata registers available)",
+				     priv->config.dv_xmeta_en);
+			err = ENOTSUP;
+			goto error;
+		}
+	}
+	if (priv->config.dv_flow_en &&
+	    priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+	    mlx5_flow_ext_mreg_supported(eth_dev) &&
+	    priv->sh->dv_regc0_mask) {
+		priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
+						      MLX5_FLOW_MREG_HTABLE_SZ,
+						      false, true, eth_dev,
+						      flow_dv_mreg_create_cb,
+						      flow_dv_mreg_match_cb,
+						      flow_dv_mreg_remove_cb,
+						      flow_dv_mreg_clone_cb,
+						    flow_dv_mreg_clone_free_cb);
+		if (!priv->mreg_cp_tbl) {
+			err = ENOMEM;
+			goto error;
+		}
+	}
+	rte_spinlock_init(&priv->shared_act_sl);
+	mlx5_flow_counter_mode_config(eth_dev);
+	mlx5_flow_drop_action_config(eth_dev);
+	if (priv->config.dv_flow_en)
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE;
+	return eth_dev;
+error:
+	if (priv) {
+		if (priv->mreg_cp_tbl)
+			mlx5_hlist_destroy(priv->mreg_cp_tbl);
+		if (priv->sh)
+			mlx5_os_free_shared_dr(priv);
+		if (priv->nl_socket_route >= 0)
+			close(priv->nl_socket_route);
+		if (priv->nl_socket_rdma >= 0)
+			close(priv->nl_socket_rdma);
+		if (priv->vmwa_context)
+			mlx5_vlan_vmwa_exit(priv->vmwa_context);
+		if (eth_dev && priv->drop_queue.hrxq)
+			mlx5_drop_action_destroy(eth_dev);
+		if (priv->mtr_profile_tbl)
+			mlx5_l3t_destroy(priv->mtr_profile_tbl);
+		if (own_domain_id)
+			claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+		if (priv->hrxqs)
+			mlx5_list_destroy(priv->hrxqs);
+		mlx5_free(priv);
+		if (eth_dev != NULL)
+			eth_dev->data->dev_private = NULL;
+	}
+	if (eth_dev != NULL) {
+		/* mac_addrs must not be freed alone because part of
+		 * dev_private
+		 **/
+		eth_dev->data->mac_addrs = NULL;
+		rte_eth_dev_release_port(eth_dev);
+	}
+	if (sh)
+		mlx5_free_shared_dev_ctx(sh);
+	MLX5_ASSERT(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Comparison callback to sort device spawn data.
+ *
+ * This is meant to be used with qsort() over an array of
+ * struct mlx5_dev_spawn_data elements; only the embedded switch
+ * information of each element is inspected.
+ *
+ * @param[in] a
+ *   Pointer to first spawn data object.
+ * @param[in] b
+ *   Pointer to second spawn data object.
+ *
+ * @return
+ *   0 if both objects are equal, less than 0 if the first argument is less
+ *   than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_dev_spawn_data_cmp(const void *a, const void *b)
+{
+	const struct mlx5_switch_info *si_a =
+		&((const struct mlx5_dev_spawn_data *)a)->info;
+	const struct mlx5_switch_info *si_b =
+		&((const struct mlx5_dev_spawn_data *)b)->info;
+	int ret;
+
+	/* Master device first. */
+	ret = si_b->master - si_a->master;
+	if (ret)
+		return ret;
+	/* Then representor devices. */
+	ret = si_b->representor - si_a->representor;
+	if (ret)
+		return ret;
+	/* Unidentified devices come last in no specific order. */
+	if (!si_a->representor)
+		return 0;
+	/*
+	 * Order representors by port name. Compare instead of subtracting
+	 * so the result keeps the right sign even for distant values.
+	 */
+	return (si_a->port_name > si_b->port_name) -
+	       (si_a->port_name < si_b->port_name);
+}
+
+/**
+ * Match PCI information for possible slaves of bonding device.
+ *
+ * The bonding master is searched through the network interfaces attached
+ * to the ports of the given Infiniband device; once its sysfs
+ * "bonding/slaves" file can be opened, every slave interface listed there
+ * is inspected and recorded in the bonding information.
+ *
+ * @param[in] ibv_dev
+ *   Pointer to Infiniband device structure.
+ * @param[in] pci_dev
+ *   Pointer to primary PCI address structure to match.
+ * @param[in] nl_rdma
+ *   Netlink RDMA group socket handle.
+ * @param[in] owner
+ *   Representor owner PF index.
+ * @param[out] bond_info
+ *   Pointer to bonding information, always zeroed first.
+ *
+ * @return
+ *   negative value if no bonding device found, otherwise
+ *   non-negative index of slave PF in bonding.
+ */
+static int
+mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
+			   const struct rte_pci_addr *pci_dev,
+			   int nl_rdma, uint16_t owner,
+			   struct mlx5_bond_info *bond_info)
+{
+	char ifname[IF_NAMESIZE + 1];
+	unsigned int ifindex;
+	unsigned int np, i;
+	FILE *bond_file = NULL, *file;
+	int pf = -1;
+	int ret;
+
+	/*
+	 * Try to get master device name. If something goes
+	 * wrong suppose the lack of kernel support and no
+	 * bonding devices.
+	 */
+	memset(bond_info, 0, sizeof(*bond_info));
+	if (nl_rdma < 0)
+		return -1;
+	if (!strstr(ibv_dev->name, "bond"))
+		return -1;
+	np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
+	if (!np)
+		return -1;
+	/*
+	 * The Master device might not be on the predefined
+	 * port (not on port index 1, it is not guaranteed),
+	 * we have to scan all Infiniband device ports and
+	 * find master.
+	 */
+	for (i = 1; i <= np; ++i) {
+		/* Check whether Infiniband port is populated. */
+		ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
+		if (!ifindex)
+			continue;
+		if (!if_indextoname(ifindex, ifname))
+			continue;
+		/* Try to read bonding slave names from sysfs. */
+		MKSTR(slaves,
+		      "/sys/class/net/%s/master/bonding/slaves", ifname);
+		bond_file = fopen(slaves, "r");
+		if (bond_file)
+			break;
+	}
+	if (!bond_file)
+		return -1;
+	/* Use safe format to check maximal buffer length. */
+	MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
+	while (fscanf(bond_file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
+		char tmp_str[IF_NAMESIZE + 32];
+		struct rte_pci_addr pci_addr;
+		struct mlx5_switch_info	info;
+
+		/* Process slave interface names in the loop. */
+		snprintf(tmp_str, sizeof(tmp_str),
+			 "/sys/class/net/%s", ifname);
+		if (mlx5_get_pci_addr(tmp_str, &pci_addr)) {
+			DRV_LOG(WARNING, "can not get PCI address"
+					 " for netdev \"%s\"", ifname);
+			continue;
+		}
+		/* Slave interface PCI address match found. */
+		snprintf(tmp_str, sizeof(tmp_str),
+			 "/sys/class/net/%s/phys_port_name", ifname);
+		file = fopen(tmp_str, "rb");
+		if (!file)
+			break;
+		info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
+		if (fscanf(file, "%32s", tmp_str) == 1)
+			mlx5_translate_port_name(tmp_str, &info);
+		fclose(file);
+		/* Only process PF ports. */
+		if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_LEGACY &&
+		    info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
+			continue;
+		/* Check max bonding member. */
+		if (info.port_name >= MLX5_BOND_MAX_PORTS) {
+			DRV_LOG(WARNING, "bonding index out of range, "
+				"please increase MLX5_BOND_MAX_PORTS: %s",
+				tmp_str);
+			break;
+		}
+		/* Match PCI address, allows BDF0+pfx or BDFx+pfx. */
+		if (pci_dev->domain == pci_addr.domain &&
+		    pci_dev->bus == pci_addr.bus &&
+		    pci_dev->devid == pci_addr.devid &&
+		    ((pci_dev->function == 0 &&
+		      pci_dev->function + owner == pci_addr.function) ||
+		     (pci_dev->function == owner &&
+		      pci_addr.function == owner)))
+			pf = info.port_name;
+		/* Get ifindex. */
+		snprintf(tmp_str, sizeof(tmp_str),
+			 "/sys/class/net/%s/ifindex", ifname);
+		file = fopen(tmp_str, "rb");
+		if (!file)
+			break;
+		ret = fscanf(file, "%u", &ifindex);
+		fclose(file);
+		if (ret != 1)
+			break;
+		/* Save bonding info. */
+		strncpy(bond_info->ports[info.port_name].ifname, ifname,
+			sizeof(bond_info->ports[0].ifname));
+		bond_info->ports[info.port_name].pci_addr = pci_addr;
+		bond_info->ports[info.port_name].ifindex = ifindex;
+		bond_info->n_port++;
+	}
+	/* The slaves list is fully consumed, do not leak the stream. */
+	fclose(bond_file);
+	if (pf >= 0) {
+		/*
+		 * Get bond interface info, ifindex holds the last
+		 * interface index retrieved above.
+		 */
+		ret = mlx5_sysfs_bond_info(ifindex, &bond_info->ifindex,
+					   bond_info->ifname);
+		if (ret)
+			DRV_LOG(ERR, "unable to get bond info: %s",
+				strerror(rte_errno));
+		else
+			DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
+				ifindex, bond_info->ifindex, bond_info->ifname);
+	}
+	return pf;
+}
+
+/*
+ * Reset a device configuration to the driver defaults: everything is
+ * zeroed, then the options that are not zero by default are filled in.
+ */
+static void
+mlx5_os_config_default(struct mlx5_dev_config *config)
+{
+	memset(config, 0, sizeof(*config));
+	/* Options left undecided until devargs or the device settle them. */
+	config->mps = MLX5_ARG_UNSET;
+	config->dbnc = MLX5_ARG_UNSET;
+	config->log_hp_size = MLX5_ARG_UNSET;
+	config->txqs_inline = MLX5_ARG_UNSET;
+	config->txq_inline_min = MLX5_ARG_UNSET;
+	config->txq_inline_max = MLX5_ARG_UNSET;
+	config->txq_inline_mpw = MLX5_ARG_UNSET;
+	/* Features switched on by default. */
+	config->rx_vec_en = 1;
+	config->vf_nl_en = 1;
+	config->mr_ext_memseg_en = 1;
+	config->dv_flow_en = 1;
+	config->dv_esw_en = 1;
+	config->decap_en = 1;
+	config->allow_duplicate_pattern = 1;
+	/* Multi-Packet RQ defaults. */
+	config->mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
+	config->mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
+}
+
+/**
+ * Register a PCI device within bonding.
+ *
+ * This function spawns Ethernet devices out of a given PCI device and
+ * bonding owner PF index.
+ *
+ * The work is done in stages: list the Infiniband devices and keep those
+ * matching the PCI address (or the bonding device), build the array of
+ * spawn data entries (one per port or per device), sort it, then spawn
+ * an Ethernet device for every entry. Netlink sockets, the spawn data
+ * array and the Infiniband device list are released before returning.
+ *
+ * @param[in] pci_dev
+ *   PCI device information.
+ * @param[in] req_eth_da
+ *   Requested ethdev device argument.
+ * @param[in] owner_id
+ *   Requested owner PF port ID within bonding device, default to 0.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_pci_probe_pf(struct rte_pci_device *pci_dev,
+		     struct rte_eth_devargs *req_eth_da,
+		     uint16_t owner_id)
+{
+	struct ibv_device **ibv_list;
+	/*
+	 * Number of found IB Devices matching with requested PCI BDF.
+	 * nd != 1 means there are multiple IB devices over the same
+	 * PCI device and we have representors and master.
+	 */
+	unsigned int nd = 0;
+	/*
+	 * Number of found IB device Ports. nd = 1 and np = 1..n means
+	 * we have the single multiport IB device, and there may be
+	 * representors attached to some of found ports.
+	 */
+	unsigned int np = 0;
+	/*
+	 * Number of DPDK ethernet devices to Spawn - either over
+	 * multiple IB devices or multiple ports of single IB device.
+	 * Actually this is the number of iterations to spawn.
+	 */
+	unsigned int ns = 0;
+	/*
+	 * Bonding device
+	 *   < 0 - no bonding device (single one)
+	 *  >= 0 - bonding device (value is slave PF index)
+	 */
+	int bd = -1;
+	struct mlx5_dev_spawn_data *list = NULL;
+	struct mlx5_dev_config dev_config;
+	unsigned int dev_config_vf;
+	/* Local copy, defaults are filled in below without touching the request. */
+	struct rte_eth_devargs eth_da = *req_eth_da;
+	struct rte_pci_addr owner_pci = pci_dev->addr; /* Owner PF. */
+	struct mlx5_bond_info bond_info;
+	int ret = -1;
+
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
+		return -rte_errno;
+	}
+	/*
+	 * First scan the list of all Infiniband devices to find
+	 * matching ones, gathering into the list.
+	 * ret was filled by get_device_list() and is used below as the
+	 * number of entries in ibv_list; one extra slot holds the
+	 * terminating NULL.
+	 */
+	struct ibv_device *ibv_match[ret + 1];
+	int nl_route = mlx5_nl_init(NETLINK_ROUTE);
+	int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
+	unsigned int i;
+
+	/* Walk the device list backwards, ret doubles as the index. */
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		bd = mlx5_device_bond_pci_match
+				(ibv_list[ret], &owner_pci, nl_rdma, owner_id,
+				 &bond_info);
+		if (bd >= 0) {
+			/*
+			 * Bonding device detected. Only one match is allowed,
+			 * the bonding is supported over multi-port IB device,
+			 * there should be no matches on representor PCI
+			 * functions or non VF LAG bonding devices with
+			 * specified address.
+			 */
+			if (nd) {
+				DRV_LOG(ERR,
+					"multiple PCI match on bonding device"
+					"\"%s\" found", ibv_list[ret]->name);
+				rte_errno = ENOENT;
+				ret = -rte_errno;
+				goto exit;
+			}
+			/* Amend owner pci address if owner PF ID specified. */
+			if (eth_da.nb_representor_ports)
+				owner_pci.function += owner_id;
+			DRV_LOG(INFO, "PCI information matches for"
+				      " slave %d bonding device \"%s\"",
+				      bd, ibv_list[ret]->name);
+			ibv_match[nd++] = ibv_list[ret];
+			break;
+		} else {
+			/* Bonding device not found. */
+			if (mlx5_get_pci_addr(ibv_list[ret]->ibdev_path,
+					      &pci_addr))
+				continue;
+			if (owner_pci.domain != pci_addr.domain ||
+			    owner_pci.bus != pci_addr.bus ||
+			    owner_pci.devid != pci_addr.devid ||
+			    owner_pci.function != pci_addr.function)
+				continue;
+			DRV_LOG(INFO, "PCI information matches for device \"%s\"",
+				ibv_list[ret]->name);
+			ibv_match[nd++] = ibv_list[ret];
+		}
+	}
+	ibv_match[nd] = NULL;
+	if (!nd) {
+		/* No device matches, just complain and bail out. */
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI device " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			owner_pci.domain, owner_pci.bus,
+			owner_pci.devid, owner_pci.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+		goto exit;
+	}
+	if (nd == 1) {
+		/*
+		 * Found single matching device may have multiple ports.
+		 * Each port may be representor, we have to check the port
+		 * number and check the representors existence.
+		 */
+		if (nl_rdma >= 0)
+			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
+		if (!np)
+			DRV_LOG(WARNING, "can not get IB device \"%s\""
+					 " ports number", ibv_match[0]->name);
+		if (bd >= 0 && !np) {
+			DRV_LOG(ERR, "can not get ports"
+				     " for bonding device");
+			rte_errno = ENOENT;
+			ret = -rte_errno;
+			goto exit;
+		}
+	}
+	/*
+	 * Now we can determine the maximal
+	 * amount of devices to be spawned.
+	 */
+	list = mlx5_malloc(MLX5_MEM_ZERO,
+			   sizeof(struct mlx5_dev_spawn_data) *
+			   (np ? np : nd),
+			   RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+	if (!list) {
+		DRV_LOG(ERR, "spawn data array allocation failure");
+		rte_errno = ENOMEM;
+		ret = -rte_errno;
+		goto exit;
+	}
+	if (bd >= 0 || np > 1) {
+		/*
+		 * Single IB device with multiple ports found,
+		 * it may be E-Switch master device and representors.
+		 * We have to perform identification through the ports.
+		 */
+		MLX5_ASSERT(nl_rdma >= 0);
+		MLX5_ASSERT(ns == 0);
+		MLX5_ASSERT(nd == 1);
+		MLX5_ASSERT(np);
+		/*
+		 * list[ns] is filled for every port but ns only advances
+		 * when the port is accepted, so a rejected entry is
+		 * overwritten by the next port.
+		 */
+		for (i = 1; i <= np; ++i) {
+			list[ns].bond_info = &bond_info;
+			list[ns].max_port = np;
+			list[ns].phys_port = i;
+			list[ns].phys_dev = ibv_match[0];
+			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
+			list[ns].pf_bond = bd;
+			list[ns].ifindex = mlx5_nl_ifindex
+				(nl_rdma,
+				mlx5_os_get_dev_device_name
+						(list[ns].phys_dev), i);
+			if (!list[ns].ifindex) {
+				/*
+				 * No network interface index found for the
+				 * specified port, it means there is no
+				 * representor on this port. It's OK,
+				 * there can be disabled ports, for example
+				 * if sriov_numvfs < sriov_totalvfs.
+				 */
+				continue;
+			}
+			ret = -1;
+			if (nl_route >= 0)
+				ret = mlx5_nl_switch_info
+					       (nl_route,
+						list[ns].ifindex,
+						&list[ns].info);
+			if (ret || (!list[ns].info.representor &&
+				    !list[ns].info.master)) {
+				/*
+				 * We failed to recognize representors with
+				 * Netlink, let's try to perform the task
+				 * with sysfs.
+				 */
+				ret =  mlx5_sysfs_switch_info
+						(list[ns].ifindex,
+						 &list[ns].info);
+			}
+			if (!ret && bd >= 0) {
+				/* Bonding: keep only ports of the slave PF. */
+				switch (list[ns].info.name_type) {
+				case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+					if (np == 1) {
+						/*
+						 * Force standalone bonding
+						 * device for ROCE LAG
+						 * configurations.
+						 */
+						list[ns].info.master = 0;
+						list[ns].info.representor = 0;
+					}
+					if (list[ns].info.port_name == bd)
+						ns++;
+					break;
+				case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+					/* Fallthrough */
+				case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+					/* Fallthrough */
+				case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+					if (list[ns].info.pf_num == bd)
+						ns++;
+					break;
+				default:
+					break;
+				}
+				continue;
+			}
+			/* Accept the port only if it is exactly one of the two. */
+			if (!ret && (list[ns].info.representor ^
+				     list[ns].info.master))
+				ns++;
+		}
+		if (!ns) {
+			DRV_LOG(ERR,
+				"unable to recognize master/representors"
+				" on the IB device with multiple ports");
+			rte_errno = ENOENT;
+			ret = -rte_errno;
+			goto exit;
+		}
+	} else {
+		/*
+		 * The existence of several matching entries (nd > 1) means
+		 * port representors have been instantiated. No existing Verbs
+		 * call nor sysfs entries can tell them apart, this can only
+		 * be done through Netlink calls assuming kernel drivers are
+		 * recent enough to support them.
+		 *
+		 * In the event of identification failure through Netlink,
+		 * try again through sysfs, then:
+		 *
+		 * 1. A single IB device matches (nd == 1) with single
+		 *    port (np=0/1) and is not a representor, assume
+		 *    no switch support.
+		 *
+		 * 2. Otherwise no safe assumptions can be made;
+		 *    complain louder and bail out.
+		 */
+		for (i = 0; i != nd; ++i) {
+			memset(&list[ns].info, 0, sizeof(list[ns].info));
+			list[ns].bond_info = NULL;
+			list[ns].max_port = 1;
+			list[ns].phys_port = 1;
+			list[ns].phys_dev = ibv_match[i];
+			list[ns].eth_dev = NULL;
+			list[ns].pci_dev = pci_dev;
+			list[ns].pf_bond = -1;
+			list[ns].ifindex = 0;
+			if (nl_rdma >= 0)
+				list[ns].ifindex = mlx5_nl_ifindex
+				(nl_rdma,
+				mlx5_os_get_dev_device_name
+						(list[ns].phys_dev), 1);
+			if (!list[ns].ifindex) {
+				char ifname[IF_NAMESIZE];
+
+				/*
+				 * Netlink failed, it may happen with old
+				 * ib_core kernel driver (before 4.16).
+				 * We can assume there is old driver because
+				 * here we are processing single ports IB
+				 * devices. Let's try sysfs to retrieve
+				 * the ifindex. The method works for
+				 * master device only.
+				 */
+				if (nd > 1) {
+					/*
+					 * Multiple devices found, assume
+					 * representors, can not distinguish
+					 * master/representor and retrieve
+					 * ifindex via sysfs.
+					 */
+					continue;
+				}
+				ret = mlx5_get_ifname_sysfs
+					(ibv_match[i]->ibdev_path, ifname);
+				if (!ret)
+					list[ns].ifindex =
+						if_nametoindex(ifname);
+				if (!list[ns].ifindex) {
+					/*
+					 * No network interface index found
+					 * for the specified device, it means
+					 * it is neither representor
+					 * nor master.
+					 */
+					continue;
+				}
+			}
+			ret = -1;
+			if (nl_route >= 0)
+				ret = mlx5_nl_switch_info
+					       (nl_route,
+						list[ns].ifindex,
+						&list[ns].info);
+			if (ret || (!list[ns].info.representor &&
+				    !list[ns].info.master)) {
+				/*
+				 * We failed to recognize representors with
+				 * Netlink, let's try to perform the task
+				 * with sysfs.
+				 */
+				ret =  mlx5_sysfs_switch_info
+						(list[ns].ifindex,
+						 &list[ns].info);
+			}
+			if (!ret && (list[ns].info.representor ^
+				     list[ns].info.master)) {
+				ns++;
+			} else if ((nd == 1) &&
+				   !list[ns].info.representor &&
+				   !list[ns].info.master) {
+				/*
+				 * Single IB device with
+				 * one physical port and
+				 * attached network device.
+				 * Maybe SRIOV is not enabled
+				 * or there are no representors.
+				 */
+				DRV_LOG(INFO, "no E-Switch support detected");
+				ns++;
+				break;
+			}
+		}
+		if (!ns) {
+			DRV_LOG(ERR,
+				"unable to recognize master/representors"
+				" on the multiple IB devices");
+			rte_errno = ENOENT;
+			ret = -rte_errno;
+			goto exit;
+		}
+		/*
+		 * New kernels may add the switch_id attribute for the case
+		 * there is no E-Switch and we wrongly recognized the
+		 * only device as master. Override this if there is the
+		 * single device with single port and new device name
+		 * format present.
+		 */
+		if (nd == 1 &&
+		    list[0].info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) {
+			list[0].info.master = 0;
+			list[0].info.representor = 0;
+		}
+	}
+	MLX5_ASSERT(ns);
+	/*
+	 * Sort list to probe devices in natural order for users convenience
+	 * (i.e. master first, then representors from lowest to highest ID).
+	 */
+	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
+	/* Device specific configuration. */
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTXVF:
+		dev_config_vf = 1;
+		break;
+	default:
+		dev_config_vf = 0;
+		break;
+	}
+	if (eth_da.type != RTE_ETH_REPRESENTOR_NONE) {
+		/* Set devargs default values. */
+		if (eth_da.nb_mh_controllers == 0) {
+			eth_da.nb_mh_controllers = 1;
+			eth_da.mh_controllers[0] = 0;
+		}
+		if (eth_da.nb_ports == 0 && ns > 0) {
+			if (list[0].pf_bond >= 0 && list[0].info.representor)
+				DRV_LOG(WARNING, "Representor on Bonding device should use pf#vf# syntax: %s",
+					pci_dev->device.devargs->args);
+			eth_da.nb_ports = 1;
+			eth_da.ports[0] = list[0].info.pf_num;
+		}
+		if (eth_da.nb_representor_ports == 0) {
+			eth_da.nb_representor_ports = 1;
+			eth_da.representor_ports[0] = 0;
+		}
+	}
+	/* Spawn one Ethernet device per collected entry. */
+	for (i = 0; i != ns; ++i) {
+		uint32_t restore;
+
+		/* Default configuration. */
+		mlx5_os_config_default(&dev_config);
+		dev_config.vf = dev_config_vf;
+		list[i].numa_node = pci_dev->device.numa_node;
+		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
+						 &list[i],
+						 &dev_config,
+						 &eth_da);
+		if (!list[i].eth_dev) {
+			/* Any other error aborts the loop with i != ns. */
+			if (rte_errno != EBUSY && rte_errno != EEXIST)
+				break;
+			/* Device is disabled or already spawned. Ignore it. */
+			continue;
+		}
+		restore = list[i].eth_dev->data->dev_flags;
+		rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
+		/**
+		 * Each representor has a dedicated interrupts vector.
+		 * rte_eth_copy_pci_info() assigns PF interrupts handle to
+		 * representor eth_dev object because representor and PF
+		 * share the same PCI address.
+		 * Override representor device with a dedicated
+		 * interrupts handle here.
+		 * Representor interrupts handle is released in mlx5_dev_stop().
+		 */
+		if (list[i].info.representor) {
+			struct rte_intr_handle *intr_handle;
+			intr_handle = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO,
+						  sizeof(*intr_handle), 0,
+						  SOCKET_ID_ANY);
+			if (!intr_handle) {
+				DRV_LOG(ERR,
+					"port %u failed to allocate memory for interrupt handler "
+					"Rx interrupts will not be supported",
+					i);
+				rte_errno = ENOMEM;
+				ret = -rte_errno;
+				/*
+				 * NOTE(review): this jumps straight to the
+				 * common cleanup, skipping the roll back
+				 * below, so devices spawned so far are not
+				 * closed on this path - confirm whether
+				 * this is intended.
+				 */
+				goto exit;
+			}
+			list[i].eth_dev->intr_handle = intr_handle;
+		}
+		/* Restore non-PCI flags cleared by the above call. */
+		list[i].eth_dev->data->dev_flags |= restore;
+		rte_eth_dev_probing_finish(list[i].eth_dev);
+	}
+	if (i != ns) {
+		DRV_LOG(ERR,
+			"probe of PCI device " PCI_PRI_FMT " aborted after"
+			" encountering an error: %s",
+			owner_pci.domain, owner_pci.bus,
+			owner_pci.devid, owner_pci.function,
+			strerror(rte_errno));
+		ret = -rte_errno;
+		/* Roll back the devices spawned before the failing one. */
+		while (i--) {
+			if (!list[i].eth_dev)
+				continue;
+			mlx5_dev_close(list[i].eth_dev);
+			/* mac_addrs must not be freed because in dev_private */
+			list[i].eth_dev->data->mac_addrs = NULL;
+			claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
+		}
+		/* Restore original error. */
+		rte_errno = -ret;
+	} else {
+		ret = 0;
+	}
+exit:
+	/*
+	 * Do the routine cleanup:
+	 * - close opened Netlink sockets
+	 * - free allocated spawn data array
+	 * - free the Infiniband device list
+	 */
+	if (nl_rdma >= 0)
+		close(nl_rdma);
+	if (nl_route >= 0)
+		close(nl_route);
+	if (list)
+		mlx5_free(list);
+	MLX5_ASSERT(ibv_list);
+	mlx5_glue->free_device_list(ibv_list);
+	return ret;
+}
+
+/*
+ * Fill eth_da from the device arguments attached to dev.
+ *
+ * The class argument string is parsed first when present; the legacy
+ * argument string is parsed only if no representor type was found.
+ * When the device has no devargs, eth_da is left untouched.
+ * Returns 0 on success, -rte_errno when a parsing call fails.
+ */
+static int
+mlx5_os_parse_eth_devargs(struct rte_device *dev,
+			  struct rte_eth_devargs *eth_da)
+{
+	if (dev->devargs == NULL)
+		return 0;
+	memset(eth_da, 0, sizeof(*eth_da));
+	/* Representor information comes from the class argument first. */
+	if (dev->devargs->cls_str &&
+	    rte_eth_devargs_parse(dev->devargs->cls_str, eth_da) != 0) {
+		DRV_LOG(ERR, "failed to parse device arguments: %s",
+			dev->devargs->cls_str);
+		return -rte_errno;
+	}
+	/* A representor type was already found, nothing more to parse. */
+	if (eth_da->type != RTE_ETH_REPRESENTOR_NONE)
+		return 0;
+	/* Fall back to the legacy device argument. */
+	if (rte_eth_devargs_parse(dev->devargs->args, eth_da) != 0) {
+		DRV_LOG(ERR, "failed to parse device arguments: %s",
+			dev->devargs->args);
+		return -rte_errno;
+	}
+	return 0;
+}
+
+/**
+ * Callback to register a PCI device.
+ *
+ * This function spawns Ethernet devices out of a given PCI device.
+ * When the device arguments list PF ports, each listed PF is probed in
+ * turn and the whole device is removed on the first failure; otherwise
+ * a single probe with owner PF 0 is performed.
+ *
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_pci_probe(struct rte_pci_device *pci_dev)
+{
+	struct rte_eth_devargs eth_da = { .nb_ports = 0 };
+	uint16_t pf_idx;
+	int rc;
+
+	rc = mlx5_os_parse_eth_devargs(&pci_dev->device, &eth_da);
+	if (rc != 0)
+		return rc;
+
+	/* No PF list requested: single probe with the default owner. */
+	if (eth_da.nb_ports == 0)
+		return mlx5_os_pci_probe_pf(pci_dev, &eth_da, 0);
+
+	/* Iterate all port if devargs pf is range: "pf[0-1]vf[...]". */
+	for (pf_idx = 0; pf_idx < eth_da.nb_ports; pf_idx++) {
+		rc = mlx5_os_pci_probe_pf(pci_dev, &eth_da,
+					  eth_da.ports[pf_idx]);
+		if (rc == 0)
+			continue;
+		DRV_LOG(ERR, "Probe of PCI device " PCI_PRI_FMT " "
+			"aborted due to proding failure of PF %u",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function,
+			eth_da.ports[pf_idx]);
+		mlx5_net_remove(&pci_dev->device);
+		break;
+	}
+	return rc;
+}
+
+/*
+ * Probe a single SF device on auxiliary bus, no representor support.
+ *
+ * Returns 0 on success, a negative value otherwise.
+ */
+static int
+mlx5_os_auxiliary_probe(struct rte_device *dev)
+{
+	struct rte_auxiliary_device *adev = RTE_DEV_TO_AUXILIARY(dev);
+	struct rte_eth_devargs eth_da = { .nb_ports = 0 };
+	/* Single port device, not part of any bonding. */
+	struct mlx5_dev_spawn_data spawn = {
+		.pf_bond = -1,
+		.max_port = 1,
+		.phys_port = 1,
+	};
+	struct mlx5_dev_config config;
+	struct rte_eth_dev *eth_dev;
+	int ifindex;
+	int rc;
+
+	/* Parse ethdev devargs. */
+	rc = mlx5_os_parse_eth_devargs(dev, &eth_da);
+	if (rc != 0)
+		return rc;
+	/* Default configuration, flagged as sub-function. */
+	mlx5_os_config_default(&config);
+	config.sf = 1;
+	/* Complete the spawn data. */
+	spawn.phys_dev = mlx5_os_get_ibv_dev(dev);
+	if (spawn.phys_dev == NULL)
+		return -rte_errno;
+	ifindex = mlx5_auxiliary_get_ifindex(dev->name);
+	if (ifindex < 0) {
+		DRV_LOG(ERR, "failed to get ethdev ifindex: %s", dev->name);
+		return ifindex;
+	}
+	spawn.ifindex = ifindex;
+	spawn.numa_node = dev->numa_node;
+	/* Spawn device. */
+	eth_dev = mlx5_dev_spawn(dev, &spawn, &config, &eth_da);
+	if (eth_dev == NULL)
+		return -rte_errno;
+	/* Post create. */
+	eth_dev->intr_handle = &adev->intr_handle;
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC |
+					    RTE_ETH_DEV_INTR_RMV;
+		eth_dev->data->numa_node = dev->numa_node;
+	}
+	rte_eth_dev_probing_finish(eth_dev);
+	return 0;
+}
+
+/**
+ * Net class driver callback to probe a device.
+ *
+ * This function probes PCI bus device(s) or a single SF on auxiliary bus,
+ * after the PMD global data has been initialized. In the primary process
+ * the PMD socket is initialized first.
+ *
+ * @param[in] dev
+ *   Pointer to the generic device.
+ *
+ * @return
+ *   0 on success, a negative value otherwise (-rte_errno when the PMD
+ *   global data cannot be initialized, else the bus probe result).
+ */
+int
+mlx5_os_net_probe(struct rte_device *dev)
+{
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+		mlx5_pmd_socket_init();
+	if (mlx5_init_once() != 0) {
+		DRV_LOG(ERR, "unable to init PMD global data: %s",
+			strerror(rte_errno));
+		return -rte_errno;
+	}
+	/* Dispatch on the bus the device sits on. */
+	if (!mlx5_dev_is_pci(dev))
+		return mlx5_os_auxiliary_probe(dev);
+	return mlx5_os_pci_probe(RTE_DEV_TO_PCI(dev));
+}
+
+/*
+ * Set the MLX5_SHUT_UP_BF environment variable according to the
+ * requested doorbell mapping and report its previous state.
+ *
+ * Returns MLX5_ARG_UNSET if the variable was not set, 0 if it was set
+ * to "0" and 1 for any other value; the result is meant to be handed
+ * to mlx5_restore_doorbell_mapping_env().
+ */
+static int
+mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
+{
+	const char *previous;
+	const char *wanted;
+	int saved = MLX5_ARG_UNSET;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	/* Record the current state before it gets overwritten. */
+	previous = getenv(MLX5_SHUT_UP_BF);
+	if (previous != NULL)
+		saved = (strcmp(previous, "0") != 0);
+	/* Pick the value matching the configuration. */
+	if (config->dbnc == MLX5_ARG_UNSET)
+		wanted = MLX5_SHUT_UP_BF_DEFAULT;
+	else if (config->dbnc == MLX5_TXDB_NCACHED)
+		wanted = "1";
+	else
+		wanted = "0";
+	setenv(MLX5_SHUT_UP_BF, wanted, 1);
+	return saved;
+}
+
+/*
+ * Put the MLX5_SHUT_UP_BF environment variable back to a saved state:
+ * MLX5_ARG_UNSET removes the variable, zero sets it to "0" and any
+ * other value sets it to "1".
+ */
+static void
+mlx5_restore_doorbell_mapping_env(int value)
+{
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (value == MLX5_ARG_UNSET) {
+		/* The variable did not exist before, drop it again. */
+		unsetenv(MLX5_SHUT_UP_BF);
+		return;
+	}
+	setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
+}
+
+/**
+ * Query the protection domain number (pdn) of a PD object via the DV API.
+ *
+ * @param[in] pd
+ *   Pointer to the verbs PD object.
+ * @param[out] pdn
+ *   Location receiving the PD object number; written on success only.
+ *
+ * @return
+ *   0 on success, the dv_init_obj() error value on failure, or -ENOTSUP
+ *   when built without HAVE_IBV_FLOW_DV_SUPPORT.
+ */
+int
+mlx5_os_get_pdn(void *pd, uint32_t *pdn)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	struct mlx5dv_pd pd_info;
+	struct mlx5dv_obj obj;
+	int rc;
+
+	obj.pd.in = pd;
+	obj.pd.out = &pd_info;
+	rc = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+	if (rc == 0) {
+		*pdn = pd_info.pdn;
+		return 0;
+	}
+	DRV_LOG(DEBUG, "Fail to get PD object info");
+	return rc;
+#else
+	(void)pd;
+	(void)pdn;
+	return -ENOTSUP;
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+}
+
+/**
+ * Function API to open IB device.
+ *
+ * This function calls the glue APIs to open a device: the DV open is
+ * tried first and the regular Verbs open is used as a fallback.
+ *
+ * @param[in] spawn
+ *   Pointer to the IB device attributes (name, port, etc).
+ * @param[in] config
+ *   Pointer to device configuration structure.
+ * @param[in,out] sh
+ *   Pointer to shared context structure; sh->ctx receives the opened
+ *   context and sh->devx is set to 1 when the DV open succeeds.
+ *
+ * @return
+ *   0 on success, a positive error value otherwise.
+ */
+int
+mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
+		     const struct mlx5_dev_config *config,
+		     struct mlx5_dev_ctx_shared *sh)
+{
+	int dbmap_env;
+	int err = 0;
+
+	pthread_mutex_init(&sh->txpp.mutex, NULL);
+	/*
+	 * Configure the environment variable named by MLX5_SHUT_UP_BF
+	 * before the device creation. The rdma_core library
+	 * checks the variable at device creation and
+	 * stores the result internally.
+	 */
+	dbmap_env = mlx5_config_doorbell_mapping_env(config);
+	/* Try to open IB device with DV first, then usual Verbs. */
+	errno = 0;
+	sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
+	if (sh->ctx) {
+		sh->devx = 1;
+		DRV_LOG(DEBUG, "DevX is supported");
+		/* The device is created, no need for environment. */
+		mlx5_restore_doorbell_mapping_env(dbmap_env);
+	} else {
+		/* The environment variable is still configured. */
+		sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
+		/* Sample errno before the restore call below can change it. */
+		err = errno ? errno : ENODEV;
+		/*
+		 * The environment variable is not needed anymore,
+		 * all device creation attempts are completed.
+		 */
+		mlx5_restore_doorbell_mapping_env(dbmap_env);
+		if (!sh->ctx)
+			return err;
+		DRV_LOG(DEBUG, "DevX is NOT supported");
+		/* The fallback open succeeded, discard the sampled errno. */
+		err = 0;
+	}
+	if (!err && sh->ctx) {
+		/* Hint libmlx5 to use PMD allocator for data plane resources */
+		mlx5_glue->dv_set_context_attr(sh->ctx,
+			MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+			(void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
+				.alloc = &mlx5_alloc_verbs_buf,
+				.free = &mlx5_free_verbs_buf,
+				.data = sh,
+			}));
+	}
+	return err;
+}
+
+/**
+ * Install shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of single IB device.
+ *
+ * Failures are only logged; a handler that could not be installed is
+ * left with its fd field set to -1.
+ *
+ * @param sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ */
+void
+mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
+{
+	int ret;
+	int flags;
+
+	/* -1 marks the handler as not installed until registration succeeds. */
+	sh->intr_handle.fd = -1;
+	/* Switch the Verbs async event descriptor to non-blocking mode. */
+	/* NOTE(review): the F_GETFL result is not checked for -1 here. */
+	flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
+	ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
+		    F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(INFO, "failed to change file descriptor async event"
+			" queue");
+	} else {
+		sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
+		sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
+		if (rte_intr_callback_register(&sh->intr_handle,
+					mlx5_dev_interrupt_handler, sh)) {
+			DRV_LOG(INFO, "Fail to install the shared interrupt.");
+			sh->intr_handle.fd = -1;
+		}
+	}
+	if (sh->devx) {
+#ifdef HAVE_IBV_DEVX_ASYNC
+		/* Same sequence for the DevX command completion descriptor. */
+		sh->intr_handle_devx.fd = -1;
+		sh->devx_comp =
+			(void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
+		struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
+		if (!devx_comp) {
+			DRV_LOG(INFO, "failed to allocate devx_comp.");
+			return;
+		}
+		flags = fcntl(devx_comp->fd, F_GETFL);
+		ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
+		if (ret) {
+			DRV_LOG(INFO, "failed to change file descriptor"
+				" devx comp");
+			/* sh->devx_comp stays allocated on this path. */
+			return;
+		}
+		sh->intr_handle_devx.fd = devx_comp->fd;
+		sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
+		if (rte_intr_callback_register(&sh->intr_handle_devx,
+					mlx5_dev_interrupt_handler_devx, sh)) {
+			DRV_LOG(INFO, "Fail to install the devx shared"
+				" interrupt.");
+			sh->intr_handle_devx.fd = -1;
+		}
+#endif /* HAVE_IBV_DEVX_ASYNC */
+	}
+}
+
+/**
+ * Uninstall shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of single IB device.
+ *
+ * A handler is unregistered only when its fd field is non-negative.
+ *
+ * @param sh
+ *   Pointer to mlx5_dev_ctx_shared object.
+ */
+void
+mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
+{
+	if (sh->intr_handle.fd >= 0)
+		mlx5_intr_callback_unregister(&sh->intr_handle,
+					      mlx5_dev_interrupt_handler, sh);
+#ifdef HAVE_IBV_DEVX_ASYNC
+	/*
+	 * NOTE(review): this path calls rte_intr_callback_unregister()
+	 * directly while the one above goes through the mlx5 wrapper;
+	 * confirm the difference is intended.
+	 */
+	if (sh->intr_handle_devx.fd >= 0)
+		rte_intr_callback_unregister(&sh->intr_handle_devx,
+				  mlx5_dev_interrupt_handler_devx, sh);
+	/* The completion object is destroyed; the pointer is not reset. */
+	if (sh->devx_comp)
+		mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
+#endif
+}
+
+/**
+ * Read statistics by a named counter.
+ *
+ * The "out_of_buffer" counter is queried through the DevX queue counter
+ * object when one exists. Any other counter is read as a decimal string
+ * from <ibdev_path>/ports/<dev_port>/hw_counters/<ctr_name>, falling back
+ * to <ibdev_path>/hw_counters/<ctr_name>.
+ *
+ * @param[in] priv
+ *   Pointer to the private device data structure.
+ * @param[in] ctr_name
+ *   Pointer to the name of the statistic counter to read
+ * @param[out] stat
+ *   Pointer to read statistic value.
+ * @return
+ *   0 on success and stat is valid. For the DevX path the query result is
+ *   returned as is and stat is left untouched on failure. For the file
+ *   path 1 is returned on failure and stat is set to 0.
+ */
+int
+mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
+		      uint64_t *stat)
+{
+	int fd;
+
+	if (priv->sh) {
+		if (priv->q_counters != NULL &&
+		    strcmp(ctr_name, "out_of_buffer") == 0) {
+			/*
+			 * The query fills a 32-bit value. Use a properly typed
+			 * temporary instead of aliasing the 64-bit output, so
+			 * that all 64 bits of *stat are defined regardless of
+			 * the byte order.
+			 */
+			uint32_t val32 = 0;
+			int ret = mlx5_devx_cmd_queue_counter_query
+					(priv->q_counters, 0, &val32);
+
+			if (ret == 0)
+				*stat = val32;
+			return ret;
+		}
+		MKSTR(path, "%s/ports/%d/hw_counters/%s",
+		      priv->sh->ibdev_path,
+		      priv->dev_port,
+		      ctr_name);
+		fd = open(path, O_RDONLY);
+		/*
+		 * in switchdev the file location is not per port
+		 * but rather in <ibdev_path>/hw_counters/<file_name>.
+		 */
+		if (fd == -1) {
+			MKSTR(path1, "%s/hw_counters/%s",
+			      priv->sh->ibdev_path,
+			      ctr_name);
+			fd = open(path1, O_RDONLY);
+		}
+		if (fd != -1) {
+			/*
+			 * Room for the 20 digits of UINT64_MAX plus one byte
+			 * that read() never touches, so that buf is always
+			 * NUL-terminated for strtoull().
+			 */
+			char buf[21] = {'\0'};
+			ssize_t n = read(fd, buf, sizeof(buf) - 1);
+
+			close(fd);
+			if (n != -1) {
+				*stat = strtoull(buf, NULL, 10);
+				return 0;
+			}
+		}
+	}
+	*stat = 0;
+	return 1;
+}
+
+/**
+ * Set the reg_mr and dereg_mr callbacks to the entries of
+ * mlx5_mr_verbs_ops.
+ *
+ * @param[out] reg_mr_cb
+ *   Pointer to reg_mr func
+ * @param[out] dereg_mr_cb
+ *   Pointer to dereg_mr func
+ */
+void
+mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
+		      mlx5_dereg_mr_t *dereg_mr_cb)
+{
+	*reg_mr_cb = mlx5_mr_verbs_ops.reg_mr;
+	*dereg_mr_cb = mlx5_mr_verbs_ops.dereg_mr;
+}
+
+/**
+ * Remove a MAC address from the device.
+ *
+ * The Netlink request is issued only when the device is configured as a
+ * VF; otherwise the function does nothing.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param index
+ *   MAC address index.
+ */
+void
+mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (!priv->config.vf)
+		return;
+	mlx5_nl_mac_addr_remove(priv->nl_socket_route,
+				mlx5_ifindex(dev), priv->mac_own,
+				&dev->data->mac_addrs[index], index);
+}
+
+/**
+ * Add a MAC address to the device.
+ *
+ * The Netlink request is issued only when the device is configured as a
+ * VF; otherwise the function succeeds without doing anything.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mac
+ *   MAC address to register.
+ * @param index
+ *   MAC address index.
+ *
+ * @return
+ *   0 when the device is not a VF, otherwise the value returned by
+ *   mlx5_nl_mac_addr_add().
+ */
+int
+mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
+		     uint32_t index)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (!priv->config.vf)
+		return 0;
+	return mlx5_nl_mac_addr_add(priv->nl_socket_route,
+				    mlx5_ifindex(dev), priv->mac_own,
+				    mac, index);
+}
+
+/**
+ * Modify a VF MAC address
+ *
+ * Thin wrapper that forwards the request to mlx5_nl_vf_mac_addr_modify()
+ * using the route Netlink socket stored in priv.
+ *
+ * @param priv
+ *   Pointer to device private data.
+ * @param iface_idx
+ *   Net device interface index
+ * @param mac_addr
+ *   MAC address to modify into.
+ * @param vf_index
+ *   VF index
+ *
+ * @return
+ *   Value returned by mlx5_nl_vf_mac_addr_modify() (documented here as
+ *   0 on success, a negative errno value otherwise).
+ */
+int
+mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
+			   unsigned int iface_idx,
+			   struct rte_ether_addr *mac_addr,
+			   int vf_index)
+{
+	return mlx5_nl_vf_mac_addr_modify
+		(priv->nl_socket_route, iface_idx, mac_addr, vf_index);
+}
+
+/**
+ * Enable or disable promiscuous mode on the device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param enable
+ *   0 - promiscuous is disabled, otherwise - enabled
+ *
+ * @return
+ *   Value returned by mlx5_nl_promisc().
+ */
+int
+mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	/* Normalize any non-zero request to exactly 1. */
+	const int on = !!enable;
+
+	return mlx5_nl_promisc(priv->nl_socket_route, mlx5_ifindex(dev), on);
+}
+
+/**
+ * Enable or disable all-multicast mode on the device.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param enable
+ *   0 - all multicast is disabled, otherwise - enabled
+ *
+ * @return
+ *   Value returned by mlx5_nl_allmulti().
+ */
+int
+mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	/* Normalize any non-zero request to exactly 1. */
+	const int on = !!enable;
+
+	return mlx5_nl_allmulti(priv->nl_socket_route, mlx5_ifindex(dev), on);
+}
+
+/**
+ * Flush device MAC addresses
+ *
+ * Forwards the device MAC address array (MLX5_MAX_MAC_ADDRESSES entries)
+ * and the priv->mac_own bookkeeping to mlx5_nl_mac_addr_flush().
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ */
+void
+mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
+			       dev->data->mac_addrs,
+			       MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_os.h b/drivers/net/mlx5/freebsd/mlx5_os.h
new file mode 100644
index 0000000000..2991d37df2
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_os.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_OS_H_
+#define RTE_PMD_MLX5_OS_H_
+
+#include <net/if.h>
+
+/* Verbs enumeration values translated to local enums. */
+enum {
+	/* Each local limit is the Verbs sysfs limit plus one extra byte. */
+	MLX5_FS_NAME_MAX = IBV_SYSFS_NAME_MAX + 1,
+	MLX5_FS_PATH_MAX = IBV_SYSFS_PATH_MAX + 1
+};
+
+/* Maximal data of sendmsg message(in bytes). */
+#define MLX5_SENDMSG_MAX 64
+
+#define MLX5_NAMESIZE IF_NAMESIZE
+
+int mlx5_auxiliary_get_ifindex(const char *sf_name);
+
+#endif /* RTE_PMD_MLX5_OS_H_ */
diff --git a/drivers/net/mlx5/freebsd/mlx5_socket.c b/drivers/net/mlx5/freebsd/mlx5_socket.c
new file mode 100644
index 0000000000..6356b66dc4
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_socket.c
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "rte_eal.h"
+#include "mlx5_utils.h"
+#include "mlx5.h"
+
+/* PMD socket service for tools. */
+
+#define MLX5_SOCKET_PATH "/var/tmp/dpdk_net_mlx5_%d"
+
+/* Unix socket for primary process; 0 is used as the "not opened" value. */
+int server_socket;
+struct rte_intr_handle server_intr_handle; /* Interrupt handler. */
+
+/**
+ * Handle server pmd socket interrupts.
+ *
+ * Accepts one client connection, receives a flow dump request together
+ * with a file descriptor passed as SCM_RIGHTS ancillary data, dumps the
+ * requested flow(s) of the requested port into that descriptor and sends
+ * back an acknowledgment carrying the dump result code.
+ *
+ * @param cb
+ *   Unused callback argument.
+ */
+static void
+mlx5_pmd_socket_handle(void *cb __rte_unused)
+{
+	int conn_sock;
+	int ret;
+	struct cmsghdr *cmsg = NULL;
+	uint32_t data[MLX5_SENDMSG_MAX / sizeof(uint32_t)];
+	uint64_t flow_ptr = 0;
+	uint8_t  buf[CMSG_SPACE(sizeof(int))] = { 0 };
+	struct iovec io = {
+		.iov_base = data,
+		.iov_len = sizeof(data),
+	};
+	struct msghdr msg = {
+		.msg_iov = &io,
+		.msg_iovlen = 1,
+		.msg_control = buf,
+		.msg_controllen = sizeof(buf),
+	};
+
+	uint32_t port_id;
+	int fd;
+	FILE *file = NULL;
+	struct rte_eth_dev *dev;
+	struct rte_flow_error err;
+	struct mlx5_flow_dump_req  *dump_req;
+	struct mlx5_flow_dump_ack  *dump_ack;
+
+	memset(data, 0, sizeof(data));
+	/* Accept the connection from the client. */
+	conn_sock = accept(server_socket, NULL, NULL);
+	if (conn_sock < 0) {
+		DRV_LOG(WARNING, "connection failed: %s", strerror(errno));
+		return;
+	}
+	ret = recvmsg(conn_sock, &msg, MSG_WAITALL);
+	if (ret != sizeof(struct mlx5_flow_dump_req)) {
+		DRV_LOG(WARNING, "wrong message received: %s",
+			strerror(errno));
+		goto error;
+	}
+
+	/*
+	 * Receive file descriptor. The control message must be a
+	 * SOL_SOCKET/SCM_RIGHTS one that is large enough to carry one int
+	 * after its header.
+	 */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	if (cmsg == NULL || cmsg->cmsg_level != SOL_SOCKET ||
+	    cmsg->cmsg_type != SCM_RIGHTS ||
+	    cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
+		DRV_LOG(WARNING, "invalid file descriptor message");
+		goto error;
+	}
+	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+	file = fdopen(fd, "w");
+	if (!file) {
+		DRV_LOG(WARNING, "Failed to open file");
+		/* No stream owns the received descriptor, release it here. */
+		close(fd);
+		goto error;
+	}
+	/* Receive port number. */
+	if (msg.msg_iovlen != 1 || msg.msg_iov->iov_len < sizeof(uint16_t)) {
+		DRV_LOG(WARNING, "wrong port number message");
+		goto error;
+	}
+
+	dump_req = (struct mlx5_flow_dump_req *)msg.msg_iov->iov_base;
+	if (dump_req) {
+		port_id = dump_req->port_id;
+		flow_ptr = dump_req->flow_id;
+	} else {
+		DRV_LOG(WARNING, "Invalid message");
+		goto error;
+	}
+
+	/*
+	 * The port id arrives as a 32-bit value from the client while the
+	 * ethdev API works with 16-bit port ids. Reject anything that does
+	 * not fit, otherwise a truncated value could pass the validity
+	 * check and the full value would index rte_eth_devices[] out of
+	 * bounds.
+	 */
+	if (port_id > UINT16_MAX || !rte_eth_dev_is_valid_port(port_id)) {
+		DRV_LOG(WARNING, "Invalid port %u", port_id);
+		goto error;
+	}
+
+	/* Dump flow. */
+	dev = &rte_eth_devices[port_id];
+	/*
+	 * NOTE(review): flow_ptr is a client supplied value that is used as
+	 * a flow pointer without further validation at this level.
+	 */
+	if (flow_ptr == 0)
+		ret = mlx5_flow_dev_dump(dev, NULL, file, NULL);
+	else
+		ret = mlx5_flow_dev_dump(dev,
+			(struct rte_flow *)((uintptr_t)flow_ptr), file, &err);
+
+	/* Set-up the ancillary data and reply. */
+	msg.msg_controllen = 0;
+	msg.msg_control = NULL;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = &io;
+	dump_ack = (struct mlx5_flow_dump_ack *)data;
+	dump_ack->rc = -ret;
+	io.iov_len = sizeof(struct mlx5_flow_dump_ack);
+	io.iov_base = dump_ack;
+	/* Retry the reply when interrupted by a signal. */
+	do {
+		ret = sendmsg(conn_sock, &msg, 0);
+	} while (ret < 0 && errno == EINTR);
+	if (ret < 0)
+		DRV_LOG(WARNING, "failed to send response %s",
+			strerror(errno));
+error:
+	if (conn_sock >= 0)
+		close(conn_sock);
+	/* Closing the stream also closes the received descriptor. */
+	if (file)
+		fclose(file);
+}
+
+/**
+ * Install interrupt handler.
+ *
+ * Binds server_intr_handle to the server socket descriptor and registers
+ * mlx5_pmd_socket_handle() as its callback. Takes no arguments; it
+ * operates on the file-scope server_socket and server_intr_handle.
+ *
+ * @return
+ *   Value returned by rte_intr_callback_register() (documented here as
+ *   0 on success, a negative errno value otherwise).
+ */
+static int
+mlx5_pmd_interrupt_handler_install(void)
+{
+	MLX5_ASSERT(server_socket);
+	server_intr_handle.fd = server_socket;
+	server_intr_handle.type = RTE_INTR_HANDLE_EXT;
+	return rte_intr_callback_register(&server_intr_handle,
+					  mlx5_pmd_socket_handle, NULL);
+}
+
+/**
+ * Uninstall interrupt handler.
+ *
+ * Unregisters the socket callback when the server socket is open and
+ * resets server_intr_handle in every case.
+ */
+static void
+mlx5_pmd_interrupt_handler_uninstall(void)
+{
+	if (server_socket)
+		mlx5_intr_callback_unregister(&server_intr_handle,
+					      mlx5_pmd_socket_handle,
+					      NULL);
+	server_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+	server_intr_handle.fd = 0;
+}
+
+/**
+ * Initialise the socket to communicate with the secondary process
+ *
+ * Creates a non-blocking Unix stream socket bound to a per-process path
+ * (MLX5_SOCKET_PATH formatted with the PID), starts listening on it and
+ * installs the interrupt handler that serves incoming connections.
+ * Does nothing when the socket is already open.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise.
+ */
+int
+mlx5_pmd_socket_init(void)
+{
+	struct sockaddr_un sun = {
+		.sun_family = AF_UNIX,
+	};
+	int ret;
+	int flags;
+	int err;
+
+	MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+	if (server_socket)
+		return 0;
+	/*
+	 * Initialize the socket to communicate with the secondary
+	 * process.
+	 */
+	ret = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (ret < 0) {
+		/* Save errno right away, later calls may overwrite it. */
+		err = errno;
+		DRV_LOG(WARNING, "Failed to open mlx5 socket: %s",
+			strerror(err));
+		/* Nothing was opened: do not close descriptor 0. */
+		goto error;
+	}
+	server_socket = ret;
+	flags = fcntl(server_socket, F_GETFL, 0);
+	if (flags == -1) {
+		err = errno;
+		goto close_sock;
+	}
+	ret = fcntl(server_socket, F_SETFL, flags | O_NONBLOCK);
+	if (ret < 0) {
+		err = errno;
+		goto close_sock;
+	}
+	snprintf(sun.sun_path, sizeof(sun.sun_path), MLX5_SOCKET_PATH,
+		 getpid());
+	/* Drop a stale socket file possibly left by a previous run. */
+	remove(sun.sun_path);
+	ret = bind(server_socket, (const struct sockaddr *)&sun, sizeof(sun));
+	if (ret < 0) {
+		err = errno;
+		DRV_LOG(WARNING,
+			"cannot bind mlx5 socket: %s", strerror(err));
+		goto remove_path;
+	}
+	ret = listen(server_socket, 0);
+	if (ret < 0) {
+		err = errno;
+		DRV_LOG(WARNING, "cannot listen on mlx5 socket: %s",
+			strerror(err));
+		goto remove_path;
+	}
+	ret = mlx5_pmd_interrupt_handler_install();
+	if (ret) {
+		/* The failure is reported by return value, not by errno. */
+		err = ret < 0 ? -ret : EINVAL;
+		DRV_LOG(WARNING, "cannot register interrupt handler for mlx5 socket: %s",
+			strerror(err));
+		goto remove_path;
+	}
+	return 0;
+remove_path:
+	remove(sun.sun_path);
+close_sock:
+	claim_zero(close(server_socket));
+	server_socket = 0;
+error:
+	DRV_LOG(ERR, "Cannot initialize socket: %s", strerror(err));
+	return -err;
+}
+
+/**
+ * Un-Initialize the pmd socket
+ *
+ * Destructor: when the server socket is open, removes the interrupt
+ * handler, closes the socket and deletes its per-process socket file.
+ */
+RTE_FINI(mlx5_pmd_socket_uninit)
+{
+	if (server_socket) {
+		mlx5_pmd_interrupt_handler_uninstall();
+		claim_zero(close(server_socket));
+		server_socket = 0;
+		MKSTR(path, MLX5_SOCKET_PATH, getpid());
+		claim_zero(remove(path));
+	}
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_verbs.c b/drivers/net/mlx5/freebsd/mlx5_verbs.c
new file mode 100644
index 0000000000..d4fa202ac4
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_verbs.c
@@ -0,0 +1,1208 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/queue.h>
+
+#include "mlx5_autoconf.h"
+
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_common.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_verbs.h>
+#include <mlx5_rx.h>
+#include <mlx5_tx.h>
+#include <mlx5_utils.h>
+#include <mlx5_malloc.h>
+
+/**
+ * Register mr. Given protection domain pointer, pointer to addr and length
+ * register the memory region.
+ *
+ * Thin wrapper around mlx5_common_verbs_reg_mr().
+ *
+ * @param[in] pd
+ *   Pointer to protection domain context.
+ * @param[in] addr
+ *   Pointer to memory start address.
+ * @param[in] length
+ *   Length of the memory to register.
+ * @param[out] pmd_mr
+ *   pmd_mr struct set with lkey, address, length and pointer to mr object
+ *
+ * @return
+ *   Value returned by mlx5_common_verbs_reg_mr() (documented here as
+ *   0 on successful registration, -1 otherwise).
+ */
+static int
+mlx5_reg_mr(void *pd, void *addr, size_t length,
+		 struct mlx5_pmd_mr *pmd_mr)
+{
+	return mlx5_common_verbs_reg_mr(pd, addr, length, pmd_mr);
+}
+
+/**
+ * Deregister mr. Given the mlx5 pmd MR - deregister the MR
+ *
+ * Thin wrapper around mlx5_common_verbs_dereg_mr().
+ *
+ * @param[in] pmd_mr
+ *   pmd_mr struct set with lkey, address, length and pointer to mr object
+ */
+static void
+mlx5_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
+{
+	mlx5_common_verbs_dereg_mr(pmd_mr);
+}
+
+/* Verbs MR operations: the register/deregister wrappers of this file. */
+const struct mlx5_mr_ops mlx5_mr_verbs_ops = {
+	.reg_mr = mlx5_reg_mr,
+	.dereg_mr = mlx5_dereg_mr,
+};
+
+/**
+ * Turn the C-VLAN stripping offload of an Rx WQ on or off.
+ *
+ * @param rxq_obj
+ *   Rx queue object.
+ * @param on
+ *   Non-zero to enable VLAN stripping, 0 to disable it.
+ *
+ * @return 0 on success, non-0 otherwise
+ */
+static int
+mlx5_rxq_obj_modify_wq_vlan_strip(struct mlx5_rxq_obj *rxq_obj, int on)
+{
+	/* Only the C-VLAN stripping flag is selected by the mask. */
+	struct ibv_wq_attr attr = {
+		.attr_mask = IBV_WQ_ATTR_FLAGS,
+		.flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING,
+		.flags = on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0,
+	};
+
+	return mlx5_glue->modify_wq(rxq_obj->wq, &attr);
+}
+
+/**
+ * Modifies the attributes for the specified WQ.
+ *
+ * Only the WQ state attribute is changed.
+ *
+ * @param rxq_obj
+ *   Verbs Rx queue object.
+ * @param type
+ *   Type of change queue state; it is cast to enum ibv_wq_state.
+ *
+ * @return
+ *   Value returned by the glue modify_wq() call, 0 on success.
+ *   NOTE(review): rte_errno is not set by this function itself.
+ */
+static int
+mlx5_ibv_modify_wq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
+{
+	struct ibv_wq_attr mod = {
+		.attr_mask = IBV_WQ_ATTR_STATE,
+		.wq_state = (enum ibv_wq_state)type,
+	};
+
+	return mlx5_glue->modify_wq(rxq_obj->wq, &mod);
+}
+
+/**
+ * Modify QP using Verbs API.
+ *
+ * Unless the request is MLX5_TXQ_MOD_RST2RDY the QP is first moved to
+ * RESET; MLX5_TXQ_MOD_RDY2RST stops there. Otherwise the QP is walked
+ * through INIT, RTR and RTS.
+ *
+ * @param obj
+ *   Verbs Tx queue object.
+ * @param type
+ *   Type of change queue state.
+ * @param dev_port
+ *   IB device port number.
+ *
+ * @return
+ *   0 on success; otherwise the non-zero value returned by the failing
+ *   glue modify_qp() call, with rte_errno set from errno.
+ */
+static int
+mlx5_ibv_modify_qp(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+		   uint8_t dev_port)
+{
+	struct ibv_qp_attr mod = {
+		.qp_state = IBV_QPS_RESET,
+		.port_num = dev_port,
+	};
+	/* Mask used for the INIT transition; narrowed for ERR2RDY below. */
+	int attr_mask = (IBV_QP_STATE | IBV_QP_PORT);
+	int ret;
+
+	if (type != MLX5_TXQ_MOD_RST2RDY) {
+		ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+		if (ret) {
+			DRV_LOG(ERR, "Cannot change Tx QP state to RESET %s",
+				strerror(errno));
+			rte_errno = errno;
+			return ret;
+		}
+		if (type == MLX5_TXQ_MOD_RDY2RST)
+			return 0;
+	}
+	/* For ERR2RDY the port attribute is left out of the INIT request. */
+	if (type == MLX5_TXQ_MOD_ERR2RDY)
+		attr_mask = IBV_QP_STATE;
+	mod.qp_state = IBV_QPS_INIT;
+	ret = mlx5_glue->modify_qp(obj->qp, &mod, attr_mask);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	mod.qp_state = IBV_QPS_RTR;
+	ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	mod.qp_state = IBV_QPS_RTS;
+	ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+	if (ret) {
+		DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
+			strerror(errno));
+		rte_errno = errno;
+		return ret;
+	}
+	return 0;
+}
+
+/**
+ * Create a CQ Verbs object.
+ *
+ * Builds the Verbs and mlx5dv CQ attributes for the Rx queue (CQE count,
+ * completion channel, optional CQE compression and 128B padding) and
+ * creates the CQ through the glue layer. As a side effect the queue's
+ * byte_mask and mcqe_format are set when CQE compression is enabled.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ *
+ * @return
+ *   The Verbs CQ object initialized, NULL otherwise.
+ *   NOTE(review): rte_errno is not set by this function itself; the
+ *   caller assigns it on failure.
+ */
+static struct ibv_cq *
+mlx5_rxq_ibv_cq_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_rxq_obj *rxq_obj = rxq_ctrl->obj;
+	unsigned int cqe_n = mlx5_rxq_cqe_num(rxq_data);
+	struct {
+		struct ibv_cq_init_attr_ex ibv;
+		struct mlx5dv_cq_init_attr mlx5;
+	} cq_attr;
+
+	cq_attr.ibv = (struct ibv_cq_init_attr_ex){
+		.cqe = cqe_n,
+		.channel = rxq_obj->ibv_channel,
+		.comp_mask = 0,
+	};
+	cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
+		.comp_mask = 0,
+	};
+	/* CQE compression is used only when HW timestamping is off. */
+	if (priv->config.cqe_comp && !rxq_data->hw_timestamp) {
+		cq_attr.mlx5.comp_mask |=
+				MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
+		rxq_data->byte_mask = UINT32_MAX;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+		/* Multi-packet RQ uses the CSUM_STRIDX format, else HASH. */
+		if (mlx5_rxq_mprq_enabled(rxq_data)) {
+			cq_attr.mlx5.cqe_comp_res_format =
+					MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX;
+			rxq_data->mcqe_format =
+					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+		} else {
+			cq_attr.mlx5.cqe_comp_res_format =
+					MLX5DV_CQE_RES_FORMAT_HASH;
+			rxq_data->mcqe_format =
+					MLX5_CQE_RESP_FORMAT_HASH;
+		}
+#else
+		cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
+		rxq_data->mcqe_format = MLX5_CQE_RESP_FORMAT_HASH;
+#endif
+		/*
+		 * For vectorized Rx, it must not be doubled in order to
+		 * make cq_ci and rq_ci aligned.
+		 */
+		if (mlx5_rxq_check_vec_support(rxq_data) < 0)
+			cq_attr.ibv.cqe *= 2;
+	} else if (priv->config.cqe_comp && rxq_data->hw_timestamp) {
+		DRV_LOG(DEBUG,
+			"Port %u Rx CQE compression is disabled for HW"
+			" timestamp.",
+			dev->data->port_id);
+	}
+#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
+	/* Request CQE padding when the cache line is 128 bytes. */
+	if (RTE_CACHE_LINE_SIZE == 128) {
+		cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS;
+		cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD;
+	}
+#endif
+	return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx,
+							      &cq_attr.ibv,
+							      &cq_attr.mlx5));
+}
+
+/**
+ * Create a WQ Verbs object.
+ *
+ * The created WQ is stored in the Rx queue object (rxq_obj->wq) as well
+ * as returned. If the attributes reported back after creation differ
+ * from the requested WR/SGE numbers, the WQ is destroyed again.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ *
+ * @return
+ *   The Verbs WQ object initialized, NULL otherwise. rte_errno is set to
+ *   EINVAL by this function only for the WR/SGE mismatch case.
+ */
+static struct ibv_wq *
+mlx5_rxq_ibv_wq_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_rxq_obj *rxq_obj = rxq_ctrl->obj;
+	unsigned int wqe_n = 1 << rxq_data->elts_n;
+	struct {
+		struct ibv_wq_init_attr ibv;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+		struct mlx5dv_wq_init_attr mlx5;
+#endif
+	} wq_attr;
+
+	/*
+	 * NOTE(review): comp_mask below is initialized with a create-flag
+	 * constant (IBV_WQ_FLAGS_CVLAN_STRIPPING) while the later updates
+	 * use IBV_WQ_INIT_ATTR_FLAGS - confirm this is intended.
+	 */
+	wq_attr.ibv = (struct ibv_wq_init_attr){
+		.wq_context = NULL, /* Could be useful in the future. */
+		.wq_type = IBV_WQT_RQ,
+		/* Max number of outstanding WRs. */
+		.max_wr = wqe_n >> rxq_data->sges_n,
+		/* Max number of scatter/gather elements in a WR. */
+		.max_sge = 1 << rxq_data->sges_n,
+		.pd = priv->sh->pd,
+		.cq = rxq_obj->ibv_cq,
+		.comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0,
+		.create_flags = (rxq_data->vlan_strip ?
+				 IBV_WQ_FLAGS_CVLAN_STRIPPING : 0),
+	};
+	/* By default, FCS (CRC) is stripped by hardware. */
+	if (rxq_data->crc_present) {
+		wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+	}
+	/* The padding flag name depends on the rdma-core version in use. */
+	if (priv->config.hw_padding) {
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+		wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
+		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+		wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING;
+		wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#endif
+	}
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+	wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){
+		.comp_mask = 0,
+	};
+	/* Add the striding RQ attributes when multi-packet RQ is enabled. */
+	if (mlx5_rxq_mprq_enabled(rxq_data)) {
+		struct mlx5dv_striding_rq_init_attr *mprq_attr =
+						&wq_attr.mlx5.striding_rq_attrs;
+
+		wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
+		*mprq_attr = (struct mlx5dv_striding_rq_init_attr){
+			.single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
+			.single_wqe_log_num_of_strides = rxq_data->strd_num_n,
+			.two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
+		};
+	}
+	rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv,
+					      &wq_attr.mlx5);
+#else
+	rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv);
+#endif
+	if (rxq_obj->wq) {
+		/*
+		 * Make sure number of WRs*SGEs match expectations since a queue
+		 * cannot allocate more than "desc" buffers.
+		 */
+		if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
+		    wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) {
+			DRV_LOG(ERR,
+				"Port %u Rx queue %u requested %u*%u but got"
+				" %u*%u WRs*SGEs.",
+				dev->data->port_id, idx,
+				wqe_n >> rxq_data->sges_n,
+				(1 << rxq_data->sges_n),
+				wq_attr.ibv.max_wr, wq_attr.ibv.max_sge);
+			claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+			rxq_obj->wq = NULL;
+			rte_errno = EINVAL;
+		}
+	}
+	return rxq_obj->wq;
+}
+
+/**
+ * Create the Rx queue Verbs object.
+ *
+ * Creates, in order, the optional completion channel (only when the
+ * queue uses interrupts), the CQ and the WQ, moves the WQ to the ready
+ * state and fills the Rx queue data with the ring information reported
+ * by the DV API. On failure the objects created so far are destroyed.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Rx queue array.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rxq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_rxq_obj *tmpl = rxq_ctrl->obj;
+	struct mlx5dv_cq cq_info;
+	struct mlx5dv_rwq rwq;
+	int ret = 0;
+	struct mlx5dv_obj obj;
+
+	MLX5_ASSERT(rxq_data);
+	MLX5_ASSERT(tmpl);
+	tmpl->rxq_ctrl = rxq_ctrl;
+	/* A completion channel is needed only for interrupt-driven queues. */
+	if (rxq_ctrl->irq) {
+		tmpl->ibv_channel =
+				mlx5_glue->create_comp_channel(priv->sh->ctx);
+		if (!tmpl->ibv_channel) {
+			DRV_LOG(ERR, "Port %u: comp channel creation failure.",
+				dev->data->port_id);
+			rte_errno = ENOMEM;
+			goto error;
+		}
+		tmpl->fd = ((struct ibv_comp_channel *)(tmpl->ibv_channel))->fd;
+	}
+	/* Create CQ using Verbs API. */
+	tmpl->ibv_cq = mlx5_rxq_ibv_cq_create(dev, idx);
+	if (!tmpl->ibv_cq) {
+		DRV_LOG(ERR, "Port %u Rx queue %u CQ creation failure.",
+			dev->data->port_id, idx);
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	/* Retrieve the CQ layout (buffer, doorbell, size) through DV. */
+	obj.cq.in = tmpl->ibv_cq;
+	obj.cq.out = &cq_info;
+	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ);
+	if (ret) {
+		rte_errno = ret;
+		goto error;
+	}
+	/* The data path requires the CQE size to match the cache line. */
+	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+		DRV_LOG(ERR,
+			"Port %u wrong MLX5_CQE_SIZE environment "
+			"variable value: it should be set to %u.",
+			dev->data->port_id, RTE_CACHE_LINE_SIZE);
+		rte_errno = EINVAL;
+		goto error;
+	}
+	/* Fill the rings. */
+	rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	rxq_data->cq_db = cq_info.dbrec;
+	rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+	rxq_data->cq_uar = cq_info.cq_uar;
+	rxq_data->cqn = cq_info.cqn;
+	/* Create WQ (RQ) using Verbs API. */
+	tmpl->wq = mlx5_rxq_ibv_wq_create(dev, idx);
+	if (!tmpl->wq) {
+		DRV_LOG(ERR, "Port %u Rx queue %u WQ creation failure.",
+			dev->data->port_id, idx);
+		/* This overrides any rte_errno set by the WQ creation. */
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	/* Change queue state to ready. */
+	ret = mlx5_ibv_modify_wq(tmpl, IBV_WQS_RDY);
+	if (ret) {
+		DRV_LOG(ERR,
+			"Port %u Rx queue %u WQ state to IBV_WQS_RDY failed.",
+			dev->data->port_id, idx);
+		rte_errno = ret;
+		goto error;
+	}
+	/* Retrieve the WQ layout (buffer, doorbell) through DV. */
+	obj.rwq.in = tmpl->wq;
+	obj.rwq.out = &rwq;
+	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ);
+	if (ret) {
+		rte_errno = ret;
+		goto error;
+	}
+	rxq_data->wqes = rwq.buf;
+	rxq_data->rq_db = rwq.dbrec;
+	rxq_data->cq_arm_sn = 0;
+	mlx5_rxq_initialize(rxq_data);
+	rxq_data->cq_ci = 0;
+	dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+	rxq_ctrl->wqn = ((struct ibv_wq *)(tmpl->wq))->wq_num;
+	return 0;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	/*
+	 * Destroy in reverse order of creation. The tmpl pointers are not
+	 * reset here.
+	 */
+	if (tmpl->wq)
+		claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
+	if (tmpl->ibv_cq)
+		claim_zero(mlx5_glue->destroy_cq(tmpl->ibv_cq));
+	if (tmpl->ibv_channel)
+		claim_zero(mlx5_glue->destroy_comp_channel(tmpl->ibv_channel));
+	rte_errno = ret; /* Restore rte_errno. */
+	return -rte_errno;
+}
+
+/**
+ * Release an Rx verbs queue object.
+ *
+ * Destroys the WQ, then the CQ, then the completion channel if one
+ * exists. The object itself is not freed here.
+ *
+ * @param rxq_obj
+ *   Verbs Rx queue object.
+ */
+static void
+mlx5_rxq_ibv_obj_release(struct mlx5_rxq_obj *rxq_obj)
+{
+	MLX5_ASSERT(rxq_obj);
+	MLX5_ASSERT(rxq_obj->wq);
+	MLX5_ASSERT(rxq_obj->ibv_cq);
+	claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+	claim_zero(mlx5_glue->destroy_cq(rxq_obj->ibv_cq));
+	/* The channel exists only for queues created with interrupts. */
+	if (rxq_obj->ibv_channel)
+		claim_zero(mlx5_glue->destroy_comp_channel
+							(rxq_obj->ibv_channel));
+}
+
+/**
+ * Fetch and acknowledge one completion event of an Rx verbs queue object.
+ *
+ * @param rxq_obj
+ *   Verbs Rx queue object.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set:
+ *   errno of the failed event read, or EINVAL when the event belongs to
+ *   a different CQ.
+ */
+static int
+mlx5_rx_ibv_get_event(struct mlx5_rxq_obj *rxq_obj)
+{
+	struct ibv_cq *event_cq;
+	void *event_ctx;
+
+	if (mlx5_glue->get_cq_event(rxq_obj->ibv_channel,
+				    &event_cq, &event_ctx) < 0) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	/* The event must originate from this queue's own CQ. */
+	if (event_cq != rxq_obj->ibv_cq) {
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	mlx5_glue->ack_cq_events(rxq_obj->ibv_cq, 1);
+	return 0;
+}
+
+/**
+ * Creates a receive work queue as a field of indirection table.
+ *
+ * The WQs of the listed Rx queues are placed at the start of a table of
+ * (1 << log_n) entries; the remaining entries are filled by repeating
+ * the table contents from its beginning.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param log_n
+ *   Log of number of queues in the array.
+ * @param ind_tbl
+ *   Verbs indirection table object.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_ind_table_new(struct rte_eth_dev *dev, const unsigned int log_n,
+		       struct mlx5_ind_table_obj *ind_tbl)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	/* Variable length array sized by the requested table size. */
+	struct ibv_wq *wq[1 << log_n];
+	unsigned int i, j;
+
+	MLX5_ASSERT(ind_tbl);
+	/* NOTE(review): assumes queues_n <= (1 << log_n) - confirm callers. */
+	for (i = 0; i != ind_tbl->queues_n; ++i) {
+		struct mlx5_rxq_data *rxq = (*priv->rxqs)[ind_tbl->queues[i]];
+		struct mlx5_rxq_ctrl *rxq_ctrl =
+				container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+		wq[i] = rxq_ctrl->obj->wq;
+	}
+	MLX5_ASSERT(i > 0);
+	/* Finalise indirection table. */
+	for (j = 0; i != (unsigned int)(1 << log_n); ++j, ++i)
+		wq[i] = wq[j];
+	ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table(priv->sh->ctx,
+					&(struct ibv_rwq_ind_table_init_attr){
+						.log_ind_tbl_size = log_n,
+						.ind_tbl = wq,
+						.comp_mask = 0,
+					});
+	if (!ind_tbl->ind_table) {
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	return 0;
+}
+
+/**
+ * Destroy the Verbs indirection table of an indirection table object.
+ *
+ * @param ind_tbl
+ *   Indirection table object whose Verbs table is released.
+ */
+static void
+mlx5_ibv_ind_table_destroy(struct mlx5_ind_table_obj *ind_tbl)
+{
+	struct ibv_rwq_ind_table *tbl = ind_tbl->ind_table;
+
+	claim_zero(mlx5_glue->destroy_rwq_ind_table(tbl));
+}
+
+/**
+ * Create an Rx Hash queue.
+ *
+ * A RAW_PACKET QP is created on top of the indirection table referenced by
+ * the hash Rx queue, using the Toeplitz hash with the queue's RSS key and
+ * hash fields. When HAVE_IBV_FLOW_DV_SUPPORT is defined, a flow action
+ * targeting that QP is created as well.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param hrxq
+ *   Pointer to Rx Hash queue. On success hrxq->qp holds the new QP; on
+ *   failure after the QP was created, hrxq->qp is reset to NULL.
+ * @param tunnel
+ *   Tunnel type. Only consulted when HAVE_IBV_DEVICE_TUNNEL_SUPPORT is
+ *   defined.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_hrxq_new(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq,
+		  int tunnel __rte_unused)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ibv_qp *qp = NULL;
+	struct mlx5_ind_table_obj *ind_tbl = hrxq->ind_table;
+	const uint8_t *rss_key = hrxq->rss_key;
+	uint64_t hash_fields = hrxq->hash_fields;
+	/* Attributes shared by both QP creation paths below. */
+	struct ibv_qp_init_attr_ex qp_attr = {
+		.qp_type = IBV_QPT_RAW_PACKET,
+		.comp_mask = IBV_QP_INIT_ATTR_PD |
+			     IBV_QP_INIT_ATTR_IND_TABLE |
+			     IBV_QP_INIT_ATTR_RX_HASH,
+		.rx_hash_conf = (struct ibv_rx_hash_conf){
+			.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+			.rx_hash_key_len = hrxq->rss_key_len,
+			.rx_hash_key = (void *)(uintptr_t)rss_key,
+			.rx_hash_fields_mask = hash_fields,
+		},
+		.rwq_ind_tbl = ind_tbl->ind_table,
+		.pd = priv->sh->pd,
+	};
+	int err;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+	struct mlx5dv_qp_init_attr qp_init_attr;
+
+	memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+	if (tunnel) {
+		qp_init_attr.comp_mask =
+				       MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+		qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
+	}
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	if (dev->data->dev_conf.lpbk_mode) {
+		/* Allow packet sent from NIC loop back w/o source MAC check. */
+		qp_init_attr.comp_mask |=
+				MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+		qp_init_attr.create_flags |=
+				MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
+	}
+#endif
+	qp = mlx5_glue->dv_create_qp(priv->sh->ctx, &qp_attr, &qp_init_attr);
+#else
+	qp = mlx5_glue->create_qp_ex(priv->sh->ctx, &qp_attr);
+#endif
+	if (!qp) {
+		rte_errno = errno;
+		goto error;
+	}
+	hrxq->qp = qp;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+	if (!hrxq->action) {
+		rte_errno = errno;
+		goto error;
+	}
+#endif
+	return 0;
+error:
+	err = rte_errno; /* Save rte_errno before cleanup. */
+	if (qp) {
+		claim_zero(mlx5_glue->destroy_qp(qp));
+		/* Do not leave a pointer to the destroyed QP behind. */
+		hrxq->qp = NULL;
+	}
+	rte_errno = err; /* Restore rte_errno. */
+	return -rte_errno;
+}
+
+/**
+ * Destroy the Verbs queue pair of a hash Rx queue.
+ *
+ * @param hrxq
+ *   Hash Rx queue whose QP is destroyed.
+ */
+static void
+mlx5_ibv_qp_destroy(struct mlx5_hrxq *hrxq)
+{
+	struct ibv_qp *qp = hrxq->qp;
+
+	claim_zero(mlx5_glue->destroy_qp(qp));
+}
+
+/**
+ * Destroy the drop Rx queue Verbs object of a port and free its memory.
+ *
+ * The WQ and the CQ are destroyed only if present; the port's drop queue
+ * pointer is cleared.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+static void
+mlx5_rxq_ibv_obj_drop_release(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rxq_obj *drop_rxq = priv->drop_queue.rxq;
+
+	/* Either object may be missing when creation stopped half-way. */
+	if (drop_rxq->wq != NULL)
+		claim_zero(mlx5_glue->destroy_wq(drop_rxq->wq));
+	if (drop_rxq->ibv_cq != NULL)
+		claim_zero(mlx5_glue->destroy_cq(drop_rxq->ibv_cq));
+	priv->drop_queue.rxq = NULL;
+	mlx5_free(drop_rxq);
+}
+
+/**
+ * Create a drop Rx queue Verbs object.
+ *
+ * Does nothing when the port already has a drop queue object. Otherwise a
+ * zeroed object is allocated and a single-entry CQ and a single-WR RQ are
+ * created for it. On failure everything created so far is released.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rxq_ibv_obj_drop_create(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct ibv_context *ctx = priv->sh->ctx;
+	struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq;
+	int err;
+
+	if (rxq)
+		return 0;
+	rxq = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*rxq), 0, SOCKET_ID_ANY);
+	if (!rxq) {
+		DRV_LOG(DEBUG, "Port %u cannot allocate drop Rx queue memory.",
+		      dev->data->port_id);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	/* Published now so that the error path can release it via "dev". */
+	priv->drop_queue.rxq = rxq;
+	rxq->ibv_cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
+	if (!rxq->ibv_cq) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(DEBUG, "Port %u cannot allocate CQ for drop queue.",
+		      dev->data->port_id);
+		goto error;
+	}
+	rxq->wq = mlx5_glue->create_wq(ctx, &(struct ibv_wq_init_attr){
+						    .wq_type = IBV_WQT_RQ,
+						    .max_wr = 1,
+						    .max_sge = 1,
+						    .pd = priv->sh->pd,
+						    .cq = rxq->ibv_cq,
+					      });
+	if (!rxq->wq) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(DEBUG, "Port %u cannot allocate WQ for drop queue.",
+		      dev->data->port_id);
+		goto error;
+	}
+	return 0;
+error:
+	err = rte_errno; /* Save rte_errno before cleanup. */
+	mlx5_rxq_ibv_obj_drop_release(dev);
+	rte_errno = err; /* Restore rte_errno. */
+	return -rte_errno;
+}
+
+/**
+ * Create a Verbs drop action for Rx Hash queue.
+ *
+ * Builds, in order: the drop Rx queue object, a one-entry indirection table
+ * pointing at its WQ, a RAW_PACKET hash QP on that table and, when
+ * HAVE_IBV_FLOW_DV_SUPPORT is defined, a flow action targeting the QP.
+ * On failure every object created here is destroyed and hrxq->qp is reset.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_drop_action_create(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
+	struct ibv_rwq_ind_table *ind_tbl = NULL;
+	struct mlx5_rxq_obj *rxq;
+	int ret;
+
+	MLX5_ASSERT(hrxq && hrxq->ind_table);
+	ret = mlx5_rxq_ibv_obj_drop_create(dev);
+	if (ret < 0)
+		goto error;
+	rxq = priv->drop_queue.rxq;
+	ind_tbl = mlx5_glue->create_rwq_ind_table
+				(priv->sh->ctx,
+				 &(struct ibv_rwq_ind_table_init_attr){
+					.log_ind_tbl_size = 0,
+					.ind_tbl = (struct ibv_wq **)&rxq->wq,
+					.comp_mask = 0,
+				 });
+	if (!ind_tbl) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(DEBUG, "Port %u"
+			" cannot allocate indirection table for drop queue.",
+			dev->data->port_id);
+		goto error;
+	}
+	hrxq->qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
+		 &(struct ibv_qp_init_attr_ex){
+			.qp_type = IBV_QPT_RAW_PACKET,
+			.comp_mask = IBV_QP_INIT_ATTR_PD |
+				     IBV_QP_INIT_ATTR_IND_TABLE |
+				     IBV_QP_INIT_ATTR_RX_HASH,
+			.rx_hash_conf = (struct ibv_rx_hash_conf){
+				.rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+				.rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
+				.rx_hash_key = rss_hash_default_key,
+				.rx_hash_fields_mask = 0,
+				},
+			.rwq_ind_tbl = ind_tbl,
+			.pd = priv->sh->pd
+		 });
+	if (!hrxq->qp) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(DEBUG, "Port %u cannot allocate QP for drop queue.",
+		      dev->data->port_id);
+		goto error;
+	}
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+	if (!hrxq->action) {
+		rte_errno = errno;
+		goto error;
+	}
+#endif
+	hrxq->ind_table->ind_table = ind_tbl;
+	return 0;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	if (hrxq->qp) {
+		claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+		/* Do not leave a pointer to the destroyed QP behind. */
+		hrxq->qp = NULL;
+	}
+	if (ind_tbl)
+		claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl));
+	if (priv->drop_queue.rxq)
+		mlx5_rxq_ibv_obj_drop_release(dev);
+	rte_errno = ret; /* Restore rte_errno. */
+	return -rte_errno;
+}
+
+/**
+ * Destroy the Verbs objects backing the drop hash Rx queue of a port.
+ *
+ * The flow action (only with HAVE_IBV_FLOW_DV_SUPPORT), the QP, the
+ * indirection table and finally the drop Rx queue object are released,
+ * in that order.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+static void
+mlx5_ibv_drop_action_destroy(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_hrxq *drop_hrxq = priv->drop_queue.hrxq;
+	struct ibv_rwq_ind_table *drop_tbl = drop_hrxq->ind_table->ind_table;
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	claim_zero(mlx5_glue->destroy_flow_action(drop_hrxq->action));
+#endif
+	claim_zero(mlx5_glue->destroy_qp(drop_hrxq->qp));
+	claim_zero(mlx5_glue->destroy_rwq_ind_table(drop_tbl));
+	mlx5_rxq_ibv_obj_drop_release(dev);
+}
+
+/**
+ * Create a QP Verbs object.
+ *
+ * The QP is a RAW_PACKET QP using the Tx queue object's CQ for both send
+ * and receive completions. Inline data and TSO header limits are set only
+ * when the corresponding feature is enabled on the queue.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Tx queue array.
+ *
+ * @return
+ *   The QP Verbs object, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_qp *
+mlx5_txq_ibv_qp_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+			container_of(txq_data, struct mlx5_txq_ctrl, txq);
+	struct ibv_qp *qp_obj = NULL;
+	struct ibv_qp_init_attr_ex qp_attr = { 0 };
+	const int desc = 1 << txq_data->elts_n;
+
+	MLX5_ASSERT(txq_ctrl->obj->cq);
+	/* CQ to be associated with the send queue. */
+	qp_attr.send_cq = txq_ctrl->obj->cq;
+	/* CQ to be associated with the receive queue. */
+	qp_attr.recv_cq = txq_ctrl->obj->cq;
+	/* Max number of outstanding WRs. */
+	qp_attr.cap.max_send_wr = ((priv->sh->device_attr.max_qp_wr < desc) ?
+				   priv->sh->device_attr.max_qp_wr : desc);
+	/*
+	 * Max number of scatter/gather elements in a WR, must be 1 to prevent
+	 * libmlx5 from trying to affect too much memory. TX gather is not
+	 * impacted by the device_attr.max_sge limit and will still work
+	 * properly.
+	 */
+	qp_attr.cap.max_send_sge = 1;
+	qp_attr.qp_type = IBV_QPT_RAW_PACKET;
+	/* Do *NOT* enable this, completions events are managed per Tx burst. */
+	qp_attr.sq_sig_all = 0;
+	qp_attr.pd = priv->sh->pd;
+	qp_attr.comp_mask = IBV_QP_INIT_ATTR_PD;
+	if (txq_data->inlen_send)
+		qp_attr.cap.max_inline_data = txq_ctrl->max_inline_data;
+	if (txq_data->tso_en) {
+		qp_attr.max_tso_header = txq_ctrl->max_tso_header;
+		qp_attr.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER;
+	}
+	qp_obj = mlx5_glue->create_qp_ex(priv->sh->ctx, &qp_attr);
+	if (qp_obj == NULL) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(ERR, "Port %u Tx queue %u QP creation failure.",
+			dev->data->port_id, idx);
+	}
+	return qp_obj;
+}
+
+/**
+ * Create the Tx queue Verbs object.
+ *
+ * Creates the CQ and the QP of the queue, moves the QP to the ready state,
+ * then queries the mlx5dv view of both objects to fill the data-path
+ * fields of the Tx queue (CQE/WQE rings, doorbells, UAR offset). On any
+ * failure the QP and the CQ created here are destroyed, QP first, and
+ * their pointers in the Tx queue object are cleared.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param idx
+ *   Queue index in DPDK Tx queue array.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_txq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq_data, struct mlx5_txq_ctrl, txq);
+	struct mlx5_txq_obj *txq_obj = txq_ctrl->obj;
+	unsigned int cqe_n;
+	struct mlx5dv_qp qp;
+	struct mlx5dv_cq cq_info;
+	struct mlx5dv_obj obj;
+	const int desc = 1 << txq_data->elts_n;
+	int ret = 0;
+
+	MLX5_ASSERT(txq_data);
+	MLX5_ASSERT(txq_obj);
+	txq_obj->txq_ctrl = txq_ctrl;
+	if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+		DRV_LOG(ERR, "Port %u MLX5_ENABLE_CQE_COMPRESSION "
+			"must never be set.", dev->data->port_id);
+		rte_errno = EINVAL;
+		return -rte_errno;
+	}
+	cqe_n = desc / MLX5_TX_COMP_THRESH +
+		1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+	txq_obj->cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
+	if (txq_obj->cq == NULL) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(ERR, "Port %u Tx queue %u CQ creation failure.",
+			dev->data->port_id, idx);
+		goto error;
+	}
+	txq_obj->qp = mlx5_txq_ibv_qp_create(dev, idx);
+	if (txq_obj->qp == NULL) {
+		/* rte_errno was already set by mlx5_txq_ibv_qp_create(). */
+		goto error;
+	}
+	ret = mlx5_ibv_modify_qp(txq_obj, MLX5_TXQ_MOD_RST2RDY,
+				 (uint8_t)priv->dev_port);
+	if (ret) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(ERR, "Port %u Tx queue %u QP state modifying failed.",
+			dev->data->port_id, idx);
+		goto error;
+	}
+	qp.comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	/* If using DevX, need additional mask to read tisn value. */
+	if (priv->sh->devx && !priv->sh->tdn)
+		qp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES;
+#endif
+	obj.cq.in = txq_obj->cq;
+	obj.cq.out = &cq_info;
+	obj.qp.in = txq_obj->qp;
+	obj.qp.out = &qp;
+	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP);
+	if (ret != 0) {
+		rte_errno = errno;
+		goto error;
+	}
+	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+		DRV_LOG(ERR,
+			"Port %u wrong MLX5_CQE_SIZE environment variable"
+			" value: it should be set to %u.",
+			dev->data->port_id, RTE_CACHE_LINE_SIZE);
+		rte_errno = EINVAL;
+		goto error;
+	}
+	/* Ring sizes are rounded up to a power of two; masks follow. */
+	txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+	txq_data->cqe_s = 1 << txq_data->cqe_n;
+	txq_data->cqe_m = txq_data->cqe_s - 1;
+	txq_data->qp_num_8s = ((struct ibv_qp *)txq_obj->qp)->qp_num << 8;
+	txq_data->wqes = qp.sq.buf;
+	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+	txq_data->wqe_s = 1 << txq_data->wqe_n;
+	txq_data->wqe_m = txq_data->wqe_s - 1;
+	txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
+	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
+	txq_data->cq_db = cq_info.dbrec;
+	txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
+	txq_data->cq_ci = 0;
+	txq_data->cq_pi = 0;
+	txq_data->wqe_ci = 0;
+	txq_data->wqe_pi = 0;
+	txq_data->wqe_comp = 0;
+	txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	/*
+	 * If using DevX need to query and store TIS transport domain value.
+	 * This is done once per port.
+	 * Will use this value on Rx, when creating matching TIR.
+	 */
+	if (priv->sh->devx && !priv->sh->tdn) {
+		ret = mlx5_devx_cmd_qp_query_tis_td(txq_obj->qp, qp.tisn,
+						    &priv->sh->tdn);
+		if (ret) {
+			DRV_LOG(ERR, "Fail to query port %u Tx queue %u QP TIS "
+				"transport domain.", dev->data->port_id, idx);
+			rte_errno = EINVAL;
+			goto error;
+		} else {
+			DRV_LOG(DEBUG, "Port %u Tx queue %u TIS number %d "
+				"transport domain %d.", dev->data->port_id,
+				idx, qp.tisn, priv->sh->tdn);
+		}
+	}
+#endif
+	txq_ctrl->bf_reg = qp.bf.reg;
+	if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
+		txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
+		DRV_LOG(DEBUG, "Port %u: uar_mmap_offset 0x%" PRIx64 ".",
+			dev->data->port_id, txq_ctrl->uar_mmap_offset);
+	} else {
+		DRV_LOG(ERR,
+			"Port %u failed to retrieve UAR info, invalid"
+			" libmlx5.so",
+			dev->data->port_id);
+		rte_errno = EINVAL;
+		goto error;
+	}
+	txq_uar_init(txq_ctrl);
+	dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+	return 0;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	/*
+	 * Destroy the QP before the CQ it is attached to, the same order
+	 * mlx5_txq_ibv_obj_release() uses: a CQ cannot be destroyed while a
+	 * QP is still associated with it.
+	 */
+	if (txq_obj->qp) {
+		claim_zero(mlx5_glue->destroy_qp(txq_obj->qp));
+		txq_obj->qp = NULL;
+	}
+	if (txq_obj->cq) {
+		claim_zero(mlx5_glue->destroy_cq(txq_obj->cq));
+		txq_obj->cq = NULL;
+	}
+	rte_errno = ret; /* Restore rte_errno. */
+	return -rte_errno;
+}
+
+/**
+ * Create the dummy QP with minimal resources for loopback.
+ *
+ * Nothing is done unless loopback mode is configured on the device and both
+ * HAVE_IBV_DEVICE_TUNNEL_SUPPORT and HAVE_IBV_FLOW_DV_SUPPORT are defined.
+ * The CQ/QP pair lives in the shared device context and is reference
+ * counted: only the caller that raises the counter from zero creates it.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rxq_ibv_obj_dummy_lb_create(struct rte_eth_dev *dev)
+{
+#if defined(HAVE_IBV_DEVICE_TUNNEL_SUPPORT) && defined(HAVE_IBV_FLOW_DV_SUPPORT)
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct ibv_context *ctx = sh->ctx;
+	struct mlx5dv_qp_init_attr qp_init_attr = {0};
+	struct {
+		struct ibv_cq_init_attr_ex ibv;
+		struct mlx5dv_cq_init_attr mlx5;
+	} cq_attr = {{0}};
+	int err;
+
+	if (!dev->data->dev_conf.lpbk_mode)
+		return 0;
+	/* Allow packet sent from NIC loop back w/o source MAC check. */
+	qp_init_attr.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+	qp_init_attr.create_flags |=
+			MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
+	/* Only need to check refcnt, 0 after "sh" is allocated. */
+	if (!!(__atomic_fetch_add(&sh->self_lb.refcnt, 1, __ATOMIC_RELAXED))) {
+		MLX5_ASSERT(sh->self_lb.ibv_cq && sh->self_lb.qp);
+		priv->lb_used = 1;
+		return 0;
+	}
+	cq_attr.ibv = (struct ibv_cq_init_attr_ex){
+		.cqe = 1,
+		.channel = NULL,
+		.comp_mask = 0,
+	};
+	cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
+		.comp_mask = 0,
+	};
+	/* Only CQ is needed, no WQ(RQ) is required in this case. */
+	sh->self_lb.ibv_cq = mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(ctx,
+							&cq_attr.ibv,
+							&cq_attr.mlx5));
+	if (!sh->self_lb.ibv_cq) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(ERR, "Port %u cannot allocate CQ for loopback.",
+			dev->data->port_id);
+		goto error;
+	}
+	sh->self_lb.qp = mlx5_glue->dv_create_qp(ctx,
+				&(struct ibv_qp_init_attr_ex){
+					.qp_type = IBV_QPT_RAW_PACKET,
+					.comp_mask = IBV_QP_INIT_ATTR_PD,
+					.pd = sh->pd,
+					.send_cq = sh->self_lb.ibv_cq,
+					.recv_cq = sh->self_lb.ibv_cq,
+					.cap.max_recv_wr = 1,
+				},
+				&qp_init_attr);
+	if (!sh->self_lb.qp) {
+		/* Latch errno first: logging is allowed to modify it. */
+		rte_errno = errno;
+		DRV_LOG(DEBUG, "Port %u cannot allocate QP for loopback.",
+			dev->data->port_id);
+		goto error;
+	}
+	priv->lb_used = 1;
+	return 0;
+error:
+	err = rte_errno; /* Save rte_errno before cleanup. */
+	if (sh->self_lb.ibv_cq) {
+		claim_zero(mlx5_glue->destroy_cq(sh->self_lb.ibv_cq));
+		sh->self_lb.ibv_cq = NULL;
+	}
+	/* Drop the reference taken above. */
+	(void)__atomic_sub_fetch(&sh->self_lb.refcnt, 1, __ATOMIC_RELAXED);
+	rte_errno = err; /* Restore rte_errno. */
+	return -rte_errno;
+#else
+	RTE_SET_USED(dev);
+	return 0;
+#endif
+}
+
+/*
+ * Drop this port's reference on the shared loopback dummy queue.
+ *
+ * Does nothing when the port never took a reference. The QP and the CQ are
+ * destroyed once the shared reference counter reaches zero.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_rxq_ibv_obj_dummy_lb_release(struct rte_eth_dev *dev)
+{
+#if defined(HAVE_IBV_DEVICE_TUNNEL_SUPPORT) && defined(HAVE_IBV_FLOW_DV_SUPPORT)
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *shared = priv->sh;
+
+	if (!priv->lb_used)
+		return;
+	MLX5_ASSERT(__atomic_load_n(&shared->self_lb.refcnt,
+				    __ATOMIC_RELAXED));
+	if (__atomic_sub_fetch(&shared->self_lb.refcnt, 1,
+			       __ATOMIC_RELAXED) == 0) {
+		/* Last user gone: release the QP, then its CQ. */
+		if (shared->self_lb.qp != NULL) {
+			claim_zero(mlx5_glue->destroy_qp(shared->self_lb.qp));
+			shared->self_lb.qp = NULL;
+		}
+		if (shared->self_lb.ibv_cq != NULL) {
+			claim_zero(mlx5_glue->destroy_cq
+						(shared->self_lb.ibv_cq));
+			shared->self_lb.ibv_cq = NULL;
+		}
+	}
+	priv->lb_used = 0;
+#else
+	RTE_SET_USED(dev);
+	return;
+#endif
+}
+
+/**
+ * Release a Tx verbs queue object.
+ *
+ * The QP is destroyed first, then the CQ.
+ *
+ * @param txq_obj
+ *   Verbs Tx queue object.
+ */
+void
+mlx5_txq_ibv_obj_release(struct mlx5_txq_obj *txq_obj)
+{
+	struct ibv_qp *qp;
+	struct ibv_cq *cq;
+
+	MLX5_ASSERT(txq_obj);
+	qp = txq_obj->qp;
+	cq = txq_obj->cq;
+	claim_zero(mlx5_glue->destroy_qp(qp));
+	claim_zero(mlx5_glue->destroy_cq(cq));
+}
+
+/* Object operations table wired to the Verbs implementations above. */
+struct mlx5_obj_ops ibv_obj_ops = {
+	/* Rx queue objects. */
+	.rxq_obj_new = mlx5_rxq_ibv_obj_new,
+	.rxq_obj_modify = mlx5_ibv_modify_wq,
+	.rxq_obj_modify_vlan_strip = mlx5_rxq_obj_modify_wq_vlan_strip,
+	.rxq_event_get = mlx5_rx_ibv_get_event,
+	.rxq_obj_release = mlx5_rxq_ibv_obj_release,
+	/* Indirection tables and hash Rx queues. */
+	.ind_table_new = mlx5_ibv_ind_table_new,
+	.ind_table_destroy = mlx5_ibv_ind_table_destroy,
+	.hrxq_new = mlx5_ibv_hrxq_new,
+	.hrxq_destroy = mlx5_ibv_qp_destroy,
+	/* Drop action. */
+	.drop_action_create = mlx5_ibv_drop_action_create,
+	.drop_action_destroy = mlx5_ibv_drop_action_destroy,
+	/* Tx queue objects. */
+	.txq_obj_new = mlx5_txq_ibv_obj_new,
+	.txq_obj_modify = mlx5_ibv_modify_qp,
+	.txq_obj_release = mlx5_txq_ibv_obj_release,
+	/* Loopback dummy queue callbacks are left unset in this table. */
+	.lb_dummy_queue_create = NULL,
+	.lb_dummy_queue_release = NULL,
+};
diff --git a/drivers/net/mlx5/freebsd/mlx5_verbs.h b/drivers/net/mlx5/freebsd/mlx5_verbs.h
new file mode 100644
index 0000000000..f7e8e2fe98
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_verbs.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VERBS_H_
+#define RTE_PMD_MLX5_VERBS_H_
+
+#include "mlx5.h"
+
+int mlx5_txq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+void mlx5_txq_ibv_obj_release(struct mlx5_txq_obj *txq_obj);
+int mlx5_rxq_ibv_obj_dummy_lb_create(struct rte_eth_dev *dev);
+void mlx5_rxq_ibv_obj_dummy_lb_release(struct rte_eth_dev *dev);
+
+/* Verbs ops tables: extern declarations, defined outside this header. */
+extern const struct mlx5_mr_ops mlx5_mr_verbs_ops;
+extern struct mlx5_obj_ops ibv_obj_ops;
+#endif /* RTE_PMD_MLX5_VERBS_H_ */
-- 
2.30.2


  parent reply	other threads:[~2021-09-27 14:58 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-27 13:34 [dpdk-dev] [PATCH 00/19] MLX5 FreeBSD support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 01/19] common/mlx5: stub for FreeBSD Srikanth Kaka
2021-09-27 13:34 ` Srikanth Kaka [this message]
2021-09-27 13:34 ` [dpdk-dev] [PATCH 03/19] common/mlx5: disabling auxiliary bus support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 04/19] net/mlx5: " Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 05/19] net/mlx5: modified PCI probe to work on FreeBSD Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 06/19] common/mlx5: define PF_INET socket Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 07/19] net/mlx5: use the newly defined INET socket Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 08/19] common/mlx5: derive PCI addr in FreeBSD Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 09/19] common/mlx5: get interface name Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 10/19] net/mlx5: fix socket MAC request Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 11/19] net/mlx5: removing port representator support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 12/19] net/mlx5: Added procedure to detect link state Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 13/19] net/mlx5: added placeholder for VLAN vmwa Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 14/19] net/mlx5: added stats support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 15/19] net/mlx5: making flow control DPDK callback invalid Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 16/19] net/mlx5: making module DPDK callbacks invalid Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 17/19] common/mlx5: fixed missing dependency in mlx5_glue.h Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 18/19] net/mlx5: fixed compilation warnings Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 19/19] mlx5: Added meson support for FreeBSD Srikanth Kaka
2021-09-29 12:20 ` [dpdk-dev] [PATCH 00/19] MLX5 FreeBSD support Thomas Monjalon
2021-09-29 15:56   ` Srikanth K
2021-09-29 16:20     ` Thomas Monjalon
2021-09-30 16:27       ` Srikanth K
2021-09-30 16:55         ` Thomas Monjalon
2021-10-01 11:35           ` Srikanth K

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210927133450.10653-3-srikanth.k@oneconvergence.com \
    --to=srikanth.k@oneconvergence.com \
    --cc=avelu@juniper.net \
    --cc=dev@dpdk.org \
    --cc=matan@nvidia.com \
    --cc=vag.singh@oneconvergence.com \
    --cc=viacheslavo@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).