From: Srikanth Kaka <srikanth.k@oneconvergence.com>
To: Matan Azrad <matan@nvidia.com>,
Viacheslav Ovsiienko <viacheslavo@nvidia.com>
Cc: dev@dpdk.org, Vag Singh <vag.singh@oneconvergence.com>,
Anand Thulasiram <avelu@juniper.net>,
Srikanth Kaka <srikanth.k@oneconvergence.com>
Subject: [dpdk-dev] [PATCH 02/19] net/mlx5: stub for FreeBSD
Date: Mon, 27 Sep 2021 19:04:33 +0530 [thread overview]
Message-ID: <20210927133450.10653-3-srikanth.k@oneconvergence.com> (raw)
In-Reply-To: <20210927133450.10653-1-srikanth.k@oneconvergence.com>
These files are a copy of their Linux equivalents.
They will be ported to FreeBSD.
Signed-off-by: Srikanth Kaka <srikanth.k@oneconvergence.com>
Signed-off-by: Vag Singh <vag.singh@oneconvergence.com>
Signed-off-by: Anand Thulasiram <avelu@juniper.net>
---
drivers/net/mlx5/freebsd/mlx5_ethdev_os.c | 1632 +++++++++++
drivers/net/mlx5/freebsd/mlx5_flow_os.c | 38 +
drivers/net/mlx5/freebsd/mlx5_flow_os.h | 484 ++++
drivers/net/mlx5/freebsd/mlx5_mp_os.c | 305 ++
drivers/net/mlx5/freebsd/mlx5_os.c | 3208 +++++++++++++++++++++
drivers/net/mlx5/freebsd/mlx5_os.h | 24 +
drivers/net/mlx5/freebsd/mlx5_socket.c | 249 ++
drivers/net/mlx5/freebsd/mlx5_verbs.c | 1208 ++++++++
drivers/net/mlx5/freebsd/mlx5_verbs.h | 18 +
9 files changed, 7166 insertions(+)
create mode 100644 drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_flow_os.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_flow_os.h
create mode 100644 drivers/net/mlx5/freebsd/mlx5_mp_os.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_os.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_os.h
create mode 100644 drivers/net/mlx5/freebsd/mlx5_socket.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_verbs.c
create mode 100644 drivers/net/mlx5/freebsd/mlx5_verbs.h
diff --git a/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c b/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
new file mode 100644
index 0000000000..f34133e2c6
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_ethdev_os.c
@@ -0,0 +1,1632 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2015 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <dirent.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <fcntl.h>
+#include <stdalign.h>
+#include <sys/un.h>
+#include <time.h>
+
+#include <ethdev_driver.h>
+#include <rte_bus_pci.h>
+#include <rte_mbuf.h>
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_malloc.h>
+#include <rte_string_fns.h>
+#include <rte_rwlock.h>
+#include <rte_cycles.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_utils.h"
+
+/* Supported speed values found in /usr/include/linux/ethtool.h */
+#ifndef HAVE_SUPPORTED_40000baseKR4_Full
+#define SUPPORTED_40000baseKR4_Full (1 << 23)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseCR4_Full
+#define SUPPORTED_40000baseCR4_Full (1 << 24)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseSR4_Full
+#define SUPPORTED_40000baseSR4_Full (1 << 25)
+#endif
+#ifndef HAVE_SUPPORTED_40000baseLR4_Full
+#define SUPPORTED_40000baseLR4_Full (1 << 26)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseKR4_Full
+#define SUPPORTED_56000baseKR4_Full (1 << 27)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseCR4_Full
+#define SUPPORTED_56000baseCR4_Full (1 << 28)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseSR4_Full
+#define SUPPORTED_56000baseSR4_Full (1 << 29)
+#endif
+#ifndef HAVE_SUPPORTED_56000baseLR4_Full
+#define SUPPORTED_56000baseLR4_Full (1 << 30)
+#endif
+
+/* Add defines in case the running kernel is not the same as user headers. */
+#ifndef ETHTOOL_GLINKSETTINGS
+struct ethtool_link_settings {
+ uint32_t cmd;
+ uint32_t speed;
+ uint8_t duplex;
+ uint8_t port;
+ uint8_t phy_address;
+ uint8_t autoneg;
+ uint8_t mdio_support;
+ uint8_t eth_to_mdix;
+ uint8_t eth_tp_mdix_ctrl;
+ int8_t link_mode_masks_nwords;
+ uint32_t reserved[8];
+ uint32_t link_mode_masks[];
+};
+
+/* The kernel values can be found in /include/uapi/linux/ethtool.h */
+#define ETHTOOL_GLINKSETTINGS 0x0000004c
+#define ETHTOOL_LINK_MODE_1000baseT_Full_BIT 5
+#define ETHTOOL_LINK_MODE_Autoneg_BIT 6
+#define ETHTOOL_LINK_MODE_1000baseKX_Full_BIT 17
+#define ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT 18
+#define ETHTOOL_LINK_MODE_10000baseKR_Full_BIT 19
+#define ETHTOOL_LINK_MODE_10000baseR_FEC_BIT 20
+#define ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT 21
+#define ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT 22
+#define ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT 23
+#define ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT 24
+#define ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT 25
+#define ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT 26
+#define ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT 27
+#define ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT 28
+#define ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT 29
+#define ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT 30
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_25G
+#define ETHTOOL_LINK_MODE_25000baseCR_Full_BIT 31
+#define ETHTOOL_LINK_MODE_25000baseKR_Full_BIT 32
+#define ETHTOOL_LINK_MODE_25000baseSR_Full_BIT 33
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_50G
+#define ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT 34
+#define ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT 35
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_100G
+#define ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT 36
+#define ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT 37
+#define ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT 38
+#define ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT 39
+#endif
+#ifndef HAVE_ETHTOOL_LINK_MODE_200G
+#define ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT 62
+#define ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT 63
+#define ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT 0 /* 64 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT 1 /* 65 - 64 */
+#define ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT 2 /* 66 - 64 */
+#endif
+
+/* Get interface index from SubFunction device name. */
+int
+mlx5_auxiliary_get_ifindex(const char *sf_name)
+{
+ char if_name[IF_NAMESIZE] = { 0 };
+
+ if (mlx5_auxiliary_get_child_name(sf_name, "/net",
+ if_name, sizeof(if_name)) != 0)
+ return -rte_errno;
+ return if_nametoindex(if_name);
+}
+
+/**
+ * Get interface name from private structure.
+ *
+ * This is a port representor-aware version of mlx5_get_ifname_sysfs().
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] ifname
+ * Interface name output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_ifname(const struct rte_eth_dev *dev, char (*ifname)[MLX5_NAMESIZE])
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ unsigned int ifindex;
+
+ MLX5_ASSERT(priv);
+ MLX5_ASSERT(priv->sh);
+ if (priv->master && priv->sh->bond.ifindex > 0) {
+ memcpy(ifname, priv->sh->bond.ifname, MLX5_NAMESIZE);
+ return 0;
+ }
+ ifindex = mlx5_ifindex(dev);
+ if (!ifindex) {
+ if (!priv->representor)
+ return mlx5_get_ifname_sysfs(priv->sh->ibdev_path,
+ *ifname);
+ rte_errno = ENXIO;
+ return -rte_errno;
+ }
+ if (if_indextoname(ifindex, &(*ifname)[0]))
+ return 0;
+ rte_errno = errno;
+ return -rte_errno;
+}
+
+/**
+ * Perform ifreq ioctl() on associated netdev ifname.
+ *
+ * @param[in] ifname
+ * Pointer to netdev name.
+ * @param req
+ * Request number to pass to ioctl().
+ * @param[out] ifr
+ * Interface request structure output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ifreq_by_ifname(const char *ifname, int req, struct ifreq *ifr)
+{
+ int sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ int ret = 0;
+
+ if (sock == -1) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ rte_strscpy(ifr->ifr_name, ifname, sizeof(ifr->ifr_name));
+ ret = ioctl(sock, req, ifr);
+ if (ret == -1) {
+ rte_errno = errno;
+ goto error;
+ }
+ close(sock);
+ return 0;
+error:
+ close(sock);
+ return -rte_errno;
+}
+
+/**
+ * Perform ifreq ioctl() on associated Ethernet device.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param req
+ * Request number to pass to ioctl().
+ * @param[out] ifr
+ * Interface request structure output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ifreq(const struct rte_eth_dev *dev, int req, struct ifreq *ifr)
+{
+ char ifname[sizeof(ifr->ifr_name)];
+ int ret;
+
+ ret = mlx5_get_ifname(dev, &ifname);
+ if (ret)
+ return -rte_errno;
+ return mlx5_ifreq_by_ifname(ifname, req, ifr);
+}
+
+/**
+ * Get device MTU.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] mtu
+ * MTU value output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mtu(struct rte_eth_dev *dev, uint16_t *mtu)
+{
+ struct ifreq request;
+ int ret = mlx5_ifreq(dev, SIOCGIFMTU, &request);
+
+ if (ret)
+ return ret;
+ *mtu = request.ifr_mtu;
+ return 0;
+}
+
+/**
+ * Set device MTU.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param mtu
+ * MTU value to set.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct ifreq request = { .ifr_mtu = mtu, };
+
+ return mlx5_ifreq(dev, SIOCSIFMTU, &request);
+}
+
+/**
+ * Set device flags.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param keep
+ * Bitmask for flags that must remain untouched.
+ * @param flags
+ * Bitmask for flags to modify.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_set_flags(struct rte_eth_dev *dev, unsigned int keep, unsigned int flags)
+{
+ struct ifreq request;
+ int ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &request);
+
+ if (ret)
+ return ret;
+ request.ifr_flags &= keep;
+ request.ifr_flags |= flags & ~keep;
+ return mlx5_ifreq(dev, SIOCSIFFLAGS, &request);
+}
+
+/**
+ * Get device current raw clock counter
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] time
+ * Current raw clock counter of the device.
+ *
+ * @return
+ * 0 if the clock has correctly been read
+ * The value of errno in case of error
+ */
+int
+mlx5_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct ibv_values_ex values;
+ int err = 0;
+
+ values.comp_mask = IBV_VALUES_MASK_RAW_CLOCK;
+ err = mlx5_glue->query_rt_values_ex(ctx, &values);
+ if (err != 0) {
+ DRV_LOG(WARNING, "Could not query the clock !");
+ return err;
+ }
+ *clock = values.raw_clock.tv_nsec;
+ return 0;
+}
+
+/**
+ * Retrieve the master device for representor in the same switch domain.
+ *
+ * @param dev
+ * Pointer to representor Ethernet device structure.
+ *
+ * @return
+ * Master device structure on success, NULL otherwise.
+ */
+static struct rte_eth_dev *
+mlx5_find_master_dev(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv;
+ uint16_t port_id;
+ uint16_t domain_id;
+
+ priv = dev->data->dev_private;
+ domain_id = priv->domain_id;
+ MLX5_ASSERT(priv->representor);
+ MLX5_ETH_FOREACH_DEV(port_id, dev->device) {
+ struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+ if (opriv &&
+ opriv->master &&
+ opriv->domain_id == domain_id &&
+ opriv->sh == priv->sh)
+ return &rte_eth_devices[port_id];
+ }
+ return NULL;
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] link
+ * Storage for current link status.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gset(struct rte_eth_dev *dev,
+ struct rte_eth_link *link)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ethtool_cmd edata = {
+ .cmd = ETHTOOL_GSET /* Deprecated since Linux v4.5. */
+ };
+ struct ifreq ifr;
+ struct rte_eth_link dev_link;
+ int link_speed = 0;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link = (struct rte_eth_link) {
+ .link_status = ((ifr.ifr_flags & IFF_UP) &&
+ (ifr.ifr_flags & IFF_RUNNING)),
+ };
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&edata,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ if (ret == -ENOTSUP && priv->representor) {
+ struct rte_eth_dev *master;
+
+ /*
+ * For representors we can try to inherit link
+ * settings from the master device. Actually
+ * link settings do not make a lot of sense
+ * for representors due to missing physical
+ * link. The old kernel drivers supported
+ * emulated settings query for representors,
+ * the new ones do not, so we have to add
+ * this code for compatibility issues.
+ */
+ master = mlx5_find_master_dev(dev);
+ if (master) {
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&edata,
+ };
+ ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+ }
+ }
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL,"
+ " ETHTOOL_GSET) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ }
+ link_speed = ethtool_cmd_speed(&edata);
+ if (link_speed == -1)
+ dev_link.link_speed = ETH_SPEED_NUM_UNKNOWN;
+ else
+ dev_link.link_speed = link_speed;
+ priv->link_speed_capa = 0;
+ if (edata.supported & (SUPPORTED_1000baseT_Full |
+ SUPPORTED_1000baseKX_Full))
+ priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+ if (edata.supported & SUPPORTED_10000baseKR_Full)
+ priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+ if (edata.supported & (SUPPORTED_40000baseKR4_Full |
+ SUPPORTED_40000baseCR4_Full |
+ SUPPORTED_40000baseSR4_Full |
+ SUPPORTED_40000baseLR4_Full))
+ priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+ dev_link.link_duplex = ((edata.duplex == DUPLEX_HALF) ?
+ ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+ dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+ ETH_LINK_SPEED_FIXED);
+ *link = dev_link;
+ return 0;
+}
+
+/**
+ * Retrieve physical link information (unlocked version using new ioctl).
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] link
+ * Storage for current link status.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_link_update_unlocked_gs(struct rte_eth_dev *dev,
+ struct rte_eth_link *link)
+
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ethtool_link_settings gcmd = { .cmd = ETHTOOL_GLINKSETTINGS };
+ struct ifreq ifr;
+ struct rte_eth_link dev_link;
+ struct rte_eth_dev *master = NULL;
+ uint64_t sc;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFFLAGS, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCGIFFLAGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link = (struct rte_eth_link) {
+ .link_status = ((ifr.ifr_flags & IFF_UP) &&
+ (ifr.ifr_flags & IFF_RUNNING)),
+ };
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&gcmd,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ if (ret == -ENOTSUP && priv->representor) {
+ /*
+ * For representors we can try to inherit link
+ * settings from the master device. Actually
+ * link settings do not make a lot of sense
+ * for representors due to missing physical
+ * link. The old kernel drivers supported
+ * emulated settings query for representors,
+ * the new ones do not, so we have to add
+ * this code for compatibility issues.
+ */
+ master = mlx5_find_master_dev(dev);
+ if (master) {
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)&gcmd,
+ };
+ ret = mlx5_ifreq(master, SIOCETHTOOL, &ifr);
+ }
+ }
+ if (ret) {
+ DRV_LOG(DEBUG,
+ "port %u ioctl(SIOCETHTOOL,"
+ " ETHTOOL_GLINKSETTINGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ }
+ gcmd.link_mode_masks_nwords = -gcmd.link_mode_masks_nwords;
+
+ alignas(struct ethtool_link_settings)
+ uint8_t data[offsetof(struct ethtool_link_settings, link_mode_masks) +
+ sizeof(uint32_t) * gcmd.link_mode_masks_nwords * 3];
+ struct ethtool_link_settings *ecmd = (void *)data;
+
+ *ecmd = gcmd;
+ ifr.ifr_data = (void *)ecmd;
+ ret = mlx5_ifreq(master ? master : dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(DEBUG,
+ "port %u ioctl(SIOCETHTOOL,"
+ "ETHTOOL_GLINKSETTINGS) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ dev_link.link_speed = (ecmd->speed == UINT32_MAX) ?
+ ETH_SPEED_NUM_UNKNOWN : ecmd->speed;
+ sc = ecmd->link_mode_masks[0] |
+ ((uint64_t)ecmd->link_mode_masks[1] << 32);
+ priv->link_speed_capa = 0;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseT_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_1000baseKX_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_1G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseKR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_10000baseR_FEC_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_10G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_20G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_40G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_56G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseCR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseKR_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_25000baseSR_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_25G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_50G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_100G;
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+
+ sc = ecmd->link_mode_masks[2] |
+ ((uint64_t)ecmd->link_mode_masks[3] << 32);
+ if (sc & (MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT) |
+ MLX5_BITSHIFT
+ (ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT) |
+ MLX5_BITSHIFT(ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT)))
+ priv->link_speed_capa |= ETH_LINK_SPEED_200G;
+ dev_link.link_duplex = ((ecmd->duplex == DUPLEX_HALF) ?
+ ETH_LINK_HALF_DUPLEX : ETH_LINK_FULL_DUPLEX);
+ dev_link.link_autoneg = !(dev->data->dev_conf.link_speeds &
+ ETH_LINK_SPEED_FIXED);
+ *link = dev_link;
+ return 0;
+}
+
+/**
+ * DPDK callback to retrieve physical link information.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param wait_to_complete
+ * Wait for request completion.
+ *
+ * @return
+ * 0 if link status was not updated, positive if it was, a negative errno
+ * value otherwise and rte_errno is set.
+ */
+int
+mlx5_link_update(struct rte_eth_dev *dev, int wait_to_complete)
+{
+ int ret;
+ struct rte_eth_link dev_link;
+ time_t start_time = time(NULL);
+ int retry = MLX5_GET_LINK_STATUS_RETRY_COUNT;
+
+ do {
+ ret = mlx5_link_update_unlocked_gs(dev, &dev_link);
+ if (ret == -ENOTSUP)
+ ret = mlx5_link_update_unlocked_gset(dev, &dev_link);
+ if (ret == 0)
+ break;
+ /* Handle wait to complete situation. */
+ if ((wait_to_complete || retry) && ret == -EAGAIN) {
+ if (abs((int)difftime(time(NULL), start_time)) <
+ MLX5_LINK_STATUS_TIMEOUT) {
+ usleep(0);
+ continue;
+ } else {
+ rte_errno = EBUSY;
+ return -rte_errno;
+ }
+ } else if (ret < 0) {
+ return ret;
+ }
+ } while (wait_to_complete || retry-- > 0);
+ ret = !!memcmp(&dev->data->dev_link, &dev_link,
+ sizeof(struct rte_eth_link));
+ dev->data->dev_link = dev_link;
+ return ret;
+}
+
+/**
+ * DPDK callback to get flow control status.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] fc_conf
+ * Flow control output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_get_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_GPAUSEPARAM
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)ðpause;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL, ETHTOOL_GPAUSEPARAM) failed:"
+ " %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ fc_conf->autoneg = ethpause.autoneg;
+ if (ethpause.rx_pause && ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_FULL;
+ else if (ethpause.rx_pause)
+ fc_conf->mode = RTE_FC_RX_PAUSE;
+ else if (ethpause.tx_pause)
+ fc_conf->mode = RTE_FC_TX_PAUSE;
+ else
+ fc_conf->mode = RTE_FC_NONE;
+ return 0;
+}
+
+/**
+ * DPDK callback to modify flow control parameters.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[in] fc_conf
+ * Flow control parameters.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_dev_set_flow_ctrl(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf)
+{
+ struct ifreq ifr;
+ struct ethtool_pauseparam ethpause = {
+ .cmd = ETHTOOL_SPAUSEPARAM
+ };
+ int ret;
+
+ ifr.ifr_data = (void *)ðpause;
+ ethpause.autoneg = fc_conf->autoneg;
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_RX_PAUSE))
+ ethpause.rx_pause = 1;
+ else
+ ethpause.rx_pause = 0;
+
+ if (((fc_conf->mode & RTE_FC_FULL) == RTE_FC_FULL) ||
+ (fc_conf->mode & RTE_FC_TX_PAUSE))
+ ethpause.tx_pause = 1;
+ else
+ ethpause.tx_pause = 0;
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u ioctl(SIOCETHTOOL, ETHTOOL_SPAUSEPARAM)"
+ " failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * Handle asynchronous removal event for entire multiport device.
+ *
+ * @param sh
+ * Infiniband device shared context.
+ */
+static void
+mlx5_dev_interrupt_device_fatal(struct mlx5_dev_ctx_shared *sh)
+{
+ uint32_t i;
+
+ for (i = 0; i < sh->max_port; ++i) {
+ struct rte_eth_dev *dev;
+
+ if (sh->port[i].ih_port_id >= RTE_MAX_ETHPORTS) {
+ /*
+ * Or not existing port either no
+ * handler installed for this port.
+ */
+ continue;
+ }
+ dev = &rte_eth_devices[sh->port[i].ih_port_id];
+ MLX5_ASSERT(dev);
+ if (dev->data->dev_conf.intr_conf.rmv)
+ rte_eth_dev_callback_process
+ (dev, RTE_ETH_EVENT_INTR_RMV, NULL);
+ }
+}
+
+/**
+ * Handle shared asynchronous events the NIC (removal event
+ * and link status change). Supports multiport IB device.
+ *
+ * @param cb_arg
+ * Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler(void *cb_arg)
+{
+ struct mlx5_dev_ctx_shared *sh = cb_arg;
+ struct ibv_async_event event;
+
+ /* Read all message from the IB device and acknowledge them. */
+ for (;;) {
+ struct rte_eth_dev *dev;
+ uint32_t tmp;
+
+ if (mlx5_glue->get_async_event(sh->ctx, &event))
+ break;
+ /* Retrieve and check IB port index. */
+ tmp = (uint32_t)event.element.port_num;
+ if (!tmp && event.event_type == IBV_EVENT_DEVICE_FATAL) {
+ /*
+ * The DEVICE_FATAL event is called once for
+ * entire device without port specifying.
+ * We should notify all existing ports.
+ */
+ mlx5_glue->ack_async_event(&event);
+ mlx5_dev_interrupt_device_fatal(sh);
+ continue;
+ }
+ MLX5_ASSERT(tmp && (tmp <= sh->max_port));
+ if (!tmp) {
+ /* Unsupported device level event. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+ "unsupported common event (type %d)",
+ event.event_type);
+ continue;
+ }
+ if (tmp > sh->max_port) {
+ /* Invalid IB port index. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+ "cannot handle an event (type %d)"
+ "due to invalid IB port index (%u)",
+ event.event_type, tmp);
+ continue;
+ }
+ if (sh->port[tmp - 1].ih_port_id >= RTE_MAX_ETHPORTS) {
+ /* No handler installed. */
+ mlx5_glue->ack_async_event(&event);
+ DRV_LOG(DEBUG,
+ "cannot handle an event (type %d)"
+ "due to no handler installed for port %u",
+ event.event_type, tmp);
+ continue;
+ }
+ /* Retrieve ethernet device descriptor. */
+ tmp = sh->port[tmp - 1].ih_port_id;
+ dev = &rte_eth_devices[tmp];
+ MLX5_ASSERT(dev);
+ if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+ event.event_type == IBV_EVENT_PORT_ERR) &&
+ dev->data->dev_conf.intr_conf.lsc) {
+ mlx5_glue->ack_async_event(&event);
+ if (mlx5_link_update(dev, 0) == -EAGAIN) {
+ usleep(0);
+ continue;
+ }
+ rte_eth_dev_callback_process
+ (dev, RTE_ETH_EVENT_INTR_LSC, NULL);
+ continue;
+ }
+ DRV_LOG(DEBUG,
+ "port %u cannot handle an unknown event (type %d)",
+ dev->data->port_id, event.event_type);
+ mlx5_glue->ack_async_event(&event);
+ }
+}
+
+/*
+ * Unregister callback handler safely. The handler may be active
+ * while we are trying to unregister it, in this case code -EAGAIN
+ * is returned by rte_intr_callback_unregister(). This routine checks
+ * the return code and tries to unregister handler again.
+ *
+ * @param handle
+ * interrupt handle
+ * @param cb_fn
+ * pointer to callback routine
+ * @cb_arg
+ * opaque callback parameter
+ */
+void
+mlx5_intr_callback_unregister(const struct rte_intr_handle *handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+ /*
+ * Try to reduce timeout management overhead by not calling
+ * the timer related routines on the first iteration. If the
+ * unregistering succeeds on first call there will be no
+ * timer calls at all.
+ */
+ uint64_t twait = 0;
+ uint64_t start = 0;
+
+ do {
+ int ret;
+
+ ret = rte_intr_callback_unregister(handle, cb_fn, cb_arg);
+ if (ret >= 0)
+ return;
+ if (ret != -EAGAIN) {
+ DRV_LOG(INFO, "failed to unregister interrupt"
+ " handler (error: %d)", ret);
+ MLX5_ASSERT(false);
+ return;
+ }
+ if (twait) {
+ struct timespec onems;
+
+ /* Wait one millisecond and try again. */
+ onems.tv_sec = 0;
+ onems.tv_nsec = NS_PER_S / MS_PER_S;
+ nanosleep(&onems, 0);
+ /* Check whether one second elapsed. */
+ if ((rte_get_timer_cycles() - start) <= twait)
+ continue;
+ } else {
+ /*
+ * We get the amount of timer ticks for one second.
+ * If this amount elapsed it means we spent one
+ * second in waiting. This branch is executed once
+ * on first iteration.
+ */
+ twait = rte_get_timer_hz();
+ MLX5_ASSERT(twait);
+ }
+ /*
+ * Timeout elapsed, show message (once a second) and retry.
+ * We have no other acceptable option here, if we ignore
+ * the unregistering return code the handler will not
+ * be unregistered, fd will be closed and we may get the
+ * crush. Hanging and messaging in the loop seems not to be
+ * the worst choice.
+ */
+ DRV_LOG(INFO, "Retrying to unregister interrupt handler");
+ start = rte_get_timer_cycles();
+ } while (true);
+}
+
+/**
+ * Handle DEVX interrupts from the NIC.
+ * This function is probably called from the DPDK host thread.
+ *
+ * @param cb_arg
+ * Callback argument.
+ */
+void
+mlx5_dev_interrupt_handler_devx(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_ASYNC
+ (void)cb_arg;
+ return;
+#else
+ struct mlx5_dev_ctx_shared *sh = cb_arg;
+ union {
+ struct mlx5dv_devx_async_cmd_hdr cmd_resp;
+ uint8_t buf[MLX5_ST_SZ_BYTES(query_flow_counter_out) +
+ MLX5_ST_SZ_BYTES(traffic_counter) +
+ sizeof(struct mlx5dv_devx_async_cmd_hdr)];
+ } out;
+ uint8_t *buf = out.buf + sizeof(out.cmd_resp);
+
+ while (!mlx5_glue->devx_get_async_cmd_comp(sh->devx_comp,
+ &out.cmd_resp,
+ sizeof(out.buf)))
+ mlx5_flow_async_pool_query_handle
+ (sh, (uint64_t)out.cmd_resp.wr_id,
+ mlx5_devx_get_out_command_status(buf));
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+/**
+ * DPDK callback to bring the link DOWN.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_down(struct rte_eth_dev *dev)
+{
+ return mlx5_set_flags(dev, ~IFF_UP, ~IFF_UP);
+}
+
+/**
+ * DPDK callback to bring the link UP.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_set_link_up(struct rte_eth_dev *dev)
+{
+ return mlx5_set_flags(dev, ~IFF_UP, IFF_UP);
+}
+
+/**
+ * Check if mlx5 device was removed.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ * @return
+ * 1 when device is removed, otherwise 0.
+ */
+int
+mlx5_is_removed(struct rte_eth_dev *dev)
+{
+ struct ibv_device_attr device_attr;
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (mlx5_glue->query_device(priv->sh->ctx, &device_attr) == EIO)
+ return 1;
+ return 0;
+}
+
+/**
+ * Analyze gathered port parameters via sysfs to recognize master
+ * and representor devices for E-Switch configuration.
+ *
+ * @param[in] device_dir
+ * flag of presence of "device" directory under port device key.
+ * @param[inout] switch_info
+ * Port information, including port name as a number and port name
+ * type if recognized
+ *
+ * @return
+ * master and representor flags are set in switch_info according to
+ * recognized parameters (if any).
+ */
+static void
+mlx5_sysfs_check_switch_info(bool device_dir,
+ struct mlx5_switch_info *switch_info)
+{
+ switch (switch_info->name_type) {
+ case MLX5_PHYS_PORT_NAME_TYPE_UNKNOWN:
+ /*
+ * Name is not recognized, assume the master,
+ * check the device directory presence.
+ */
+ switch_info->master = device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_NOTSET:
+ /*
+ * Name is not set, this assumes the legacy naming
+ * schema for master, just check if there is
+ * a device directory.
+ */
+ switch_info->master = device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+ /* New uplink naming schema recognized. */
+ switch_info->master = 1;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_LEGACY:
+ /* Legacy representors naming schema. */
+ switch_info->representor = !device_dir;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+ /* Fallthrough */
+ case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+ /* Fallthrough */
+ case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+ /* New representors naming schema. */
+ switch_info->representor = 1;
+ break;
+ default:
+ switch_info->master = device_dir;
+ break;
+ }
+}
+
+/**
+ * Get switch information associated with network interface.
+ *
+ * @param ifindex
+ * Network interface index.
+ * @param[out] info
+ * Switch information object, populated in case of success.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_sysfs_switch_info(unsigned int ifindex, struct mlx5_switch_info *info)
+{
+ char ifname[IF_NAMESIZE];
+ char port_name[IF_NAMESIZE];
+ FILE *file;
+ struct mlx5_switch_info data = {
+ .master = 0,
+ .representor = 0,
+ .name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET,
+ .port_name = 0,
+ .switch_id = 0,
+ };
+ DIR *dir;
+ bool port_switch_id_set = false;
+ bool device_dir = false;
+ char c;
+ int ret;
+
+ if (!if_indextoname(ifindex, ifname)) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+
+ MKSTR(phys_port_name, "/sys/class/net/%s/phys_port_name",
+ ifname);
+ MKSTR(phys_switch_id, "/sys/class/net/%s/phys_switch_id",
+ ifname);
+ MKSTR(pci_device, "/sys/class/net/%s/device",
+ ifname);
+
+ file = fopen(phys_port_name, "rb");
+ if (file != NULL) {
+ ret = fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", port_name);
+ fclose(file);
+ if (ret == 1)
+ mlx5_translate_port_name(port_name, &data);
+ }
+ file = fopen(phys_switch_id, "rb");
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ port_switch_id_set =
+ fscanf(file, "%" SCNx64 "%c", &data.switch_id, &c) == 2 &&
+ c == '\n';
+ fclose(file);
+ dir = opendir(pci_device);
+ if (dir != NULL) {
+ closedir(dir);
+ device_dir = true;
+ }
+ if (port_switch_id_set) {
+ /* We have some E-Switch configuration. */
+ mlx5_sysfs_check_switch_info(device_dir, &data);
+ }
+ *info = data;
+ MLX5_ASSERT(!(data.master && data.representor));
+ if (data.master && data.representor) {
+ DRV_LOG(ERR, "ifindex %u device is recognized as master"
+ " and as representor", ifindex);
+ rte_errno = ENODEV;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Get bond information associated with network interface.
+ *
+ * @param pf_ifindex
+ * Network interface index of bond slave interface
+ * @param[out] ifindex
+ * Pointer to bond ifindex.
+ * @param[out] ifname
+ * Pointer to bond ifname.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_sysfs_bond_info(unsigned int pf_ifindex, unsigned int *ifindex,
+ char *ifname)
+{
+ char name[IF_NAMESIZE];
+ FILE *file;
+ unsigned int index;
+ int ret;
+
+ if (!if_indextoname(pf_ifindex, name) || !strlen(name)) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ MKSTR(bond_if, "/sys/class/net/%s/master/ifindex", name);
+ /* read bond ifindex */
+ file = fopen(bond_if, "rb");
+ if (file == NULL) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ ret = fscanf(file, "%u", &index);
+ fclose(file);
+ if (ret <= 0) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ if (ifindex)
+ *ifindex = index;
+
+ /* read bond device name from symbol link */
+ if (ifname) {
+ if (!if_indextoname(index, ifname)) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ }
+ return 0;
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM information (type and size).
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] modinfo
+ * Storage for plug-in module EEPROM information.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_module_info(struct rte_eth_dev *dev,
+ struct rte_eth_dev_module_info *modinfo)
+{
+ struct ethtool_modinfo info = {
+ .cmd = ETHTOOL_GMODULEINFO,
+ };
+ struct ifreq ifr = (struct ifreq) {
+ .ifr_data = (void *)&info,
+ };
+ int ret = 0;
+
+ if (!dev) {
+ DRV_LOG(WARNING, "missing argument, cannot get module info");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ return ret;
+ }
+ modinfo->type = info.type;
+ modinfo->eeprom_len = info.eeprom_len;
+ return ret;
+}
+
+/**
+ * DPDK callback to retrieve plug-in module EEPROM data.
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param[out] info
+ * Storage for plug-in module EEPROM data.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_get_module_eeprom(struct rte_eth_dev *dev,
+ struct rte_dev_eeprom_info *info)
+{
+ struct ethtool_eeprom *eeprom;
+ struct ifreq ifr;
+ int ret = 0;
+
+ if (!dev) {
+ DRV_LOG(WARNING, "missing argument, cannot get module eeprom");
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ eeprom = mlx5_malloc(MLX5_MEM_ZERO,
+ (sizeof(struct ethtool_eeprom) + info->length), 0,
+ SOCKET_ID_ANY);
+ if (!eeprom) {
+ DRV_LOG(WARNING, "port %u cannot allocate memory for "
+ "eeprom data", dev->data->port_id);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ eeprom->cmd = ETHTOOL_GMODULEEEPROM;
+ eeprom->offset = info->offset;
+ eeprom->len = info->length;
+ ifr = (struct ifreq) {
+ .ifr_data = (void *)eeprom,
+ };
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret)
+ DRV_LOG(WARNING, "port %u ioctl(SIOCETHTOOL) failed: %s",
+ dev->data->port_id, strerror(rte_errno));
+ else
+ rte_memcpy(info->data, eeprom->data, info->length);
+ mlx5_free(eeprom);
+ return ret;
+}
+
+/**
+ * Read device counters table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[in] pf
+ * PF index in case of bonding device, -1 otherwise
+ * @param[out] stats
+ * Counters table output buffer.
+ *
+ * @return
+ * 0 on success and stats is filled, negative errno value otherwise and
+ * rte_errno is set.
+ */
+static int
+_mlx5_os_read_dev_counters(struct rte_eth_dev *dev, int pf, uint64_t *stats)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ unsigned int i;
+ struct ifreq ifr;
+ unsigned int stats_sz = xstats_ctrl->stats_n * sizeof(uint64_t);
+ unsigned char et_stat_buf[sizeof(struct ethtool_stats) + stats_sz];
+ struct ethtool_stats *et_stats = (struct ethtool_stats *)et_stat_buf;
+ int ret;
+
+ et_stats->cmd = ETHTOOL_GSTATS;
+ et_stats->n_stats = xstats_ctrl->stats_n;
+ ifr.ifr_data = (caddr_t)et_stats;
+ if (pf >= 0)
+ ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[pf].ifname,
+ SIOCETHTOOL, &ifr);
+ else
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING,
+ "port %u unable to read statistic values from device",
+ dev->data->port_id);
+ return ret;
+ }
+ for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+ if (xstats_ctrl->info[i].dev)
+ continue;
+ stats[i] += (uint64_t)
+ et_stats->data[xstats_ctrl->dev_table_idx[i]];
+ }
+ return 0;
+}
+
+/**
+ * Read device counters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param[out] stats
+ * Counters table output buffer.
+ *
+ * @return
+ * 0 on success and stats is filled, negative errno value otherwise and
+ * rte_errno is set.
+ */
+int
+mlx5_os_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ int ret = 0, i;
+
+ memset(stats, 0, sizeof(*stats) * xstats_ctrl->mlx5_stats_n);
+ /* Read ifreq counters. */
+ if (priv->master && priv->pf_bond >= 0) {
+ /* Sum xstats from bonding device member ports. */
+ for (i = 0; i < priv->sh->bond.n_port; i++) {
+ ret = _mlx5_os_read_dev_counters(dev, i, stats);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ret = _mlx5_os_read_dev_counters(dev, -1, stats);
+ }
+ /* Read IB counters. */
+ for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) {
+ if (!xstats_ctrl->info[i].dev)
+ continue;
+ ret = mlx5_os_read_dev_stat(priv, xstats_ctrl->info[i].ctr_name,
+ &stats[i]);
+ /* return last xstats counter if fail to read. */
+ if (ret != 0)
+ xstats_ctrl->xstats[i] = stats[i];
+ else
+ stats[i] = xstats_ctrl->xstats[i];
+ }
+ return ret;
+}
+
+/**
+ * Query the number of statistics provided by ETHTOOL.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * Number of statistics on success, negative errno value otherwise and
+ * rte_errno is set.
+ */
+int
+mlx5_os_get_stats_n(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ethtool_drvinfo drvinfo;
+ struct ifreq ifr;
+ int ret;
+
+ drvinfo.cmd = ETHTOOL_GDRVINFO;
+ ifr.ifr_data = (caddr_t)&drvinfo;
+ if (priv->master && priv->pf_bond >= 0)
+ /* Bonding PF. */
+ ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+ SIOCETHTOOL, &ifr);
+ else
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u unable to query number of statistics",
+ dev->data->port_id);
+ return ret;
+ }
+ return drvinfo.n_stats;
+}
+
+static const struct mlx5_counter_ctrl mlx5_counters_init[] = {
+ {
+ .dpdk_name = "rx_unicast_bytes",
+ .ctr_name = "rx_vport_unicast_bytes",
+ },
+ {
+ .dpdk_name = "rx_multicast_bytes",
+ .ctr_name = "rx_vport_multicast_bytes",
+ },
+ {
+ .dpdk_name = "rx_broadcast_bytes",
+ .ctr_name = "rx_vport_broadcast_bytes",
+ },
+ {
+ .dpdk_name = "rx_unicast_packets",
+ .ctr_name = "rx_vport_unicast_packets",
+ },
+ {
+ .dpdk_name = "rx_multicast_packets",
+ .ctr_name = "rx_vport_multicast_packets",
+ },
+ {
+ .dpdk_name = "rx_broadcast_packets",
+ .ctr_name = "rx_vport_broadcast_packets",
+ },
+ {
+ .dpdk_name = "tx_unicast_bytes",
+ .ctr_name = "tx_vport_unicast_bytes",
+ },
+ {
+ .dpdk_name = "tx_multicast_bytes",
+ .ctr_name = "tx_vport_multicast_bytes",
+ },
+ {
+ .dpdk_name = "tx_broadcast_bytes",
+ .ctr_name = "tx_vport_broadcast_bytes",
+ },
+ {
+ .dpdk_name = "tx_unicast_packets",
+ .ctr_name = "tx_vport_unicast_packets",
+ },
+ {
+ .dpdk_name = "tx_multicast_packets",
+ .ctr_name = "tx_vport_multicast_packets",
+ },
+ {
+ .dpdk_name = "tx_broadcast_packets",
+ .ctr_name = "tx_vport_broadcast_packets",
+ },
+ {
+ .dpdk_name = "rx_wqe_errors",
+ .ctr_name = "rx_wqe_err",
+ },
+ {
+ .dpdk_name = "rx_phy_crc_errors",
+ .ctr_name = "rx_crc_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_phy_in_range_len_errors",
+ .ctr_name = "rx_in_range_len_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_phy_symbol_errors",
+ .ctr_name = "rx_symbol_err_phy",
+ },
+ {
+ .dpdk_name = "tx_phy_errors",
+ .ctr_name = "tx_errors_phy",
+ },
+ {
+ .dpdk_name = "rx_out_of_buffer",
+ .ctr_name = "out_of_buffer",
+ .dev = 1,
+ },
+ {
+ .dpdk_name = "tx_phy_packets",
+ .ctr_name = "tx_packets_phy",
+ },
+ {
+ .dpdk_name = "rx_phy_packets",
+ .ctr_name = "rx_packets_phy",
+ },
+ {
+ .dpdk_name = "tx_phy_discard_packets",
+ .ctr_name = "tx_discards_phy",
+ },
+ {
+ .dpdk_name = "rx_phy_discard_packets",
+ .ctr_name = "rx_discards_phy",
+ },
+ {
+ .dpdk_name = "tx_phy_bytes",
+ .ctr_name = "tx_bytes_phy",
+ },
+ {
+ .dpdk_name = "rx_phy_bytes",
+ .ctr_name = "rx_bytes_phy",
+ },
+ /* Representor only */
+ {
+ .dpdk_name = "rx_vport_packets",
+ .ctr_name = "vport_rx_packets",
+ },
+ {
+ .dpdk_name = "rx_vport_bytes",
+ .ctr_name = "vport_rx_bytes",
+ },
+ {
+ .dpdk_name = "tx_vport_packets",
+ .ctr_name = "vport_tx_packets",
+ },
+ {
+ .dpdk_name = "tx_vport_bytes",
+ .ctr_name = "vport_tx_bytes",
+ },
+};
+
+static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init);
+
+/**
+ * Init the structures to read device counters.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_os_stats_init(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_xstats_ctrl *xstats_ctrl = &priv->xstats_ctrl;
+ struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl;
+ unsigned int i;
+ unsigned int j;
+ struct ifreq ifr;
+ struct ethtool_gstrings *strings = NULL;
+ unsigned int dev_stats_n;
+ unsigned int str_sz;
+ int ret;
+
+ /* So that it won't aggregate for each init. */
+ xstats_ctrl->mlx5_stats_n = 0;
+ ret = mlx5_os_get_stats_n(dev);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "port %u no extended statistics available",
+ dev->data->port_id);
+ return;
+ }
+ dev_stats_n = ret;
+ /* Allocate memory to grab stat names and values. */
+ str_sz = dev_stats_n * ETH_GSTRING_LEN;
+ strings = (struct ethtool_gstrings *)
+ mlx5_malloc(0, str_sz + sizeof(struct ethtool_gstrings), 0,
+ SOCKET_ID_ANY);
+ if (!strings) {
+ DRV_LOG(WARNING, "port %u unable to allocate memory for xstats",
+ dev->data->port_id);
+ return;
+ }
+ strings->cmd = ETHTOOL_GSTRINGS;
+ strings->string_set = ETH_SS_STATS;
+ strings->len = dev_stats_n;
+ ifr.ifr_data = (caddr_t)strings;
+ if (priv->master && priv->pf_bond >= 0)
+ /* Bonding master. */
+ ret = mlx5_ifreq_by_ifname(priv->sh->bond.ports[0].ifname,
+ SIOCETHTOOL, &ifr);
+ else
+ ret = mlx5_ifreq(dev, SIOCETHTOOL, &ifr);
+ if (ret) {
+ DRV_LOG(WARNING, "port %u unable to get statistic names",
+ dev->data->port_id);
+ goto free;
+ }
+ for (i = 0; i != dev_stats_n; ++i) {
+ const char *curr_string = (const char *)
+ &strings->data[i * ETH_GSTRING_LEN];
+
+ for (j = 0; j != xstats_n; ++j) {
+ if (!strcmp(mlx5_counters_init[j].ctr_name,
+ curr_string)) {
+ unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+ xstats_ctrl->dev_table_idx[idx] = i;
+ xstats_ctrl->info[idx] = mlx5_counters_init[j];
+ break;
+ }
+ }
+ }
+ /* Add dev counters. */
+ for (i = 0; i != xstats_n; ++i) {
+ if (mlx5_counters_init[i].dev) {
+ unsigned int idx = xstats_ctrl->mlx5_stats_n++;
+
+ xstats_ctrl->info[idx] = mlx5_counters_init[i];
+ xstats_ctrl->hw_stats[idx] = 0;
+ }
+ }
+ MLX5_ASSERT(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS);
+ xstats_ctrl->stats_n = dev_stats_n;
+ /* Copy to base at first time. */
+ ret = mlx5_os_read_dev_counters(dev, xstats_ctrl->base);
+ if (ret)
+ DRV_LOG(ERR, "port %u cannot read device counters: %s",
+ dev->data->port_id, strerror(rte_errno));
+ mlx5_os_read_dev_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base);
+ stats_ctrl->imissed = 0;
+free:
+ mlx5_free(strings);
+}
+
+/**
+ * Get MAC address by querying netdevice.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ * @param[out] mac
+ * MAC address output buffer.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_get_mac(struct rte_eth_dev *dev, uint8_t (*mac)[RTE_ETHER_ADDR_LEN])
+{
+ struct ifreq request;
+ int ret;
+
+ ret = mlx5_ifreq(dev, SIOCGIFHWADDR, &request);
+ if (ret)
+ return ret;
+ memcpy(mac, request.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
+ return 0;
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_flow_os.c b/drivers/net/mlx5/freebsd/mlx5_flow_os.c
new file mode 100644
index 0000000000..893f00b824
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_flow_os.c
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include "mlx5_flow_os.h"
+
+#include <rte_thread.h>
+
+/* Key of thread specific flow workspace data. */
+static rte_thread_key key_workspace;
+
+int
+mlx5_flow_os_init_workspace_once(void)
+{
+ if (rte_thread_key_create(&key_workspace, flow_release_workspace)) {
+ DRV_LOG(ERR, "Can't create flow workspace data thread key.");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void *
+mlx5_flow_os_get_specific_workspace(void)
+{
+ return rte_thread_value_get(key_workspace);
+}
+
+int
+mlx5_flow_os_set_specific_workspace(struct mlx5_flow_workspace *data)
+{
+ return rte_thread_value_set(key_workspace, data);
+}
+
+void
+mlx5_flow_os_release_workspace(void)
+{
+ rte_thread_key_delete(key_workspace);
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_flow_os.h b/drivers/net/mlx5/freebsd/mlx5_flow_os.h
new file mode 100644
index 0000000000..1926d26410
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_flow_os.h
@@ -0,0 +1,484 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_FLOW_OS_H_
+#define RTE_PMD_MLX5_FLOW_OS_H_
+
+#include "mlx5_flow.h"
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+extern const struct mlx5_flow_driver_ops mlx5_flow_dv_drv_ops;
+#endif
+
+/**
+ * Get OS enforced flow type. MLX5_FLOW_TYPE_MAX means "non enforced type".
+ *
+ * @return
+ * Flow type (MLX5_FLOW_TYPE_MAX)
+ */
+static inline enum mlx5_flow_drv_type
+mlx5_flow_os_get_type(void)
+{
+ return MLX5_FLOW_TYPE_MAX;
+}
+
+/**
+ * Check if item type is supported.
+ *
+ * @param item
+ * Item type to check.
+ *
+ * @return
+ * True is this item type is supported, false if not supported.
+ */
+static inline bool
+mlx5_flow_os_item_supported(int item __rte_unused)
+{
+ return true;
+}
+
+/**
+ * Check if action type is supported.
+ *
+ * @param action
+ * Action type to check.
+ *
+ * @return
+ * True is this action type is supported, false if not supported.
+ */
+static inline bool
+mlx5_flow_os_action_supported(int action __rte_unused)
+{
+ return true;
+}
+
+/**
+ * Create flow rule.
+ *
+ * @param[in] matcher
+ * Pointer to match mask structure.
+ * @param[in] match_value
+ * Pointer to match value structure.
+ * @param[in] num_actions
+ * Number of actions in flow rule.
+ * @param[in] actions
+ * Pointer to array of flow rule actions.
+ * @param[out] flow
+ * Pointer to a valid flow rule object on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow(void *matcher, void *match_value,
+ size_t num_actions, void *actions[], void **flow)
+{
+ *flow = mlx5_glue->dv_create_flow(matcher, match_value,
+ num_actions, actions);
+ return (*flow) ? 0 : -1;
+}
+
+/**
+ * Destroy flow rule.
+ *
+ * @param[in] drv_flow_ptr
+ * Pointer to flow rule object.
+ *
+ * @return
+ * 0 on success, or the value of errno on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow(void *drv_flow_ptr)
+{
+ return mlx5_glue->dv_destroy_flow(drv_flow_ptr);
+}
+
+/**
+ * Create flow table.
+ *
+ * @param[in] domain
+ * Pointer to relevant domain.
+ * @param[in] table_id
+ * Table ID.
+ * @param[out] table
+ * Pointer to a valid flow table object on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_tbl(void *domain, uint32_t table_id, void **table)
+{
+ *table = mlx5_glue->dr_create_flow_tbl(domain, table_id);
+ return (*table) ? 0 : -1;
+}
+
+/**
+ * Destroy flow table.
+ *
+ * @param[in] table
+ * Pointer to table object to destroy.
+ *
+ * @return
+ * 0 on success, or the value of errno on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_tbl(void *table)
+{
+ return mlx5_glue->dr_destroy_flow_tbl(table);
+}
+
+/**
+ * Create flow matcher in a flow table.
+ *
+ * @param[in] ctx
+ * Pointer to relevant device context.
+ * @param[in] attr
+ * Pointer to relevant attributes.
+ * @param[in] table
+ * Pointer to table object.
+ * @param[out] matcher
+ * Pointer to a valid flow matcher object on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_matcher(void *ctx, void *attr, void *table,
+ void **matcher)
+{
+ *matcher = mlx5_glue->dv_create_flow_matcher(ctx, attr, table);
+ return (*matcher) ? 0 : -1;
+}
+
+/**
+ * Destroy flow matcher.
+ *
+ * @param[in] matcher
+ * Pointer to matcher object to destroy.
+ *
+ * @return
+ * 0 on success, or the value of errno on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_matcher(void *matcher)
+{
+ return mlx5_glue->dv_destroy_flow_matcher(matcher);
+}
+
+/**
+ * Create flow action: packet reformat.
+ *
+ * @param[in] ctx
+ * Pointer to relevant device context.
+ * @param[in] domain
+ * Pointer to domain handler.
+ * @param[in] resource
+ * Pointer to action data resource.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_packet_reformat(void *ctx, void *domain,
+ void *resource, void **action)
+{
+ struct mlx5_flow_dv_encap_decap_resource *res =
+ (struct mlx5_flow_dv_encap_decap_resource *)resource;
+
+ *action = mlx5_glue->dv_create_flow_action_packet_reformat
+ (ctx, res->reformat_type, res->ft_type,
+ domain, res->flags, res->size,
+ (res->size ? res->buf : NULL));
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: modify header.
+ *
+ * @param[in] ctx
+ * Pointer to relevant device context.
+ * @param[in] domain
+ * Pointer to domain handler.
+ * @param[in] resource
+ * Pointer to action data resource.
+ * @param[in] actions_len
+ * Total length of actions data in resource.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_modify_header(void *ctx, void *domain,
+ void *resource,
+ uint32_t actions_len,
+ void **action)
+{
+ struct mlx5_flow_dv_modify_hdr_resource *res =
+ (struct mlx5_flow_dv_modify_hdr_resource *)resource;
+
+ *action = mlx5_glue->dv_create_flow_action_modify_header
+ (ctx, res->ft_type, domain, res->root ?
+ MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL : 0,
+ actions_len, (uint64_t *)res->actions);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: destination flow table.
+ *
+ * @param[in] tbl_obj
+ * Pointer to destination table object.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_flow_tbl(void *tbl_obj, void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_dest_flow_tbl(tbl_obj);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: destination port.
+ *
+ * @param[in] domain
+ * Pointer to domain handler.
+ * @param[in] port_id
+ * Destination port ID.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_port(void *domain, uint32_t port_id,
+ void **action)
+{
+ /*
+ * Depending on rdma_core version the glue routine calls
+ * either mlx5dv_dr_action_create_dest_ib_port(domain, dev_port)
+ * or mlx5dv_dr_action_create_dest_vport(domain, vport_id).
+ */
+ *action = mlx5_glue->dr_create_flow_action_dest_port(domain, port_id);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: push vlan.
+ *
+ * @param[in] domain
+ * Pointer to domain handler.
+ * @param[in] vlan_tag
+ * VLAN tag value.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_push_vlan(void *domain, rte_be32_t vlan_tag,
+ void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_push_vlan(domain, vlan_tag);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: count.
+ *
+ * @param[in] cnt_obj
+ * Pointer to DevX counter object.
+ * @param[in] offset
+ * Offset of counter in array.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_count(void *cnt_obj, uint16_t offset,
+ void **action)
+{
+ *action = mlx5_glue->dv_create_flow_action_counter(cnt_obj, offset);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: tag.
+ *
+ * @param[in] tag
+ * Tag value.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_tag(uint32_t tag, void **action)
+{
+ *action = mlx5_glue->dv_create_flow_action_tag(tag);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: drop.
+ *
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_drop(void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_drop();
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: default miss.
+ *
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_default_miss(void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_default_miss();
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: dest_devx_tir
+ *
+ * @param[in] tir
+ * Pointer to DevX tir object
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_flow_os_create_flow_action_dest_devx_tir(struct mlx5_devx_obj *tir,
+ void **action)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ *action = mlx5_glue->dv_create_flow_action_dest_devx_tir(tir->obj);
+ return (*action) ? 0 : -1;
+#else
+ /* If no DV support - skip the operation and return success */
+ RTE_SET_USED(tir);
+ *action = 0;
+ return 0;
+#endif
+}
+
+/**
+ * Create flow action: sampler
+ *
+ * @param[in] attr
+ * Pointer to sampler attribute
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_os_flow_dr_create_flow_action_sampler
+ (struct mlx5dv_dr_flow_sampler_attr *attr,
+ void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_sampler(attr);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Create flow action: dest_array
+ *
+ * @param[in] domain
+ * Pointer to relevant domain.
+ * @param[in] num_dest
+ * Number of destinations array.
+ * @param[in] dests
+ * Array of destination attributes.
+ * @param[out] action
+ * Pointer to a valid action on success, NULL otherwise.
+ *
+ * @return
+ * 0 on success, or -1 on failure and errno is set.
+ */
+static inline int
+mlx5_os_flow_dr_create_flow_action_dest_array
+ (void *domain,
+ size_t num_dest,
+ struct mlx5dv_dr_action_dest_attr *dests[],
+ void **action)
+{
+ *action = mlx5_glue->dr_create_flow_action_dest_array(
+ domain, num_dest, dests);
+ return (*action) ? 0 : -1;
+}
+
+/**
+ * Destroy flow action.
+ *
+ * @param[in] action
+ * Pointer to action object to destroy.
+ *
+ * @return
+ * 0 on success, or the value of errno on failure.
+ */
+static inline int
+mlx5_flow_os_destroy_flow_action(void *action)
+{
+ return mlx5_glue->destroy_flow_action(action);
+}
+
+/**
+ * OS wrapper over Verbs API.
+ * Adjust flow priority based on the highest layer and the request priority.
+ *
+ * @param[in] dev
+ * Pointer to the Ethernet device structure.
+ * @param[in] priority
+ * The rule base priority.
+ * @param[in] subpriority
+ * The priority based on the items.
+ *
+ * @return
+ * The new priority.
+ */
+static inline uint32_t
+mlx5_os_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
+ uint32_t subpriority)
+{
+ return mlx5_flow_adjust_priority(dev, priority, subpriority);
+}
+
+static inline int
+mlx5_os_flow_dr_sync_domain(void *domain, uint32_t flags)
+{
+ return mlx5_glue->dr_sync_domain(domain, flags);
+}
+#endif /* RTE_PMD_MLX5_FLOW_OS_H_ */
diff --git a/drivers/net/mlx5/freebsd/mlx5_mp_os.c b/drivers/net/mlx5/freebsd/mlx5_mp_os.c
new file mode 100644
index 0000000000..3a4aa766f8
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_mp_os.c
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 6WIND S.A.
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#include <stdio.h>
+#include <time.h>
+
+#include <rte_eal.h>
+#include <ethdev_driver.h>
+#include <rte_string_fns.h>
+
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rx.h"
+#include "mlx5_tx.h"
+#include "mlx5_utils.h"
+
+int
+mlx5_mp_os_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+ struct rte_mp_msg mp_res;
+ struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+ const struct mlx5_mp_param *param =
+ (const struct mlx5_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev;
+ struct mlx5_priv *priv;
+ struct mr_cache_entry entry;
+ uint32_t lkey;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (!rte_eth_dev_is_valid_port(param->port_id)) {
+ rte_errno = ENODEV;
+ DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+ return -rte_errno;
+ }
+ dev = &rte_eth_devices[param->port_id];
+ priv = dev->data->dev_private;
+ switch (param->type) {
+ case MLX5_MP_REQ_CREATE_MR:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ lkey = mlx5_mr_create_primary(priv->sh->pd,
+ &priv->sh->share_cache,
+ &entry, param->args.addr,
+ priv->config.mr_ext_memseg_en);
+ if (lkey == UINT32_MAX)
+ res->result = -rte_errno;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_VERBS_CMD_FD:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ mp_res.num_fds = 1;
+ mp_res.fds[0] = ((struct ibv_context *)priv->sh->ctx)->cmd_fd;
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_STATE_MODIFY:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_queue_state_modify_primary
+ (dev, ¶m->args.state_modify);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_RX_STOP:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_rx_queue_stop_primary
+ (dev, param->args.queue_id.queue_id);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_RX_START:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_rx_queue_start_primary
+ (dev, param->args.queue_id.queue_id);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_TX_STOP:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_tx_queue_stop_primary
+ (dev, param->args.queue_id.queue_id);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_QUEUE_TX_START:
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = mlx5_tx_queue_start_primary
+ (dev, param->args.queue_id.queue_id);
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ DRV_LOG(ERR, "port %u invalid mp request type",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
+
+/**
+ * IPC message handler of a secondary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] peer
+ * Pointer to the peer socket path.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_mp_os_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
+{
+struct rte_mp_msg mp_res;
+ struct mlx5_mp_param *res = (struct mlx5_mp_param *)mp_res.param;
+ const struct mlx5_mp_param *param =
+ (const struct mlx5_mp_param *)mp_msg->param;
+ struct rte_eth_dev *dev;
+ struct mlx5_proc_priv *ppriv;
+ struct mlx5_priv *priv;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ if (!rte_eth_dev_is_valid_port(param->port_id)) {
+ rte_errno = ENODEV;
+ DRV_LOG(ERR, "port %u invalid port ID", param->port_id);
+ return -rte_errno;
+ }
+ dev = &rte_eth_devices[param->port_id];
+ priv = dev->data->dev_private;
+ switch (param->type) {
+ case MLX5_MP_REQ_START_RXTX:
+ DRV_LOG(INFO, "port %u starting datapath", dev->data->port_id);
+ dev->rx_pkt_burst = mlx5_select_rx_function(dev);
+ dev->tx_pkt_burst = mlx5_select_tx_function(dev);
+ ppriv = (struct mlx5_proc_priv *)dev->process_private;
+ /* If Tx queue number changes, re-initialize UAR. */
+ if (ppriv->uar_table_sz != priv->txqs_n) {
+ mlx5_tx_uar_uninit_secondary(dev);
+ mlx5_proc_priv_uninit(dev);
+ ret = mlx5_proc_priv_init(dev);
+ if (ret)
+ return -rte_errno;
+ ret = mlx5_tx_uar_init_secondary(dev, mp_msg->fds[0]);
+ if (ret) {
+ mlx5_proc_priv_uninit(dev);
+ return -rte_errno;
+ }
+ }
+ rte_mb();
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ case MLX5_MP_REQ_STOP_RXTX:
+ DRV_LOG(INFO, "port %u stopping datapath", dev->data->port_id);
+ dev->rx_pkt_burst = removed_rx_burst;
+ dev->tx_pkt_burst = removed_tx_burst;
+ rte_mb();
+ mp_init_msg(&priv->mp_id, &mp_res, param->type);
+ res->result = 0;
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+ default:
+ rte_errno = EINVAL;
+ DRV_LOG(ERR, "port %u invalid mp request type",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ return ret;
+}
+
+/**
+ * Broadcast request of stopping/starting data-path to secondary processes.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param[in] type
+ * Request type.
+ */
+static void
+mp_req_on_rxtx(struct rte_eth_dev *dev, enum mlx5_mp_req_type type)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_msg *mp_res;
+ struct rte_mp_reply mp_rep;
+ struct mlx5_mp_param *res;
+ struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+ struct mlx5_priv *priv = dev->data->dev_private;
+ int ret;
+ int i;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (!mlx5_shared_data->secondary_cnt)
+ return;
+ if (type != MLX5_MP_REQ_START_RXTX && type != MLX5_MP_REQ_STOP_RXTX) {
+ DRV_LOG(ERR, "port %u unknown request (req_type %d)",
+ dev->data->port_id, type);
+ return;
+ }
+ mp_init_msg(&priv->mp_id, &mp_req, type);
+ if (type == MLX5_MP_REQ_START_RXTX) {
+ mp_req.num_fds = 1;
+ mp_req.fds[0] = ((struct ibv_context *)priv->sh->ctx)->cmd_fd;
+ }
+ ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+ if (ret) {
+ if (rte_errno != ENOTSUP)
+ DRV_LOG(ERR, "port %u failed to request stop/start Rx/Tx (%d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ if (mp_rep.nb_sent != mp_rep.nb_received) {
+ DRV_LOG(ERR,
+ "port %u not all secondaries responded (req_type %d)",
+ dev->data->port_id, type);
+ goto exit;
+ }
+ for (i = 0; i < mp_rep.nb_received; i++) {
+ mp_res = &mp_rep.msgs[i];
+ res = (struct mlx5_mp_param *)mp_res->param;
+ if (res->result) {
+ DRV_LOG(ERR, "port %u request failed on secondary #%d",
+ dev->data->port_id, i);
+ goto exit;
+ }
+ }
+exit:
+ mlx5_free(mp_rep.msgs);
+}
+
+/**
+ * Broadcast request of starting data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_os_req_start_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX5_MP_REQ_START_RXTX);
+}
+
+/**
+ * Broadcast request of stopping data-path to secondary processes. The request
+ * is synchronous.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ */
+void
+mlx5_mp_os_req_stop_rxtx(struct rte_eth_dev *dev)
+{
+ mp_req_on_rxtx(dev, MLX5_MP_REQ_STOP_RXTX);
+}
+
+/**
+ * Request Verbs Rx/Tx queue stop or start to the primary process.
+ *
+ * @param[in] dev
+ * Pointer to Ethernet structure.
+ * @param queue_id
+ * Queue ID to control.
+ * @param req_type
+ * request type
+ * MLX5_MP_REQ_QUEUE_RX_START - start Rx queue
+ * MLX5_MP_REQ_QUEUE_TX_START - stop Tx queue
+ * MLX5_MP_REQ_QUEUE_RX_STOP - stop Rx queue
+ * MLX5_MP_REQ_QUEUE_TX_STOP - stop Tx queue
+ * @return
+ * 0 on success, a negative errno value otherwise and
+ * rte_errno is set.
+ */
+int
+mlx5_mp_os_req_queue_control(struct rte_eth_dev *dev, uint16_t queue_id,
+ enum mlx5_mp_req_type req_type)
+{
+ struct rte_mp_msg mp_req;
+ struct rte_mp_msg *mp_res;
+ struct rte_mp_reply mp_rep;
+ struct mlx5_mp_param *req = (struct mlx5_mp_param *)mp_req.param;
+ struct mlx5_mp_param *res;
+ struct timespec ts = {.tv_sec = MLX5_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
+ struct mlx5_priv *priv;
+ int ret;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_SECONDARY);
+ priv = dev->data->dev_private;
+ mp_init_msg(&priv->mp_id, &mp_req, req_type);
+ req->args.queue_id.queue_id = queue_id;
+ ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
+ if (ret) {
+ DRV_LOG(ERR, "port %u request to primary process failed",
+ dev->data->port_id);
+ return -rte_errno;
+ }
+ MLX5_ASSERT(mp_rep.nb_received == 1);
+ mp_res = &mp_rep.msgs[0];
+ res = (struct mlx5_mp_param *)mp_res->param;
+ ret = res->result;
+ free(mp_rep.msgs);
+ return ret;
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_os.c b/drivers/net/mlx5/freebsd/mlx5_os.c
new file mode 100644
index 0000000000..3746057673
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_os.c
@@ -0,0 +1,3208 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <net/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/sockios.h>
+#include <linux/ethtool.h>
+#include <fcntl.h>
+
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <ethdev_pci.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_bus_auxiliary.h>
+#include <rte_common.h>
+#include <rte_kvargs.h>
+#include <rte_rwlock.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+#include <rte_alarm.h>
+#include <rte_eal_paging.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_devx_cmds.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mp.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_malloc.h>
+
+#include "mlx5_defs.h"
+#include "mlx5.h"
+#include "mlx5_common_os.h"
+#include "mlx5_utils.h"
+#include "mlx5_rxtx.h"
+#include "mlx5_rx.h"
+#include "mlx5_tx.h"
+#include "mlx5_autoconf.h"
+#include "mlx5_mr.h"
+#include "mlx5_flow.h"
+#include "rte_pmd_mlx5.h"
+#include "mlx5_verbs.h"
+#include "mlx5_nl.h"
+#include "mlx5_devx.h"
+
+#ifndef HAVE_IBV_MLX5_MOD_MPW
+#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
+#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
+#endif
+
+#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
+#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
+#endif
+
+static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";
+
+/* Spinlock for mlx5_shared_data allocation. */
+static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* Process local data for secondary processes. */
+static struct mlx5_local_data mlx5_local_data;
+
+/* rte flow indexed pool configuration. */
+static struct mlx5_indexed_pool_config icfg[] = {
+ {
+ .size = sizeof(struct rte_flow),
+ .trunk_size = 64,
+ .need_lock = 1,
+ .release_mem_en = 0,
+ .malloc = mlx5_malloc,
+ .free = mlx5_free,
+ .per_core_cache = 0,
+ .type = "ctl_flow_ipool",
+ },
+ {
+ .size = sizeof(struct rte_flow),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 1,
+ .release_mem_en = 0,
+ .malloc = mlx5_malloc,
+ .free = mlx5_free,
+ .per_core_cache = 1 << 14,
+ .type = "rte_flow_ipool",
+ },
+ {
+ .size = sizeof(struct rte_flow),
+ .trunk_size = 64,
+ .grow_trunk = 3,
+ .grow_shift = 2,
+ .need_lock = 1,
+ .release_mem_en = 0,
+ .malloc = mlx5_malloc,
+ .free = mlx5_free,
+ .per_core_cache = 0,
+ .type = "mcp_flow_ipool",
+ },
+};
+
+/**
+ * Set the completion channel file descriptor interrupt as non-blocking.
+ *
+ * @param[in] rxq_obj
+ * Pointer to RQ channel object, which includes the channel fd
+ *
+ * @param[out] fd
+ * The file descriptor (representing the intetrrupt) used in this channel.
+ *
+ * @return
+ * 0 on successfully setting the fd to non-blocking, non-zero otherwise.
+ */
+int
+mlx5_os_set_nonblock_channel_fd(int fd)
+{
+ int flags;
+
+ flags = fcntl(fd, F_GETFL);
+ return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
+}
+
+/**
+ * Get mlx5 device attributes. The glue function query_device_ex() is called
+ * with out parameter of type 'struct ibv_device_attr_ex *'. Then fill in mlx5
+ * device attributes from the glue out parameter.
+ *
+ * @param dev
+ * Pointer to ibv context.
+ *
+ * @param device_attr
+ * Pointer to mlx5 device attributes.
+ *
+ * @return
+ * 0 on success, non zero error number otherwise
+ */
+int
+mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
+{
+ int err;
+ struct ibv_device_attr_ex attr_ex;
+ memset(device_attr, 0, sizeof(*device_attr));
+ err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
+ if (err)
+ return err;
+
+ device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
+ device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
+ device_attr->max_sge = attr_ex.orig_attr.max_sge;
+ device_attr->max_cq = attr_ex.orig_attr.max_cq;
+ device_attr->max_cqe = attr_ex.orig_attr.max_cqe;
+ device_attr->max_mr = attr_ex.orig_attr.max_mr;
+ device_attr->max_pd = attr_ex.orig_attr.max_pd;
+ device_attr->max_qp = attr_ex.orig_attr.max_qp;
+ device_attr->max_srq = attr_ex.orig_attr.max_srq;
+ device_attr->max_srq_wr = attr_ex.orig_attr.max_srq_wr;
+ device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
+ device_attr->max_rwq_indirection_table_size =
+ attr_ex.rss_caps.max_rwq_indirection_table_size;
+ device_attr->max_tso = attr_ex.tso_caps.max_tso;
+ device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;
+
+ struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+ err = mlx5_glue->dv_query_device(ctx, &dv_attr);
+ if (err)
+ return err;
+
+ device_attr->flags = dv_attr.flags;
+ device_attr->comp_mask = dv_attr.comp_mask;
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+ device_attr->sw_parsing_offloads =
+ dv_attr.sw_parsing_caps.sw_parsing_offloads;
+#endif
+ device_attr->min_single_stride_log_num_of_bytes =
+ dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
+ device_attr->max_single_stride_log_num_of_bytes =
+ dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
+ device_attr->min_single_wqe_log_num_of_strides =
+ dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
+ device_attr->max_single_wqe_log_num_of_strides =
+ dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
+ device_attr->stride_supported_qpts =
+ dv_attr.striding_rq_caps.supported_qpts;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
+#endif
+ strlcpy(device_attr->fw_ver, attr_ex.orig_attr.fw_ver,
+ sizeof(device_attr->fw_ver));
+
+ return err;
+}
+
+/**
+ * Verbs callback to allocate a memory. This function should allocate the space
+ * according to the size provided residing inside a huge page.
+ * Please note that all allocation must respect the alignment from libmlx5
+ * (i.e. currently rte_mem_page_size()).
+ *
+ * @param[in] size
+ * The size in bytes of the memory to allocate.
+ * @param[in] data
+ * A pointer to the callback data.
+ *
+ * @return
+ * Allocated buffer, NULL otherwise and rte_errno is set.
+ */
+static void *
+mlx5_alloc_verbs_buf(size_t size, void *data)
+{
+ struct mlx5_dev_ctx_shared *sh = data;
+ void *ret;
+ size_t alignment = rte_mem_page_size();
+ if (alignment == (size_t)-1) {
+ DRV_LOG(ERR, "Failed to get mem page size");
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+
+ MLX5_ASSERT(data != NULL);
+ ret = mlx5_malloc(0, size, alignment, sh->numa_node);
+ if (!ret && size)
+ rte_errno = ENOMEM;
+ return ret;
+}
+
+/**
+ * Detect misc5 support or not
+ *
+ * @param[in] priv
+ * Device private data pointer
+ */
+#ifdef HAVE_MLX5DV_DR
+static void
+__mlx5_discovery_misc5_cap(struct mlx5_priv *priv)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /* Dummy VxLAN matcher to detect rdma-core misc5 cap
+ * Case: IPv4--->UDP--->VxLAN--->vni
+ */
+ void *tbl;
+ struct mlx5_flow_dv_match_params matcher_mask;
+ void *match_m;
+ void *matcher;
+ void *headers_m;
+ void *misc5_m;
+ uint32_t *tunnel_header_m;
+ struct mlx5dv_flow_matcher_attr dv_attr;
+
+ memset(&matcher_mask, 0, sizeof(matcher_mask));
+ matcher_mask.size = sizeof(matcher_mask.buf);
+ match_m = matcher_mask.buf;
+ headers_m = MLX5_ADDR_OF(fte_match_param, match_m, outer_headers);
+ misc5_m = MLX5_ADDR_OF(fte_match_param,
+ match_m, misc_parameters_5);
+ tunnel_header_m = (uint32_t *)
+ MLX5_ADDR_OF(fte_match_set_misc5,
+ misc5_m, tunnel_header_1);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_protocol, 0xff);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 4);
+ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xffff);
+ *tunnel_header_m = 0xffffff;
+
+ tbl = mlx5_glue->dr_create_flow_tbl(priv->sh->rx_domain, 1);
+ if (!tbl) {
+ DRV_LOG(INFO, "No SW steering support");
+ return;
+ }
+ dv_attr.type = IBV_FLOW_ATTR_NORMAL,
+ dv_attr.match_mask = (void *)&matcher_mask,
+ dv_attr.match_criteria_enable =
+ (1 << MLX5_MATCH_CRITERIA_ENABLE_OUTER_BIT) |
+ (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC5_BIT);
+ dv_attr.priority = 3;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ void *misc2_m;
+ if (priv->config.dv_esw_en) {
+ /* FDB enabled reg_c_0 */
+ dv_attr.match_criteria_enable |=
+ (1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT);
+ misc2_m = MLX5_ADDR_OF(fte_match_param,
+ match_m, misc_parameters_2);
+ MLX5_SET(fte_match_set_misc2, misc2_m,
+ metadata_reg_c_0, 0xffff);
+ }
+#endif
+ matcher = mlx5_glue->dv_create_flow_matcher(priv->sh->ctx,
+ &dv_attr, tbl);
+ if (matcher) {
+ priv->sh->misc5_cap = 1;
+ mlx5_glue->dv_destroy_flow_matcher(matcher);
+ }
+ mlx5_glue->dr_destroy_flow_tbl(tbl);
+#else
+ RTE_SET_USED(priv);
+#endif
+}
+#endif
+
+/**
+ * Verbs callback to free a memory.
+ *
+ * @param[in] ptr
+ * A pointer to the memory to free.
+ * @param[in] data
+ * A pointer to the callback data.
+ */
+static void
+mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+ MLX5_ASSERT(data != NULL);
+ mlx5_free(ptr);
+}
+
+/**
+ * Initialize DR related data within private structure.
+ * Routine checks the reference counter and does actual
+ * resources creation/initialization only if counter is zero.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ *
+ * @return
+ * Zero on success, positive error code otherwise.
+ */
+static int
+mlx5_alloc_shared_dr(struct mlx5_priv *priv)
+{
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+ char s[MLX5_NAME_SIZE] __rte_unused;
+ int err;
+
+ MLX5_ASSERT(sh && sh->refcnt);
+ if (sh->refcnt > 1)
+ return 0;
+ err = mlx5_alloc_table_hash_list(priv);
+ if (err)
+ goto error;
+ /* The resources below are only valid with DV support. */
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /* Init port id action list. */
+ snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name);
+ sh->port_id_action_list = mlx5_list_create(s, sh, true,
+ flow_dv_port_id_create_cb,
+ flow_dv_port_id_match_cb,
+ flow_dv_port_id_remove_cb,
+ flow_dv_port_id_clone_cb,
+ flow_dv_port_id_clone_free_cb);
+ if (!sh->port_id_action_list)
+ goto error;
+ /* Init push vlan action list. */
+ snprintf(s, sizeof(s), "%s_push_vlan_action_list", sh->ibdev_name);
+ sh->push_vlan_action_list = mlx5_list_create(s, sh, true,
+ flow_dv_push_vlan_create_cb,
+ flow_dv_push_vlan_match_cb,
+ flow_dv_push_vlan_remove_cb,
+ flow_dv_push_vlan_clone_cb,
+ flow_dv_push_vlan_clone_free_cb);
+ if (!sh->push_vlan_action_list)
+ goto error;
+ /* Init sample action list. */
+ snprintf(s, sizeof(s), "%s_sample_action_list", sh->ibdev_name);
+ sh->sample_action_list = mlx5_list_create(s, sh, true,
+ flow_dv_sample_create_cb,
+ flow_dv_sample_match_cb,
+ flow_dv_sample_remove_cb,
+ flow_dv_sample_clone_cb,
+ flow_dv_sample_clone_free_cb);
+ if (!sh->sample_action_list)
+ goto error;
+ /* Init dest array action list. */
+ snprintf(s, sizeof(s), "%s_dest_array_list", sh->ibdev_name);
+ sh->dest_array_list = mlx5_list_create(s, sh, true,
+ flow_dv_dest_array_create_cb,
+ flow_dv_dest_array_match_cb,
+ flow_dv_dest_array_remove_cb,
+ flow_dv_dest_array_clone_cb,
+ flow_dv_dest_array_clone_free_cb);
+ if (!sh->dest_array_list)
+ goto error;
+#endif
+#ifdef HAVE_MLX5DV_DR
+ void *domain;
+
+ /* Reference counter is zero, we should initialize structures. */
+ domain = mlx5_glue->dr_create_domain(sh->ctx,
+ MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
+ if (!domain) {
+ DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ sh->rx_domain = domain;
+ domain = mlx5_glue->dr_create_domain(sh->ctx,
+ MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
+ if (!domain) {
+ DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ sh->tx_domain = domain;
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (priv->config.dv_esw_en) {
+ domain = mlx5_glue->dr_create_domain
+ (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
+ if (!domain) {
+ DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
+ err = errno;
+ goto error;
+ }
+ sh->fdb_domain = domain;
+ }
+ /*
+ * The drop action is just some dummy placeholder in rdma-core. It
+ * does not belong to domains and has no any attributes, and, can be
+ * shared by the entire device.
+ */
+ sh->dr_drop_action = mlx5_glue->dr_create_flow_action_drop();
+ if (!sh->dr_drop_action) {
+ DRV_LOG(ERR, "FDB mlx5dv_dr_create_flow_action_drop");
+ err = errno;
+ goto error;
+ }
+#endif
+ if (!sh->tunnel_hub && priv->config.dv_miss_info)
+ err = mlx5_alloc_tunnel_hub(sh);
+ if (err) {
+ DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
+ goto error;
+ }
+ if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
+ mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
+ mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
+ if (sh->fdb_domain)
+ mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
+ }
+ sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
+ if (!priv->config.allow_duplicate_pattern) {
+#ifndef HAVE_MLX5_DR_ALLOW_DUPLICATE
+ DRV_LOG(WARNING, "Disallow duplicate pattern is not supported - maybe old rdma-core version?");
+#endif
+ mlx5_glue->dr_allow_duplicate_rules(sh->rx_domain, 0);
+ mlx5_glue->dr_allow_duplicate_rules(sh->tx_domain, 0);
+ if (sh->fdb_domain)
+ mlx5_glue->dr_allow_duplicate_rules(sh->fdb_domain, 0);
+ }
+
+ __mlx5_discovery_misc5_cap(priv);
+#endif /* HAVE_MLX5DV_DR */
+ sh->default_miss_action =
+ mlx5_glue->dr_create_flow_action_default_miss();
+ if (!sh->default_miss_action)
+ DRV_LOG(WARNING, "Default miss action is not supported.");
+ return 0;
+error:
+ /* Rollback the created objects. */
+ if (sh->rx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->rx_domain);
+ sh->rx_domain = NULL;
+ }
+ if (sh->tx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->tx_domain);
+ sh->tx_domain = NULL;
+ }
+ if (sh->fdb_domain) {
+ mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+ sh->fdb_domain = NULL;
+ }
+ if (sh->dr_drop_action) {
+ mlx5_glue->destroy_flow_action(sh->dr_drop_action);
+ sh->dr_drop_action = NULL;
+ }
+ if (sh->pop_vlan_action) {
+ mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+ sh->pop_vlan_action = NULL;
+ }
+ if (sh->encaps_decaps) {
+ mlx5_hlist_destroy(sh->encaps_decaps);
+ sh->encaps_decaps = NULL;
+ }
+ if (sh->modify_cmds) {
+ mlx5_hlist_destroy(sh->modify_cmds);
+ sh->modify_cmds = NULL;
+ }
+ if (sh->tag_table) {
+ /* tags should be destroyed with flow before. */
+ mlx5_hlist_destroy(sh->tag_table);
+ sh->tag_table = NULL;
+ }
+ if (sh->tunnel_hub) {
+ mlx5_release_tunnel_hub(sh, priv->dev_port);
+ sh->tunnel_hub = NULL;
+ }
+ mlx5_free_table_hash_list(priv);
+ if (sh->port_id_action_list) {
+ mlx5_list_destroy(sh->port_id_action_list);
+ sh->port_id_action_list = NULL;
+ }
+ if (sh->push_vlan_action_list) {
+ mlx5_list_destroy(sh->push_vlan_action_list);
+ sh->push_vlan_action_list = NULL;
+ }
+ if (sh->sample_action_list) {
+ mlx5_list_destroy(sh->sample_action_list);
+ sh->sample_action_list = NULL;
+ }
+ if (sh->dest_array_list) {
+ mlx5_list_destroy(sh->dest_array_list);
+ sh->dest_array_list = NULL;
+ }
+ return err;
+}
+
+/**
+ * Destroy DR related data within private structure.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ */
+void
+mlx5_os_free_shared_dr(struct mlx5_priv *priv)
+{
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+
+ MLX5_ASSERT(sh && sh->refcnt);
+ if (sh->refcnt > 1)
+ return;
+#ifdef HAVE_MLX5DV_DR
+ if (sh->rx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->rx_domain);
+ sh->rx_domain = NULL;
+ }
+ if (sh->tx_domain) {
+ mlx5_glue->dr_destroy_domain(sh->tx_domain);
+ sh->tx_domain = NULL;
+ }
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (sh->fdb_domain) {
+ mlx5_glue->dr_destroy_domain(sh->fdb_domain);
+ sh->fdb_domain = NULL;
+ }
+ if (sh->dr_drop_action) {
+ mlx5_glue->destroy_flow_action(sh->dr_drop_action);
+ sh->dr_drop_action = NULL;
+ }
+#endif
+ if (sh->pop_vlan_action) {
+ mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
+ sh->pop_vlan_action = NULL;
+ }
+#endif /* HAVE_MLX5DV_DR */
+ if (sh->default_miss_action)
+ mlx5_glue->destroy_flow_action
+ (sh->default_miss_action);
+ if (sh->encaps_decaps) {
+ mlx5_hlist_destroy(sh->encaps_decaps);
+ sh->encaps_decaps = NULL;
+ }
+ if (sh->modify_cmds) {
+ mlx5_hlist_destroy(sh->modify_cmds);
+ sh->modify_cmds = NULL;
+ }
+ if (sh->tag_table) {
+ /* tags should be destroyed with flow before. */
+ mlx5_hlist_destroy(sh->tag_table);
+ sh->tag_table = NULL;
+ }
+ if (sh->tunnel_hub) {
+ mlx5_release_tunnel_hub(sh, priv->dev_port);
+ sh->tunnel_hub = NULL;
+ }
+ mlx5_free_table_hash_list(priv);
+ if (sh->port_id_action_list) {
+ mlx5_list_destroy(sh->port_id_action_list);
+ sh->port_id_action_list = NULL;
+ }
+ if (sh->push_vlan_action_list) {
+ mlx5_list_destroy(sh->push_vlan_action_list);
+ sh->push_vlan_action_list = NULL;
+ }
+ if (sh->sample_action_list) {
+ mlx5_list_destroy(sh->sample_action_list);
+ sh->sample_action_list = NULL;
+ }
+ if (sh->dest_array_list) {
+ mlx5_list_destroy(sh->dest_array_list);
+ sh->dest_array_list = NULL;
+ }
+}
+
+/**
+ * Initialize shared data between primary and secondary process.
+ *
+ * A memzone is reserved by primary process and secondary processes attach to
+ * the memzone.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_shared_data(void)
+{
+ const struct rte_memzone *mz;
+ int ret = 0;
+
+ rte_spinlock_lock(&mlx5_shared_data_lock);
+ if (mlx5_shared_data == NULL) {
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* Allocate shared memory. */
+ mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
+ sizeof(*mlx5_shared_data),
+ SOCKET_ID_ANY, 0);
+ if (mz == NULL) {
+ DRV_LOG(ERR,
+ "Cannot allocate mlx5 shared data");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx5_shared_data = mz->addr;
+ memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
+ rte_spinlock_init(&mlx5_shared_data->lock);
+ } else {
+ /* Lookup allocated shared memory. */
+ mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
+ if (mz == NULL) {
+ DRV_LOG(ERR,
+ "Cannot attach mlx5 shared data");
+ ret = -rte_errno;
+ goto error;
+ }
+ mlx5_shared_data = mz->addr;
+ memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
+ }
+ }
+error:
+ rte_spinlock_unlock(&mlx5_shared_data_lock);
+ return ret;
+}
+
+/**
+ * PMD global initialization.
+ *
+ * Independent from individual device, this function initializes global
+ * per-PMD data structures distinguishing primary and secondary processes.
+ * Hence, each initialization is called once per a process.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_init_once(void)
+{
+ struct mlx5_shared_data *sd;
+ struct mlx5_local_data *ld = &mlx5_local_data;
+ int ret = 0;
+
+ if (mlx5_init_shared_data())
+ return -rte_errno;
+ sd = mlx5_shared_data;
+ MLX5_ASSERT(sd);
+ rte_spinlock_lock(&sd->lock);
+ switch (rte_eal_process_type()) {
+ case RTE_PROC_PRIMARY:
+ if (sd->init_done)
+ break;
+ LIST_INIT(&sd->mem_event_cb_list);
+ rte_rwlock_init(&sd->mem_event_rwlock);
+ rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
+ mlx5_mr_mem_event_cb, NULL);
+ ret = mlx5_mp_init_primary(MLX5_MP_NAME,
+ mlx5_mp_os_primary_handle);
+ if (ret)
+ goto out;
+ sd->init_done = true;
+ break;
+ case RTE_PROC_SECONDARY:
+ if (ld->init_done)
+ break;
+ ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
+ mlx5_mp_os_secondary_handle);
+ if (ret)
+ goto out;
+ ++sd->secondary_cnt;
+ ld->init_done = true;
+ break;
+ default:
+ break;
+ }
+out:
+ rte_spinlock_unlock(&sd->lock);
+ return ret;
+}
+
+/**
+ * Create the Tx queue DevX/Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq_data, struct mlx5_txq_ctrl, txq);
+
+ if (txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN)
+ return mlx5_txq_devx_obj_new(dev, idx);
+#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
+ if (!priv->config.dv_esw_en)
+ return mlx5_txq_devx_obj_new(dev, idx);
+#endif
+ return mlx5_txq_ibv_obj_new(dev, idx);
+}
+
+/**
+ * Release an Tx DevX/verbs queue object.
+ *
+ * @param txq_obj
+ * DevX/Verbs Tx queue object.
+ */
+static void
+mlx5_os_txq_obj_release(struct mlx5_txq_obj *txq_obj)
+{
+ if (txq_obj->txq_ctrl->type == MLX5_TXQ_TYPE_HAIRPIN) {
+ mlx5_txq_devx_obj_release(txq_obj);
+ return;
+ }
+#ifdef HAVE_MLX5DV_DEVX_UAR_OFFSET
+ if (!txq_obj->txq_ctrl->priv->config.dv_esw_en) {
+ mlx5_txq_devx_obj_release(txq_obj);
+ return;
+ }
+#endif
+ mlx5_txq_ibv_obj_release(txq_obj);
+}
+
+/**
+ * DV flow counter mode detect and config.
+ *
+ * @param dev
+ * Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_counter_mode_config(struct rte_eth_dev *dev __rte_unused)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+ bool fallback;
+
+#ifndef HAVE_IBV_DEVX_ASYNC
+ fallback = true;
+#else
+ fallback = false;
+ if (!priv->config.devx || !priv->config.dv_flow_en ||
+ !priv->config.hca_attr.flow_counters_dump ||
+ !(priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) ||
+ (mlx5_flow_dv_discover_counter_offset_support(dev) == -ENOTSUP))
+ fallback = true;
+#endif
+ if (fallback)
+ DRV_LOG(INFO, "Use fall-back DV counter management. Flow "
+ "counter dump:%d, bulk_alloc_bitmap:0x%hhx.",
+ priv->config.hca_attr.flow_counters_dump,
+ priv->config.hca_attr.flow_counter_bulk_alloc_bitmap);
+ /* Initialize fallback mode only on the port initializes sh. */
+ if (sh->refcnt == 1)
+ sh->cmng.counter_fallback = fallback;
+ else if (fallback != sh->cmng.counter_fallback)
+ DRV_LOG(WARNING, "Port %d in sh has different fallback mode "
+ "with others:%d.", PORT_ID(priv), fallback);
+#endif
+}
+
+/**
+ * DR flow drop action support detect.
+ *
+ * @param dev
+ * Pointer to rte_eth_dev structure.
+ *
+ */
+static void
+mlx5_flow_drop_action_config(struct rte_eth_dev *dev __rte_unused)
+{
+#ifdef HAVE_MLX5DV_DR
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (!priv->config.dv_flow_en || !priv->sh->dr_drop_action)
+ return;
+ /**
+ * DR supports drop action placeholder when it is supported;
+ * otherwise, use the queue drop action.
+ */
+ if (mlx5_flow_discover_dr_action_support(dev))
+ priv->root_drop_action = priv->drop_queue.hrxq->action;
+ else
+ priv->root_drop_action = priv->sh->dr_drop_action;
+#endif
+}
+
+static void
+mlx5_queue_counter_id_prepare(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ void *ctx = priv->sh->ctx;
+
+ priv->q_counters = mlx5_devx_cmd_queue_counter_alloc(ctx);
+ if (!priv->q_counters) {
+ struct ibv_cq *cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
+ struct ibv_wq *wq;
+
+ DRV_LOG(DEBUG, "Port %d queue counter object cannot be created "
+ "by DevX - fall-back to use the kernel driver global "
+ "queue counter.", dev->data->port_id);
+ /* Create WQ by kernel and query its queue counter ID. */
+ if (cq) {
+ wq = mlx5_glue->create_wq(ctx,
+ &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = priv->sh->pd,
+ .cq = cq,
+ });
+ if (wq) {
+ /* Counter is assigned only on RDY state. */
+ int ret = mlx5_glue->modify_wq(wq,
+ &(struct ibv_wq_attr){
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ .wq_state = IBV_WQS_RDY,
+ });
+
+ if (ret == 0)
+ mlx5_devx_cmd_wq_query(wq,
+ &priv->counter_set_id);
+ claim_zero(mlx5_glue->destroy_wq(wq));
+ }
+ claim_zero(mlx5_glue->destroy_cq(cq));
+ }
+ } else {
+ priv->counter_set_id = priv->q_counters->id;
+ }
+ if (priv->counter_set_id == 0)
+ DRV_LOG(INFO, "Part of the port %d statistics will not be "
+ "available.", dev->data->port_id);
+}
+
+/**
+ * Check if representor spawn info match devargs.
+ *
+ * @param spawn
+ * Verbs device parameters (name, port, switch_info) to spawn.
+ * @param eth_da
+ * Device devargs to probe.
+ *
+ * @return
+ * Match result.
+ */
+static bool
+mlx5_representor_match(struct mlx5_dev_spawn_data *spawn,
+ struct rte_eth_devargs *eth_da)
+{
+ struct mlx5_switch_info *switch_info = &spawn->info;
+ unsigned int p, f;
+ uint16_t id;
+ uint16_t repr_id = mlx5_representor_id_encode(switch_info,
+ eth_da->type);
+
+ switch (eth_da->type) {
+ case RTE_ETH_REPRESENTOR_SF:
+ if (!(spawn->info.port_name == -1 &&
+ switch_info->name_type ==
+ MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
+ switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFSF) {
+ rte_errno = EBUSY;
+ return false;
+ }
+ break;
+ case RTE_ETH_REPRESENTOR_VF:
+ /* Allows HPF representor index -1 as exception. */
+ if (!(spawn->info.port_name == -1 &&
+ switch_info->name_type ==
+ MLX5_PHYS_PORT_NAME_TYPE_PFHPF) &&
+ switch_info->name_type != MLX5_PHYS_PORT_NAME_TYPE_PFVF) {
+ rte_errno = EBUSY;
+ return false;
+ }
+ break;
+ case RTE_ETH_REPRESENTOR_NONE:
+ rte_errno = EBUSY;
+ return false;
+ default:
+ rte_errno = ENOTSUP;
+ DRV_LOG(ERR, "unsupported representor type");
+ return false;
+ }
+ /* Check representor ID: */
+ for (p = 0; p < eth_da->nb_ports; ++p) {
+ if (spawn->pf_bond < 0) {
+ /* For non-LAG mode, allow and ignore pf. */
+ switch_info->pf_num = eth_da->ports[p];
+ repr_id = mlx5_representor_id_encode(switch_info,
+ eth_da->type);
+ }
+ for (f = 0; f < eth_da->nb_representor_ports; ++f) {
+ id = MLX5_REPRESENTOR_ID
+ (eth_da->ports[p], eth_da->type,
+ eth_da->representor_ports[f]);
+ if (repr_id == id)
+ return true;
+ }
+ }
+ rte_errno = EBUSY;
+ return false;
+}
+
+
+/**
+ * Spawn an Ethernet device from Verbs information.
+ *
+ * @param dpdk_dev
+ * Backing DPDK device.
+ * @param spawn
+ * Verbs device parameters (name, port, switch_info) to spawn.
+ * @param config
+ * Device configuration parameters.
+ * @param config
+ * Device arguments.
+ *
+ * @return
+ * A valid Ethernet device object on success, NULL otherwise and rte_errno
+ * is set. The following errors are defined:
+ *
+ * EBUSY: device is not supposed to be spawned.
+ * EEXIST: device is already spawned
+ */
+static struct rte_eth_dev *
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+ struct mlx5_dev_spawn_data *spawn,
+ struct mlx5_dev_config *config,
+ struct rte_eth_devargs *eth_da)
+{
+ const struct mlx5_switch_info *switch_info = &spawn->info;
+ struct mlx5_dev_ctx_shared *sh = NULL;
+ struct ibv_port_attr port_attr;
+ struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+ struct rte_eth_dev *eth_dev = NULL;
+ struct mlx5_priv *priv = NULL;
+ int err = 0;
+ unsigned int hw_padding = 0;
+ unsigned int mps;
+ unsigned int tunnel_en = 0;
+ unsigned int mpls_en = 0;
+ unsigned int swp = 0;
+ unsigned int mprq = 0;
+ unsigned int mprq_min_stride_size_n = 0;
+ unsigned int mprq_max_stride_size_n = 0;
+ unsigned int mprq_min_stride_num_n = 0;
+ unsigned int mprq_max_stride_num_n = 0;
+ struct rte_ether_addr mac;
+ char name[RTE_ETH_NAME_MAX_LEN];
+ int own_domain_id = 0;
+ uint16_t port_id;
+ struct mlx5_port_info vport_info = { .query_flags = 0 };
+ int i;
+
+ /* Determine if this port representor is supposed to be spawned. */
+ if (switch_info->representor && dpdk_dev->devargs &&
+ !mlx5_representor_match(spawn, eth_da))
+ return NULL;
+ /* Build device name. */
+ if (spawn->pf_bond < 0) {
+ /* Single device. */
+ if (!switch_info->representor)
+ strlcpy(name, dpdk_dev->name, sizeof(name));
+ else
+ err = snprintf(name, sizeof(name), "%s_representor_%s%u",
+ dpdk_dev->name,
+ switch_info->name_type ==
+ MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
+ switch_info->port_name);
+ } else {
+ /* Bonding device. */
+ if (!switch_info->representor) {
+ err = snprintf(name, sizeof(name), "%s_%s",
+ dpdk_dev->name,
+ mlx5_os_get_dev_device_name(spawn->phys_dev));
+ } else {
+ err = snprintf(name, sizeof(name), "%s_%s_representor_c%dpf%d%s%u",
+ dpdk_dev->name,
+ mlx5_os_get_dev_device_name(spawn->phys_dev),
+ switch_info->ctrl_num,
+ switch_info->pf_num,
+ switch_info->name_type ==
+ MLX5_PHYS_PORT_NAME_TYPE_PFSF ? "sf" : "vf",
+ switch_info->port_name);
+ }
+ }
+ if (err >= (int)sizeof(name))
+ DRV_LOG(WARNING, "device name overflow %s", name);
+ /* check if the device is already spawned */
+ if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
+ rte_errno = EEXIST;
+ return NULL;
+ }
+ DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ struct mlx5_mp_id mp_id;
+
+ eth_dev = rte_eth_dev_attach_secondary(name);
+ if (eth_dev == NULL) {
+ DRV_LOG(ERR, "can not attach rte ethdev");
+ rte_errno = ENOMEM;
+ return NULL;
+ }
+ eth_dev->device = dpdk_dev;
+ eth_dev->dev_ops = &mlx5_dev_sec_ops;
+ eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
+ eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
+ err = mlx5_proc_priv_init(eth_dev);
+ if (err)
+ return NULL;
+ mp_id.port_id = eth_dev->data->port_id;
+ strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+ /* Receive command fd from primary process */
+ err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
+ if (err < 0)
+ goto err_secondary;
+ /* Remap UAR for Tx queues. */
+ err = mlx5_tx_uar_init_secondary(eth_dev, err);
+ if (err)
+ goto err_secondary;
+ /*
+ * Ethdev pointer is still required as input since
+ * the primary device is not accessible from the
+ * secondary process.
+ */
+ eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
+ eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
+ return eth_dev;
+err_secondary:
+ mlx5_dev_close(eth_dev);
+ return NULL;
+ }
+ /*
+ * Some parameters ("tx_db_nc" in particularly) are needed in
+ * advance to create dv/verbs device context. We proceed the
+ * devargs here to get ones, and later proceed devargs again
+ * to override some hardware settings.
+ */
+ err = mlx5_args(config, dpdk_dev->devargs);
+ if (err) {
+ err = rte_errno;
+ DRV_LOG(ERR, "failed to process device arguments: %s",
+ strerror(rte_errno));
+ goto error;
+ }
+ if (config->dv_miss_info) {
+ if (switch_info->master || switch_info->representor)
+ config->dv_xmeta_en = MLX5_XMETA_MODE_META16;
+ }
+ mlx5_malloc_mem_select(config->sys_mem_en);
+ sh = mlx5_alloc_shared_dev_ctx(spawn, config);
+ if (!sh)
+ return NULL;
+ config->devx = sh->devx;
+#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
+ config->dest_tir = 1;
+#endif
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
+#endif
+ /*
+ * Multi-packet send is supported by ConnectX-4 Lx PF as well
+ * as all ConnectX-5 devices.
+ */
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
+#endif
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
+#endif
+ mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
+ if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
+ if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
+ DRV_LOG(DEBUG, "enhanced MPW is supported");
+ mps = MLX5_MPW_ENHANCED;
+ } else {
+ DRV_LOG(DEBUG, "MPW is supported");
+ mps = MLX5_MPW;
+ }
+ } else {
+ DRV_LOG(DEBUG, "MPW isn't supported");
+ mps = MLX5_MPW_DISABLED;
+ }
+#ifdef HAVE_IBV_MLX5_MOD_SWP
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
+ swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
+ DRV_LOG(DEBUG, "SWP support: %u", swp);
+#endif
+ config->swp = !!swp;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
+ struct mlx5dv_striding_rq_caps mprq_caps =
+ dv_attr.striding_rq_caps;
+
+ DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
+ mprq_caps.min_single_stride_log_num_of_bytes);
+ DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
+ mprq_caps.max_single_stride_log_num_of_bytes);
+ DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
+ mprq_caps.min_single_wqe_log_num_of_strides);
+ DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
+ mprq_caps.max_single_wqe_log_num_of_strides);
+ DRV_LOG(DEBUG, "\tsupported_qpts: %d",
+ mprq_caps.supported_qpts);
+ DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
+ mprq = 1;
+ mprq_min_stride_size_n =
+ mprq_caps.min_single_stride_log_num_of_bytes;
+ mprq_max_stride_size_n =
+ mprq_caps.max_single_stride_log_num_of_bytes;
+ mprq_min_stride_num_n =
+ mprq_caps.min_single_wqe_log_num_of_strides;
+ mprq_max_stride_num_n =
+ mprq_caps.max_single_wqe_log_num_of_strides;
+ }
+#endif
+ /* Rx CQE compression is enabled by default. */
+ config->cqe_comp = 1;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
+ tunnel_en = ((dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
+ }
+ DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
+ tunnel_en ? "" : "not ");
+#else
+ DRV_LOG(WARNING,
+ "tunnel offloading disabled due to old OFED/rdma-core version");
+#endif
+ config->tunnel_en = tunnel_en;
+#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
+ mpls_en = ((dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
+ (dv_attr.tunnel_offloads_caps &
+ MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
+ DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
+ mpls_en ? "" : "not ");
+#else
+ DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
+ " old OFED/rdma-core version or firmware configuration");
+#endif
+ config->mpls_en = mpls_en;
+ /* Check port status. */
+ err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
+ if (err) {
+ DRV_LOG(ERR, "port query failed: %s", strerror(err));
+ goto error;
+ }
+ if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
+ DRV_LOG(ERR, "port is not configured in Ethernet mode");
+ err = EINVAL;
+ goto error;
+ }
+ if (port_attr.state != IBV_PORT_ACTIVE)
+ DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
+ mlx5_glue->port_state_str(port_attr.state),
+ port_attr.state);
+ /* Allocate private eth device data. */
+ priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
+ sizeof(*priv),
+ RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+ if (priv == NULL) {
+ DRV_LOG(ERR, "priv allocation failure");
+ err = ENOMEM;
+ goto error;
+ }
+ priv->sh = sh;
+ priv->dev_port = spawn->phys_port;
+ priv->pci_dev = spawn->pci_dev;
+ priv->mtu = RTE_ETHER_MTU;
+ /* Some internal functions rely on Netlink sockets, open them now. */
+ priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
+ priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
+ priv->representor = !!switch_info->representor;
+ priv->master = !!switch_info->master;
+ priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
+ priv->vport_meta_tag = 0;
+ priv->vport_meta_mask = 0;
+ priv->pf_bond = spawn->pf_bond;
+
+ DRV_LOG(DEBUG,
+ "dev_port=%u bus=%s pci=%s master=%d representor=%d pf_bond=%d\n",
+ priv->dev_port, dpdk_dev->bus->name,
+ priv->pci_dev ? priv->pci_dev->name : "NONE",
+ priv->master, priv->representor, priv->pf_bond);
+
+ /*
+ * If we have E-Switch we should determine the vport attributes.
+ * E-Switch may use either source vport field or reg_c[0] metadata
+ * register to match on vport index. The engaged part of metadata
+ * register is defined by mask.
+ */
+ if (switch_info->representor || switch_info->master) {
+ err = mlx5_glue->devx_port_query(sh->ctx,
+ spawn->phys_port,
+ &vport_info);
+ if (err) {
+ DRV_LOG(WARNING,
+ "can't query devx port %d on device %s",
+ spawn->phys_port,
+ mlx5_os_get_dev_device_name(spawn->phys_dev));
+ vport_info.query_flags = 0;
+ }
+ }
+ if (vport_info.query_flags & MLX5_PORT_QUERY_REG_C0) {
+ priv->vport_meta_tag = vport_info.vport_meta_tag;
+ priv->vport_meta_mask = vport_info.vport_meta_mask;
+ if (!priv->vport_meta_mask) {
+ DRV_LOG(ERR, "vport zero mask for port %d"
+ " on bonding device %s",
+ spawn->phys_port,
+ mlx5_os_get_dev_device_name
+ (spawn->phys_dev));
+ err = ENOTSUP;
+ goto error;
+ }
+ if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
+ DRV_LOG(ERR, "invalid vport tag for port %d"
+ " on bonding device %s",
+ spawn->phys_port,
+ mlx5_os_get_dev_device_name
+ (spawn->phys_dev));
+ err = ENOTSUP;
+ goto error;
+ }
+ }
+ if (vport_info.query_flags & MLX5_PORT_QUERY_VPORT) {
+ priv->vport_id = vport_info.vport_id;
+ } else if (spawn->pf_bond >= 0 &&
+ (switch_info->representor || switch_info->master)) {
+ DRV_LOG(ERR, "can't deduce vport index for port %d"
+ " on bonding device %s",
+ spawn->phys_port,
+ mlx5_os_get_dev_device_name(spawn->phys_dev));
+ err = ENOTSUP;
+ goto error;
+ } else {
+ /*
+ * Suppose vport index in compatible way. Kernel/rdma_core
+ * support single E-Switch per PF configurations only and
+ * vport_id field contains the vport index for associated VF,
+ * which is deduced from representor port name.
+ * For example, let's have the IB device port 10, it has
+ * attached network device eth0, which has port name attribute
+ * pf0vf2, we can deduce the VF number as 2, and set vport index
+ * as 3 (2+1). This assigning schema should be changed if the
+ * multiple E-Switch instances per PF configurations or/and PCI
+ * subfunctions are added.
+ */
+ priv->vport_id = switch_info->representor ?
+ switch_info->port_name + 1 : -1;
+ }
+ priv->representor_id = mlx5_representor_id_encode(switch_info,
+ eth_da->type);
+ /*
+ * Look for sibling devices in order to reuse their switch domain
+ * if any, otherwise allocate one.
+ */
+ MLX5_ETH_FOREACH_DEV(port_id, dpdk_dev) {
+ const struct mlx5_priv *opriv =
+ rte_eth_devices[port_id].data->dev_private;
+
+ if (!opriv ||
+ opriv->sh != priv->sh ||
+ opriv->domain_id ==
+ RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
+ continue;
+ priv->domain_id = opriv->domain_id;
+ DRV_LOG(DEBUG, "dev_port-%u inherit domain_id=%u\n",
+ priv->dev_port, priv->domain_id);
+ break;
+ }
+ if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
+ err = rte_eth_switch_domain_alloc(&priv->domain_id);
+ if (err) {
+ err = rte_errno;
+ DRV_LOG(ERR, "unable to allocate switch domain: %s",
+ strerror(rte_errno));
+ goto error;
+ }
+ own_domain_id = 1;
+ DRV_LOG(DEBUG, "dev_port-%u new domain_id=%u\n",
+ priv->dev_port, priv->domain_id);
+ }
+ /* Override some values set by hardware configuration. */
+ mlx5_args(config, dpdk_dev->devargs);
+ err = mlx5_dev_check_sibling_config(priv, config, dpdk_dev);
+ if (err)
+ goto error;
+ config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
+ IBV_DEVICE_RAW_IP_CSUM);
+ DRV_LOG(DEBUG, "checksum offloading is %ssupported",
+ (config->hw_csum ? "" : "not "));
+#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
+ !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
+ DRV_LOG(DEBUG, "counters are not supported");
+#endif
+#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
+ if (config->dv_flow_en) {
+ DRV_LOG(WARNING, "DV flow is not supported");
+ config->dv_flow_en = 0;
+ }
+#endif
+ if (spawn->max_port > UINT8_MAX) {
+ /* Verbs can't support ports larger than 255 by design. */
+ DRV_LOG(ERR, "can't support IB ports > UINT8_MAX");
+ err = EINVAL;
+ goto error;
+ }
+ config->ind_table_max_size =
+ sh->device_attr.max_rwq_indirection_table_size;
+ /*
+ * Remove this check once DPDK supports larger/variable
+ * indirection tables.
+ */
+ if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
+ config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
+ DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
+ config->ind_table_max_size);
+ config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
+ DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
+ (config->hw_vlan_strip ? "" : "not "));
+ config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
+ IBV_RAW_PACKET_CAP_SCATTER_FCS);
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+ hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+ hw_padding = !!(sh->device_attr.device_cap_flags_ex &
+ IBV_DEVICE_PCI_WRITE_END_PADDING);
+#endif
+ if (config->hw_padding && !hw_padding) {
+ DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
+ config->hw_padding = 0;
+ } else if (config->hw_padding) {
+ DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
+ }
+ config->tso = (sh->device_attr.max_tso > 0 &&
+ (sh->device_attr.tso_supported_qpts &
+ (1 << IBV_QPT_RAW_PACKET)));
+ if (config->tso)
+ config->tso_max_payload_sz = sh->device_attr.max_tso;
+ /*
+ * MPW is disabled by default, while the Enhanced MPW is enabled
+ * by default.
+ */
+ if (config->mps == MLX5_ARG_UNSET)
+ config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
+ MLX5_MPW_DISABLED;
+ else
+ config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
+ DRV_LOG(INFO, "%sMPS is %s",
+ config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
+ config->mps == MLX5_MPW ? "legacy " : "",
+ config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
+ if (config->devx) {
+ err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
+ if (err) {
+ err = -err;
+ goto error;
+ }
+ /* Check relax ordering support. */
+ if (!haswell_broadwell_cpu) {
+ sh->cmng.relaxed_ordering_write =
+ config->hca_attr.relaxed_ordering_write;
+ sh->cmng.relaxed_ordering_read =
+ config->hca_attr.relaxed_ordering_read;
+ } else {
+ sh->cmng.relaxed_ordering_read = 0;
+ sh->cmng.relaxed_ordering_write = 0;
+ }
+ sh->rq_ts_format = config->hca_attr.rq_ts_format;
+ sh->sq_ts_format = config->hca_attr.sq_ts_format;
+ sh->steering_format_version =
+ config->hca_attr.steering_format_version;
+ sh->qp_ts_format = config->hca_attr.qp_ts_format;
+ /* Check for LRO support. */
+ if (config->dest_tir && config->hca_attr.lro_cap &&
+ config->dv_flow_en) {
+ /* TBD check tunnel lro caps. */
+ config->lro.supported = config->hca_attr.lro_cap;
+ DRV_LOG(DEBUG, "Device supports LRO");
+ /*
+ * If LRO timeout is not configured by application,
+ * use the minimal supported value.
+ */
+ if (!config->lro.timeout)
+ config->lro.timeout =
+ config->hca_attr.lro_timer_supported_periods[0];
+ DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
+ config->lro.timeout);
+ DRV_LOG(DEBUG, "LRO minimal size of TCP segment "
+ "required for coalescing is %d bytes",
+ config->hca_attr.lro_min_mss_size);
+ }
+#if defined(HAVE_MLX5DV_DR) && \
+ (defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER) || \
+ defined(HAVE_MLX5_DR_CREATE_ACTION_ASO))
+ if (config->hca_attr.qos.sup &&
+ config->hca_attr.qos.flow_meter_old &&
+ config->dv_flow_en) {
+ uint8_t reg_c_mask =
+ config->hca_attr.qos.flow_meter_reg_c_ids;
+ /*
+ * Meter needs two REG_C's for color match and pre-sfx
+ * flow match. Here get the REG_C for color match.
+ * REG_C_0 and REG_C_1 is reserved for metadata feature.
+ */
+ reg_c_mask &= 0xfc;
+ if (__builtin_popcount(reg_c_mask) < 1) {
+ priv->mtr_en = 0;
+ DRV_LOG(WARNING, "No available register for"
+ " meter.");
+ } else {
+ /*
+ * The meter color register is used by the
+ * flow-hit feature as well.
+ * The flow-hit feature must use REG_C_3
+ * Prefer REG_C_3 if it is available.
+ */
+ if (reg_c_mask & (1 << (REG_C_3 - REG_C_0)))
+ priv->mtr_color_reg = REG_C_3;
+ else
+ priv->mtr_color_reg = ffs(reg_c_mask)
+ - 1 + REG_C_0;
+ priv->mtr_en = 1;
+ priv->mtr_reg_share =
+ config->hca_attr.qos.flow_meter;
+ DRV_LOG(DEBUG, "The REG_C meter uses is %d",
+ priv->mtr_color_reg);
+ }
+ }
+ if (config->hca_attr.qos.sup &&
+ config->hca_attr.qos.flow_meter_aso_sup) {
+ uint32_t log_obj_size =
+ rte_log2_u32(MLX5_ASO_MTRS_PER_POOL >> 1);
+ if (log_obj_size >=
+ config->hca_attr.qos.log_meter_aso_granularity &&
+ log_obj_size <=
+ config->hca_attr.qos.log_meter_aso_max_alloc)
+ sh->meter_aso_en = 1;
+ }
+ if (priv->mtr_en) {
+ err = mlx5_aso_flow_mtrs_mng_init(priv->sh);
+ if (err) {
+ err = -err;
+ goto error;
+ }
+ }
+ if (config->hca_attr.flow.tunnel_header_0_1)
+ sh->tunnel_header_0_1 = 1;
+#endif
+#ifdef HAVE_MLX5_DR_CREATE_ACTION_ASO
+ if (config->hca_attr.flow_hit_aso &&
+ priv->mtr_color_reg == REG_C_3) {
+ sh->flow_hit_aso_en = 1;
+ err = mlx5_flow_aso_age_mng_init(sh);
+ if (err) {
+ err = -err;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "Flow Hit ASO is supported.");
+ }
+#endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO */
+#if defined(HAVE_MLX5_DR_CREATE_ACTION_ASO) && \
+ defined(HAVE_MLX5_DR_ACTION_ASO_CT)
+ if (config->hca_attr.ct_offload &&
+ priv->mtr_color_reg == REG_C_3) {
+ err = mlx5_flow_aso_ct_mng_init(sh);
+ if (err) {
+ err = -err;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "CT ASO is supported.");
+ sh->ct_aso_en = 1;
+ }
+#endif /* HAVE_MLX5_DR_CREATE_ACTION_ASO && HAVE_MLX5_DR_ACTION_ASO_CT */
+#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_SAMPLE)
+ if (config->hca_attr.log_max_ft_sampler_num > 0 &&
+ config->dv_flow_en) {
+ priv->sampler_en = 1;
+ DRV_LOG(DEBUG, "Sampler enabled!");
+ } else {
+ priv->sampler_en = 0;
+ if (!config->hca_attr.log_max_ft_sampler_num)
+ DRV_LOG(WARNING,
+ "No available register for sampler.");
+ else
+ DRV_LOG(DEBUG, "DV flow is not supported!");
+ }
+#endif
+ }
+ if (config->cqe_comp && RTE_CACHE_LINE_SIZE == 128 &&
+ !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP)) {
+ DRV_LOG(WARNING, "Rx CQE 128B compression is not supported");
+ config->cqe_comp = 0;
+ }
+ if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX &&
+ (!config->devx || !config->hca_attr.mini_cqe_resp_flow_tag)) {
+ DRV_LOG(WARNING, "Flow Tag CQE compression"
+ " format isn't supported.");
+ config->cqe_comp = 0;
+ }
+ if (config->cqe_comp_fmt == MLX5_CQE_RESP_FORMAT_L34H_STRIDX &&
+ (!config->devx || !config->hca_attr.mini_cqe_resp_l3_l4_tag)) {
+ DRV_LOG(WARNING, "L3/L4 Header CQE compression"
+ " format isn't supported.");
+ config->cqe_comp = 0;
+ }
+ DRV_LOG(DEBUG, "Rx CQE compression is %ssupported",
+ config->cqe_comp ? "" : "not ");
+ if (config->tx_pp) {
+ DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
+ config->hca_attr.dev_freq_khz);
+ DRV_LOG(DEBUG, "Packet pacing is %ssupported",
+ config->hca_attr.qos.packet_pacing ? "" : "not ");
+ DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
+ config->hca_attr.cross_channel ? "" : "not ");
+ DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
+ config->hca_attr.wqe_index_ignore ? "" : "not ");
+ DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
+ config->hca_attr.non_wire_sq ? "" : "not ");
+ DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
+ config->hca_attr.log_max_static_sq_wq ? "" : "not ",
+ config->hca_attr.log_max_static_sq_wq);
+ DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
+ config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
+ if (!config->devx) {
+ DRV_LOG(ERR, "DevX is required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.qos.packet_pacing) {
+ DRV_LOG(ERR, "Packet pacing is not supported");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.cross_channel) {
+ DRV_LOG(ERR, "Cross channel operations are"
+ " required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.wqe_index_ignore) {
+ DRV_LOG(ERR, "WQE index ignore feature is"
+ " required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.non_wire_sq) {
+ DRV_LOG(ERR, "Non-wire SQ feature is"
+ " required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.log_max_static_sq_wq) {
+ DRV_LOG(ERR, "Static WQE SQ feature is"
+ " required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ if (!config->hca_attr.qos.wqe_rate_pp) {
+ DRV_LOG(ERR, "WQE rate mode is required"
+ " for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
+ DRV_LOG(ERR, "DevX does not provide UAR offset,"
+ " can't create queues for packet pacing");
+ err = ENODEV;
+ goto error;
+#endif
+ }
+ if (config->devx) {
+ uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];
+
+ err = config->hca_attr.access_register_user ?
+ mlx5_devx_cmd_register_read
+ (sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
+ reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
+ if (!err) {
+ uint32_t ts_mode;
+
+ /* MTUTC register is read successfully. */
+ ts_mode = MLX5_GET(register_mtutc, reg,
+ time_stamp_mode);
+ if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
+ config->rt_timestamp = 1;
+ } else {
+ /* Kernel does not support register reading. */
+ if (config->hca_attr.dev_freq_khz ==
+ (NS_PER_S / MS_PER_S))
+ config->rt_timestamp = 1;
+ }
+ }
+ /*
+ * If HW has bug working with tunnel packet decapsulation and
+ * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip
+ * bit. Then DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
+ */
+ if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
+ config->hw_fcs_strip = 0;
+ DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
+ (config->hw_fcs_strip ? "" : "not "));
+ if (config->mprq.enabled && mprq) {
+ if (config->mprq.stride_num_n &&
+ (config->mprq.stride_num_n > mprq_max_stride_num_n ||
+ config->mprq.stride_num_n < mprq_min_stride_num_n)) {
+ config->mprq.stride_num_n =
+ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
+ mprq_min_stride_num_n),
+ mprq_max_stride_num_n);
+ DRV_LOG(WARNING,
+ "the number of strides"
+ " for Multi-Packet RQ is out of range,"
+ " setting default value (%u)",
+ 1 << config->mprq.stride_num_n);
+ }
+ if (config->mprq.stride_size_n &&
+ (config->mprq.stride_size_n > mprq_max_stride_size_n ||
+ config->mprq.stride_size_n < mprq_min_stride_size_n)) {
+ config->mprq.stride_size_n =
+ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
+ mprq_min_stride_size_n),
+ mprq_max_stride_size_n);
+ DRV_LOG(WARNING,
+ "the size of a stride"
+ " for Multi-Packet RQ is out of range,"
+ " setting default value (%u)",
+ 1 << config->mprq.stride_size_n);
+ }
+ config->mprq.min_stride_size_n = mprq_min_stride_size_n;
+ config->mprq.max_stride_size_n = mprq_max_stride_size_n;
+ } else if (config->mprq.enabled && !mprq) {
+ DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
+ config->mprq.enabled = 0;
+ }
+ if (config->max_dump_files_num == 0)
+ config->max_dump_files_num = 128;
+ eth_dev = rte_eth_dev_allocate(name);
+ if (eth_dev == NULL) {
+ DRV_LOG(ERR, "can not allocate rte ethdev");
+ err = ENOMEM;
+ goto error;
+ }
+ if (priv->representor) {
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
+ eth_dev->data->representor_id = priv->representor_id;
+ }
+ priv->mp_id.port_id = eth_dev->data->port_id;
+ strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
+ /*
+ * Store associated network device interface index. This index
+ * is permanent throughout the lifetime of device. So, we may store
+ * the ifindex here and use the cached value further.
+ */
+ MLX5_ASSERT(spawn->ifindex);
+ priv->if_index = spawn->ifindex;
+ eth_dev->data->dev_private = priv;
+ priv->dev_data = eth_dev->data;
+ eth_dev->data->mac_addrs = priv->mac;
+ eth_dev->device = dpdk_dev;
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
+ /* Configure the first MAC address by default. */
+ if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
+ DRV_LOG(ERR,
+ "port %u cannot get MAC address, is mlx5_en"
+ " loaded? (errno: %s)",
+ eth_dev->data->port_id, strerror(rte_errno));
+ err = ENODEV;
+ goto error;
+ }
+ DRV_LOG(INFO,
+ "port %u MAC address is " RTE_ETHER_ADDR_PRT_FMT,
+ eth_dev->data->port_id, RTE_ETHER_ADDR_BYTES(&mac));
+#ifdef RTE_LIBRTE_MLX5_DEBUG
+ {
+ char ifname[MLX5_NAMESIZE];
+
+ if (mlx5_get_ifname(eth_dev, &ifname) == 0)
+ DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
+ eth_dev->data->port_id, ifname);
+ else
+ DRV_LOG(DEBUG, "port %u ifname is unknown",
+ eth_dev->data->port_id);
+ }
+#endif
+ /* Get actual MTU if possible. */
+ err = mlx5_get_mtu(eth_dev, &priv->mtu);
+ if (err) {
+ err = rte_errno;
+ goto error;
+ }
+ DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
+ priv->mtu);
+ /* Initialize burst functions to prevent crashes before link-up. */
+ eth_dev->rx_pkt_burst = removed_rx_burst;
+ eth_dev->tx_pkt_burst = removed_tx_burst;
+ eth_dev->dev_ops = &mlx5_dev_ops;
+ eth_dev->rx_descriptor_status = mlx5_rx_descriptor_status;
+ eth_dev->tx_descriptor_status = mlx5_tx_descriptor_status;
+ eth_dev->rx_queue_count = mlx5_rx_queue_count;
+ /* Register MAC address. */
+ claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
+ if (config->vf && config->vf_nl_en)
+ mlx5_nl_mac_addr_sync(priv->nl_socket_route,
+ mlx5_ifindex(eth_dev),
+ eth_dev->data->mac_addrs,
+ MLX5_MAX_MAC_ADDRESSES);
+ priv->ctrl_flows = 0;
+ rte_spinlock_init(&priv->flow_list_lock);
+ TAILQ_INIT(&priv->flow_meters);
+ priv->mtr_profile_tbl = mlx5_l3t_create(MLX5_L3T_TYPE_PTR);
+ if (!priv->mtr_profile_tbl)
+ goto error;
+ /* Hint libmlx5 to use PMD allocator for data plane resources */
+ mlx5_glue->dv_set_context_attr(sh->ctx,
+ MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+ (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
+ .alloc = &mlx5_alloc_verbs_buf,
+ .free = &mlx5_free_verbs_buf,
+ .data = sh,
+ }));
+ /* Bring Ethernet device up. */
+ DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
+ eth_dev->data->port_id);
+ mlx5_set_link_up(eth_dev);
+ /*
+ * Even though the interrupt handler is not installed yet,
+ * interrupts will still trigger on the async_fd from
+ * Verbs context returned by ibv_open_device().
+ */
+ mlx5_link_update(eth_dev, 0);
+#ifdef HAVE_MLX5DV_DR_ESWITCH
+ if (!(config->hca_attr.eswitch_manager && config->dv_flow_en &&
+ (switch_info->representor || switch_info->master)))
+ config->dv_esw_en = 0;
+#else
+ config->dv_esw_en = 0;
+#endif
+ /* Detect minimal data bytes to inline. */
+ mlx5_set_min_inline(spawn, config);
+ /* Store device configuration on private structure. */
+ priv->config = *config;
+ for (i = 0; i < MLX5_FLOW_TYPE_MAXI; i++) {
+ icfg[i].release_mem_en = !!config->reclaim_mode;
+ if (config->reclaim_mode)
+ icfg[i].per_core_cache = 0;
+ priv->flows[i] = mlx5_ipool_create(&icfg[i]);
+ if (!priv->flows[i])
+ goto error;
+ }
+ /* Create context for virtual machine VLAN workaround. */
+ priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
+ if (config->dv_flow_en) {
+ err = mlx5_alloc_shared_dr(priv);
+ if (err)
+ goto error;
+ }
+ if (config->devx && config->dv_flow_en && config->dest_tir) {
+ priv->obj_ops = devx_obj_ops;
+ priv->obj_ops.drop_action_create =
+ ibv_obj_ops.drop_action_create;
+ priv->obj_ops.drop_action_destroy =
+ ibv_obj_ops.drop_action_destroy;
+#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
+ priv->obj_ops.txq_obj_modify = ibv_obj_ops.txq_obj_modify;
+#else
+ if (config->dv_esw_en)
+ priv->obj_ops.txq_obj_modify =
+ ibv_obj_ops.txq_obj_modify;
+#endif
+ /* Use specific wrappers for Tx object. */
+ priv->obj_ops.txq_obj_new = mlx5_os_txq_obj_new;
+ priv->obj_ops.txq_obj_release = mlx5_os_txq_obj_release;
+ mlx5_queue_counter_id_prepare(eth_dev);
+ priv->obj_ops.lb_dummy_queue_create =
+ mlx5_rxq_ibv_obj_dummy_lb_create;
+ priv->obj_ops.lb_dummy_queue_release =
+ mlx5_rxq_ibv_obj_dummy_lb_release;
+ } else {
+ priv->obj_ops = ibv_obj_ops;
+ }
+ if (config->tx_pp &&
+ (priv->config.dv_esw_en ||
+ priv->obj_ops.txq_obj_new != mlx5_os_txq_obj_new)) {
+ /*
+ * HAVE_MLX5DV_DEVX_UAR_OFFSET is required to support
+ * packet pacing and already checked above.
+ * Hence, we should only make sure the SQs will be created
+ * with DevX, not with Verbs.
+ * Verbs allocates the SQ UAR on its own and it can't be shared
+ * with Clock Queue UAR as required for Tx scheduling.
+ */
+ DRV_LOG(ERR, "Verbs SQs, UAR can't be shared as required for packet pacing");
+ err = ENODEV;
+ goto error;
+ }
+ priv->drop_queue.hrxq = mlx5_drop_action_create(eth_dev);
+ if (!priv->drop_queue.hrxq)
+ goto error;
+ /* Supported Verbs flow priority number detection. */
+ err = mlx5_flow_discover_priorities(eth_dev);
+ if (err < 0) {
+ err = -err;
+ goto error;
+ }
+ priv->config.flow_prio = err;
+ if (!priv->config.dv_esw_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ DRV_LOG(WARNING, "metadata mode %u is not supported "
+ "(no E-Switch)", priv->config.dv_xmeta_en);
+ priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
+ }
+ mlx5_set_metadata_mask(eth_dev);
+ if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ !priv->sh->dv_regc0_mask) {
+ DRV_LOG(ERR, "metadata mode %u is not supported "
+ "(no metadata reg_c[0] is available)",
+ priv->config.dv_xmeta_en);
+ err = ENOTSUP;
+ goto error;
+ }
+ priv->hrxqs = mlx5_list_create("hrxq", eth_dev, true,
+ mlx5_hrxq_create_cb,
+ mlx5_hrxq_match_cb,
+ mlx5_hrxq_remove_cb,
+ mlx5_hrxq_clone_cb,
+ mlx5_hrxq_clone_free_cb);
+ if (!priv->hrxqs)
+ goto error;
+ rte_rwlock_init(&priv->ind_tbls_lock);
+ /* Query availability of metadata reg_c's. */
+ err = mlx5_flow_discover_mreg_c(eth_dev);
+ if (err < 0) {
+ err = -err;
+ goto error;
+ }
+ if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
+ DRV_LOG(DEBUG,
+ "port %u extensive metadata register is not supported",
+ eth_dev->data->port_id);
+ if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
+ DRV_LOG(ERR, "metadata mode %u is not supported "
+ "(no metadata registers available)",
+ priv->config.dv_xmeta_en);
+ err = ENOTSUP;
+ goto error;
+ }
+ }
+ if (priv->config.dv_flow_en &&
+ priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+ mlx5_flow_ext_mreg_supported(eth_dev) &&
+ priv->sh->dv_regc0_mask) {
+ priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
+ MLX5_FLOW_MREG_HTABLE_SZ,
+ false, true, eth_dev,
+ flow_dv_mreg_create_cb,
+ flow_dv_mreg_match_cb,
+ flow_dv_mreg_remove_cb,
+ flow_dv_mreg_clone_cb,
+ flow_dv_mreg_clone_free_cb);
+ if (!priv->mreg_cp_tbl) {
+ err = ENOMEM;
+ goto error;
+ }
+ }
+ rte_spinlock_init(&priv->shared_act_sl);
+ mlx5_flow_counter_mode_config(eth_dev);
+ mlx5_flow_drop_action_config(eth_dev);
+ if (priv->config.dv_flow_en)
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_FLOW_OPS_THREAD_SAFE;
+ return eth_dev;
+error:
+ if (priv) {
+ if (priv->mreg_cp_tbl)
+ mlx5_hlist_destroy(priv->mreg_cp_tbl);
+ if (priv->sh)
+ mlx5_os_free_shared_dr(priv);
+ if (priv->nl_socket_route >= 0)
+ close(priv->nl_socket_route);
+ if (priv->nl_socket_rdma >= 0)
+ close(priv->nl_socket_rdma);
+ if (priv->vmwa_context)
+ mlx5_vlan_vmwa_exit(priv->vmwa_context);
+ if (eth_dev && priv->drop_queue.hrxq)
+ mlx5_drop_action_destroy(eth_dev);
+ if (priv->mtr_profile_tbl)
+ mlx5_l3t_destroy(priv->mtr_profile_tbl);
+ if (own_domain_id)
+ claim_zero(rte_eth_switch_domain_free(priv->domain_id));
+ if (priv->hrxqs)
+ mlx5_list_destroy(priv->hrxqs);
+ mlx5_free(priv);
+ if (eth_dev != NULL)
+ eth_dev->data->dev_private = NULL;
+ }
+ if (eth_dev != NULL) {
+ /* mac_addrs must not be freed alone because part of
+ * dev_private
+ **/
+ eth_dev->data->mac_addrs = NULL;
+ rte_eth_dev_release_port(eth_dev);
+ }
+ if (sh)
+ mlx5_free_shared_dev_ctx(sh);
+ MLX5_ASSERT(err > 0);
+ rte_errno = err;
+ return NULL;
+}
+
+/**
+ * Comparison callback to sort device data.
+ *
+ * This is meant to be used with qsort().
+ *
+ * @param a[in]
+ * Pointer to pointer to first data object.
+ * @param b[in]
+ * Pointer to pointer to second data object.
+ *
+ * @return
+ * 0 if both objects are equal, less than 0 if the first argument is less
+ * than the second, greater than 0 otherwise.
+ */
+static int
+mlx5_dev_spawn_data_cmp(const void *a, const void *b)
+{
+ const struct mlx5_switch_info *si_a =
+ &((const struct mlx5_dev_spawn_data *)a)->info;
+ const struct mlx5_switch_info *si_b =
+ &((const struct mlx5_dev_spawn_data *)b)->info;
+ int ret;
+
+ /* Master device first. */
+ ret = si_b->master - si_a->master;
+ if (ret)
+ return ret;
+ /* Then representor devices. */
+ ret = si_b->representor - si_a->representor;
+ if (ret)
+ return ret;
+ /* Unidentified devices come last in no specific order. */
+ if (!si_a->representor)
+ return 0;
+ /* Order representors by name. */
+ return si_a->port_name - si_b->port_name;
+}
+
+/**
+ * Match PCI information for possible slaves of bonding device.
+ *
+ * @param[in] ibv_dev
+ * Pointer to Infiniband device structure.
+ * @param[in] pci_dev
+ * Pointer to primary PCI address structure to match.
+ * @param[in] nl_rdma
+ * Netlink RDMA group socket handle.
+ * @param[in] owner
+ * Rerepsentor owner PF index.
+ * @param[out] bond_info
+ * Pointer to bonding information.
+ *
+ * @return
+ * negative value if no bonding device found, otherwise
+ * positive index of slave PF in bonding.
+ */
+static int
+mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
+ const struct rte_pci_addr *pci_dev,
+ int nl_rdma, uint16_t owner,
+ struct mlx5_bond_info *bond_info)
+{
+ char ifname[IF_NAMESIZE + 1];
+ unsigned int ifindex;
+ unsigned int np, i;
+ FILE *bond_file = NULL, *file;
+ int pf = -1;
+ int ret;
+
+ /*
+ * Try to get master device name. If something goes
+ * wrong suppose the lack of kernel support and no
+ * bonding devices.
+ */
+ memset(bond_info, 0, sizeof(*bond_info));
+ if (nl_rdma < 0)
+ return -1;
+ if (!strstr(ibv_dev->name, "bond"))
+ return -1;
+ np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
+ if (!np)
+ return -1;
+ /*
+ * The Master device might not be on the predefined
+ * port (not on port index 1, it is not garanted),
+ * we have to scan all Infiniband device port and
+ * find master.
+ */
+ for (i = 1; i <= np; ++i) {
+ /* Check whether Infiniband port is populated. */
+ ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
+ if (!ifindex)
+ continue;
+ if (!if_indextoname(ifindex, ifname))
+ continue;
+ /* Try to read bonding slave names from sysfs. */
+ MKSTR(slaves,
+ "/sys/class/net/%s/master/bonding/slaves", ifname);
+ bond_file = fopen(slaves, "r");
+ if (bond_file)
+ break;
+ }
+ if (!bond_file)
+ return -1;
+ /* Use safe format to check maximal buffer length. */
+ MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
+ while (fscanf(bond_file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
+ char tmp_str[IF_NAMESIZE + 32];
+ struct rte_pci_addr pci_addr;
+ struct mlx5_switch_info info;
+
+ /* Process slave interface names in the loop. */
+ snprintf(tmp_str, sizeof(tmp_str),
+ "/sys/class/net/%s", ifname);
+ if (mlx5_get_pci_addr(tmp_str, &pci_addr)) {
+ DRV_LOG(WARNING, "can not get PCI address"
+ " for netdev \"%s\"", ifname);
+ continue;
+ }
+ /* Slave interface PCI address match found. */
+ snprintf(tmp_str, sizeof(tmp_str),
+ "/sys/class/net/%s/phys_port_name", ifname);
+ file = fopen(tmp_str, "rb");
+ if (!file)
+ break;
+ info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
+ if (fscanf(file, "%32s", tmp_str) == 1)
+ mlx5_translate_port_name(tmp_str, &info);
+ fclose(file);
+ /* Only process PF ports. */
+ if (info.name_type != MLX5_PHYS_PORT_NAME_TYPE_LEGACY &&
+ info.name_type != MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
+ continue;
+ /* Check max bonding member. */
+ if (info.port_name >= MLX5_BOND_MAX_PORTS) {
+ DRV_LOG(WARNING, "bonding index out of range, "
+ "please increase MLX5_BOND_MAX_PORTS: %s",
+ tmp_str);
+ break;
+ }
+ /* Match PCI address, allows BDF0+pfx or BDFx+pfx. */
+ if (pci_dev->domain == pci_addr.domain &&
+ pci_dev->bus == pci_addr.bus &&
+ pci_dev->devid == pci_addr.devid &&
+ ((pci_dev->function == 0 &&
+ pci_dev->function + owner == pci_addr.function) ||
+ (pci_dev->function == owner &&
+ pci_addr.function == owner)))
+ pf = info.port_name;
+ /* Get ifindex. */
+ snprintf(tmp_str, sizeof(tmp_str),
+ "/sys/class/net/%s/ifindex", ifname);
+ file = fopen(tmp_str, "rb");
+ if (!file)
+ break;
+ ret = fscanf(file, "%u", &ifindex);
+ fclose(file);
+ if (ret != 1)
+ break;
+ /* Save bonding info. */
+ strncpy(bond_info->ports[info.port_name].ifname, ifname,
+ sizeof(bond_info->ports[0].ifname));
+ bond_info->ports[info.port_name].pci_addr = pci_addr;
+ bond_info->ports[info.port_name].ifindex = ifindex;
+ bond_info->n_port++;
+ }
+ if (pf >= 0) {
+ /* Get bond interface info */
+ ret = mlx5_sysfs_bond_info(ifindex, &bond_info->ifindex,
+ bond_info->ifname);
+ if (ret)
+ DRV_LOG(ERR, "unable to get bond info: %s",
+ strerror(rte_errno));
+ else
+ DRV_LOG(INFO, "PF device %u, bond device %u(%s)",
+ ifindex, bond_info->ifindex, bond_info->ifname);
+ }
+ return pf;
+}
+
+static void
+mlx5_os_config_default(struct mlx5_dev_config *config)
+{
+ memset(config, 0, sizeof(*config));
+ config->mps = MLX5_ARG_UNSET;
+ config->dbnc = MLX5_ARG_UNSET;
+ config->rx_vec_en = 1;
+ config->txq_inline_max = MLX5_ARG_UNSET;
+ config->txq_inline_min = MLX5_ARG_UNSET;
+ config->txq_inline_mpw = MLX5_ARG_UNSET;
+ config->txqs_inline = MLX5_ARG_UNSET;
+ config->vf_nl_en = 1;
+ config->mr_ext_memseg_en = 1;
+ config->mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
+ config->mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
+ config->dv_esw_en = 1;
+ config->dv_flow_en = 1;
+ config->decap_en = 1;
+ config->log_hp_size = MLX5_ARG_UNSET;
+ config->allow_duplicate_pattern = 1;
+}
+
+/**
+ * Register a PCI device within bonding.
+ *
+ * This function spawns Ethernet devices out of a given PCI device and
+ * bonding owner PF index.
+ *
+ * @param[in] pci_dev
+ * PCI device information.
+ * @param[in] req_eth_da
+ * Requested ethdev device argument.
+ * @param[in] owner_id
+ * Requested owner PF port ID within bonding device, default to 0.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_pci_probe_pf(struct rte_pci_device *pci_dev,
+ struct rte_eth_devargs *req_eth_da,
+ uint16_t owner_id)
+{
+ struct ibv_device **ibv_list;
+ /*
+ * Number of found IB Devices matching with requested PCI BDF.
+ * nd != 1 means there are multiple IB devices over the same
+ * PCI device and we have representors and master.
+ */
+ unsigned int nd = 0;
+ /*
+ * Number of found IB device Ports. nd = 1 and np = 1..n means
+ * we have the single multiport IB device, and there may be
+ * representors attached to some of found ports.
+ */
+ unsigned int np = 0;
+ /*
+ * Number of DPDK ethernet devices to Spawn - either over
+ * multiple IB devices or multiple ports of single IB device.
+ * Actually this is the number of iterations to spawn.
+ */
+ unsigned int ns = 0;
+ /*
+ * Bonding device
+ * < 0 - no bonding device (single one)
+ * >= 0 - bonding device (value is slave PF index)
+ */
+ int bd = -1;
+ struct mlx5_dev_spawn_data *list = NULL;
+ struct mlx5_dev_config dev_config;
+ unsigned int dev_config_vf;
+ struct rte_eth_devargs eth_da = *req_eth_da;
+ struct rte_pci_addr owner_pci = pci_dev->addr; /* Owner PF. */
+ struct mlx5_bond_info bond_info;
+ int ret = -1;
+
+ errno = 0;
+ ibv_list = mlx5_glue->get_device_list(&ret);
+ if (!ibv_list) {
+ rte_errno = errno ? errno : ENOSYS;
+ DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
+ return -rte_errno;
+ }
+ /*
+ * First scan the list of all Infiniband devices to find
+ * matching ones, gathering into the list.
+ */
+ struct ibv_device *ibv_match[ret + 1];
+ int nl_route = mlx5_nl_init(NETLINK_ROUTE);
+ int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
+ unsigned int i;
+
+ while (ret-- > 0) {
+ struct rte_pci_addr pci_addr;
+
+ DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+ bd = mlx5_device_bond_pci_match
+ (ibv_list[ret], &owner_pci, nl_rdma, owner_id,
+ &bond_info);
+ if (bd >= 0) {
+ /*
+ * Bonding device detected. Only one match is allowed,
+ * the bonding is supported over multi-port IB device,
+ * there should be no matches on representor PCI
+ * functions or non VF LAG bonding devices with
+ * specified address.
+ */
+ if (nd) {
+ DRV_LOG(ERR,
+ "multiple PCI match on bonding device"
+ "\"%s\" found", ibv_list[ret]->name);
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ /* Amend owner pci address if owner PF ID specified. */
+ if (eth_da.nb_representor_ports)
+ owner_pci.function += owner_id;
+ DRV_LOG(INFO, "PCI information matches for"
+ " slave %d bonding device \"%s\"",
+ bd, ibv_list[ret]->name);
+ ibv_match[nd++] = ibv_list[ret];
+ break;
+ } else {
+ /* Bonding device not found. */
+ if (mlx5_get_pci_addr(ibv_list[ret]->ibdev_path,
+ &pci_addr))
+ continue;
+ if (owner_pci.domain != pci_addr.domain ||
+ owner_pci.bus != pci_addr.bus ||
+ owner_pci.devid != pci_addr.devid ||
+ owner_pci.function != pci_addr.function)
+ continue;
+ DRV_LOG(INFO, "PCI information matches for device \"%s\"",
+ ibv_list[ret]->name);
+ ibv_match[nd++] = ibv_list[ret];
+ }
+ }
+ ibv_match[nd] = NULL;
+ if (!nd) {
+ /* No device matches, just complain and bail out. */
+ DRV_LOG(WARNING,
+ "no Verbs device matches PCI device " PCI_PRI_FMT ","
+ " are kernel drivers loaded?",
+ owner_pci.domain, owner_pci.bus,
+ owner_pci.devid, owner_pci.function);
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ if (nd == 1) {
+ /*
+ * Found single matching device may have multiple ports.
+ * Each port may be representor, we have to check the port
+ * number and check the representors existence.
+ */
+ if (nl_rdma >= 0)
+ np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
+ if (!np)
+ DRV_LOG(WARNING, "can not get IB device \"%s\""
+ " ports number", ibv_match[0]->name);
+ if (bd >= 0 && !np) {
+ DRV_LOG(ERR, "can not get ports"
+ " for bonding device");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ }
+ /*
+ * Now we can determine the maximal
+ * amount of devices to be spawned.
+ */
+ list = mlx5_malloc(MLX5_MEM_ZERO,
+ sizeof(struct mlx5_dev_spawn_data) *
+ (np ? np : nd),
+ RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+ if (!list) {
+ DRV_LOG(ERR, "spawn data array allocation failure");
+ rte_errno = ENOMEM;
+ ret = -rte_errno;
+ goto exit;
+ }
+ if (bd >= 0 || np > 1) {
+ /*
+ * Single IB device with multiple ports found,
+ * it may be E-Switch master device and representors.
+ * We have to perform identification through the ports.
+ */
+ MLX5_ASSERT(nl_rdma >= 0);
+ MLX5_ASSERT(ns == 0);
+ MLX5_ASSERT(nd == 1);
+ MLX5_ASSERT(np);
+ for (i = 1; i <= np; ++i) {
+ list[ns].bond_info = &bond_info;
+ list[ns].max_port = np;
+ list[ns].phys_port = i;
+ list[ns].phys_dev = ibv_match[0];
+ list[ns].eth_dev = NULL;
+ list[ns].pci_dev = pci_dev;
+ list[ns].pf_bond = bd;
+ list[ns].ifindex = mlx5_nl_ifindex
+ (nl_rdma,
+ mlx5_os_get_dev_device_name
+ (list[ns].phys_dev), i);
+ if (!list[ns].ifindex) {
+ /*
+ * No network interface index found for the
+ * specified port, it means there is no
+ * representor on this port. It's OK,
+ * there can be disabled ports, for example
+ * if sriov_numvfs < sriov_totalvfs.
+ */
+ continue;
+ }
+ ret = -1;
+ if (nl_route >= 0)
+ ret = mlx5_nl_switch_info
+ (nl_route,
+ list[ns].ifindex,
+ &list[ns].info);
+ if (ret || (!list[ns].info.representor &&
+ !list[ns].info.master)) {
+ /*
+ * We failed to recognize representors with
+ * Netlink, let's try to perform the task
+ * with sysfs.
+ */
+ ret = mlx5_sysfs_switch_info
+ (list[ns].ifindex,
+ &list[ns].info);
+ }
+ if (!ret && bd >= 0) {
+ switch (list[ns].info.name_type) {
+ case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
+ if (np == 1) {
+ /*
+ * Force standalone bonding
+ * device for ROCE LAG
+ * confgiurations.
+ */
+ list[ns].info.master = 0;
+ list[ns].info.representor = 0;
+ }
+ if (list[ns].info.port_name == bd)
+ ns++;
+ break;
+ case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
+ /* Fallthrough */
+ case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
+ /* Fallthrough */
+ case MLX5_PHYS_PORT_NAME_TYPE_PFSF:
+ if (list[ns].info.pf_num == bd)
+ ns++;
+ break;
+ default:
+ break;
+ }
+ continue;
+ }
+ if (!ret && (list[ns].info.representor ^
+ list[ns].info.master))
+ ns++;
+ }
+ if (!ns) {
+ DRV_LOG(ERR,
+ "unable to recognize master/representors"
+ " on the IB device with multiple ports");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ } else {
+ /*
+ * The existence of several matching entries (nd > 1) means
+ * port representors have been instantiated. No existing Verbs
+ * call nor sysfs entries can tell them apart, this can only
+ * be done through Netlink calls assuming kernel drivers are
+ * recent enough to support them.
+ *
+ * In the event of identification failure through Netlink,
+ * try again through sysfs, then:
+ *
+ * 1. A single IB device matches (nd == 1) with single
+ * port (np=0/1) and is not a representor, assume
+ * no switch support.
+ *
+ * 2. Otherwise no safe assumptions can be made;
+ * complain louder and bail out.
+ */
+ for (i = 0; i != nd; ++i) {
+ memset(&list[ns].info, 0, sizeof(list[ns].info));
+ list[ns].bond_info = NULL;
+ list[ns].max_port = 1;
+ list[ns].phys_port = 1;
+ list[ns].phys_dev = ibv_match[i];
+ list[ns].eth_dev = NULL;
+ list[ns].pci_dev = pci_dev;
+ list[ns].pf_bond = -1;
+ list[ns].ifindex = 0;
+ if (nl_rdma >= 0)
+ list[ns].ifindex = mlx5_nl_ifindex
+ (nl_rdma,
+ mlx5_os_get_dev_device_name
+ (list[ns].phys_dev), 1);
+ if (!list[ns].ifindex) {
+ char ifname[IF_NAMESIZE];
+
+ /*
+ * Netlink failed, it may happen with old
+ * ib_core kernel driver (before 4.16).
+ * We can assume there is old driver because
+ * here we are processing single ports IB
+ * devices. Let's try sysfs to retrieve
+ * the ifindex. The method works for
+ * master device only.
+ */
+ if (nd > 1) {
+ /*
+ * Multiple devices found, assume
+ * representors, can not distinguish
+ * master/representor and retrieve
+ * ifindex via sysfs.
+ */
+ continue;
+ }
+ ret = mlx5_get_ifname_sysfs
+ (ibv_match[i]->ibdev_path, ifname);
+ if (!ret)
+ list[ns].ifindex =
+ if_nametoindex(ifname);
+ if (!list[ns].ifindex) {
+ /*
+ * No network interface index found
+ * for the specified device, it means
+ * there it is neither representor
+ * nor master.
+ */
+ continue;
+ }
+ }
+ ret = -1;
+ if (nl_route >= 0)
+ ret = mlx5_nl_switch_info
+ (nl_route,
+ list[ns].ifindex,
+ &list[ns].info);
+ if (ret || (!list[ns].info.representor &&
+ !list[ns].info.master)) {
+ /*
+ * We failed to recognize representors with
+ * Netlink, let's try to perform the task
+ * with sysfs.
+ */
+ ret = mlx5_sysfs_switch_info
+ (list[ns].ifindex,
+ &list[ns].info);
+ }
+ if (!ret && (list[ns].info.representor ^
+ list[ns].info.master)) {
+ ns++;
+ } else if ((nd == 1) &&
+ !list[ns].info.representor &&
+ !list[ns].info.master) {
+ /*
+ * Single IB device with
+ * one physical port and
+ * attached network device.
+ * May be SRIOV is not enabled
+ * or there is no representors.
+ */
+ DRV_LOG(INFO, "no E-Switch support detected");
+ ns++;
+ break;
+ }
+ }
+ if (!ns) {
+ DRV_LOG(ERR,
+ "unable to recognize master/representors"
+ " on the multiple IB devices");
+ rte_errno = ENOENT;
+ ret = -rte_errno;
+ goto exit;
+ }
+ /*
+ * New kernels may add the switch_id attribute for the case
+ * there is no E-Switch and we wrongly recognized the
+ * only device as master. Override this if there is the
+ * single device with single port and new device name
+ * format present.
+ */
+ if (nd == 1 &&
+ list[0].info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK) {
+ list[0].info.master = 0;
+ list[0].info.representor = 0;
+ }
+ }
+ MLX5_ASSERT(ns);
+ /*
+ * Sort list to probe devices in natural order for users convenience
+ * (i.e. master first, then representors from lowest to highest ID).
+ */
+ qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
+ /* Device specific configuration. */
+ switch (pci_dev->id.device_id) {
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
+ case PCI_DEVICE_ID_MELLANOX_CONNECTXVF:
+ dev_config_vf = 1;
+ break;
+ default:
+ dev_config_vf = 0;
+ break;
+ }
+ if (eth_da.type != RTE_ETH_REPRESENTOR_NONE) {
+ /* Set devargs default values. */
+ if (eth_da.nb_mh_controllers == 0) {
+ eth_da.nb_mh_controllers = 1;
+ eth_da.mh_controllers[0] = 0;
+ }
+ if (eth_da.nb_ports == 0 && ns > 0) {
+ if (list[0].pf_bond >= 0 && list[0].info.representor)
+ DRV_LOG(WARNING, "Representor on Bonding device should use pf#vf# syntax: %s",
+ pci_dev->device.devargs->args);
+ eth_da.nb_ports = 1;
+ eth_da.ports[0] = list[0].info.pf_num;
+ }
+ if (eth_da.nb_representor_ports == 0) {
+ eth_da.nb_representor_ports = 1;
+ eth_da.representor_ports[0] = 0;
+ }
+ }
+ for (i = 0; i != ns; ++i) {
+ uint32_t restore;
+
+ /* Default configuration. */
+ mlx5_os_config_default(&dev_config);
+ dev_config.vf = dev_config_vf;
+ list[i].numa_node = pci_dev->device.numa_node;
+ list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
+ &list[i],
+ &dev_config,
+ ð_da);
+ if (!list[i].eth_dev) {
+ if (rte_errno != EBUSY && rte_errno != EEXIST)
+ break;
+ /* Device is disabled or already spawned. Ignore it. */
+ continue;
+ }
+ restore = list[i].eth_dev->data->dev_flags;
+ rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
+ /**
+ * Each representor has a dedicated interrupts vector.
+ * rte_eth_copy_pci_info() assigns PF interrupts handle to
+ * representor eth_dev object because representor and PF
+ * share the same PCI address.
+ * Override representor device with a dedicated
+ * interrupts handle here.
+ * Representor interrupts handle is released in mlx5_dev_stop().
+ */
+ if (list[i].info.representor) {
+ struct rte_intr_handle *intr_handle;
+ intr_handle = mlx5_malloc(MLX5_MEM_SYS | MLX5_MEM_ZERO,
+ sizeof(*intr_handle), 0,
+ SOCKET_ID_ANY);
+ if (!intr_handle) {
+ DRV_LOG(ERR,
+ "port %u failed to allocate memory for interrupt handler "
+ "Rx interrupts will not be supported",
+ i);
+ rte_errno = ENOMEM;
+ ret = -rte_errno;
+ goto exit;
+ }
+ list[i].eth_dev->intr_handle = intr_handle;
+ }
+ /* Restore non-PCI flags cleared by the above call. */
+ list[i].eth_dev->data->dev_flags |= restore;
+ rte_eth_dev_probing_finish(list[i].eth_dev);
+ }
+ if (i != ns) {
+ DRV_LOG(ERR,
+ "probe of PCI device " PCI_PRI_FMT " aborted after"
+ " encountering an error: %s",
+ owner_pci.domain, owner_pci.bus,
+ owner_pci.devid, owner_pci.function,
+ strerror(rte_errno));
+ ret = -rte_errno;
+ /* Roll back. */
+ while (i--) {
+ if (!list[i].eth_dev)
+ continue;
+ mlx5_dev_close(list[i].eth_dev);
+ /* mac_addrs must not be freed because in dev_private */
+ list[i].eth_dev->data->mac_addrs = NULL;
+ claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
+ }
+ /* Restore original error. */
+ rte_errno = -ret;
+ } else {
+ ret = 0;
+ }
+exit:
+ /*
+ * Do the routine cleanup:
+ * - close opened Netlink sockets
+ * - free allocated spawn data array
+ * - free the Infiniband device list
+ */
+ if (nl_rdma >= 0)
+ close(nl_rdma);
+ if (nl_route >= 0)
+ close(nl_route);
+ if (list)
+ mlx5_free(list);
+ MLX5_ASSERT(ibv_list);
+ mlx5_glue->free_device_list(ibv_list);
+ return ret;
+}
+
+static int
+mlx5_os_parse_eth_devargs(struct rte_device *dev,
+ struct rte_eth_devargs *eth_da)
+{
+ int ret = 0;
+
+ if (dev->devargs == NULL)
+ return 0;
+ memset(eth_da, 0, sizeof(*eth_da));
+ /* Parse representor information first from class argument. */
+ if (dev->devargs->cls_str)
+ ret = rte_eth_devargs_parse(dev->devargs->cls_str, eth_da);
+ if (ret != 0) {
+ DRV_LOG(ERR, "failed to parse device arguments: %s",
+ dev->devargs->cls_str);
+ return -rte_errno;
+ }
+ if (eth_da->type == RTE_ETH_REPRESENTOR_NONE) {
+ /* Parse legacy device argument */
+ ret = rte_eth_devargs_parse(dev->devargs->args, eth_da);
+ if (ret) {
+ DRV_LOG(ERR, "failed to parse device arguments: %s",
+ dev->devargs->args);
+ return -rte_errno;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Callback to register a PCI device.
+ *
+ * This function spawns Ethernet devices out of a given PCI device.
+ *
+ * @param[in] pci_dev
+ * PCI device information.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_os_pci_probe(struct rte_pci_device *pci_dev)
+{
+ struct rte_eth_devargs eth_da = { .nb_ports = 0 };
+ int ret = 0;
+ uint16_t p;
+
+ ret = mlx5_os_parse_eth_devargs(&pci_dev->device, ð_da);
+ if (ret != 0)
+ return ret;
+
+ if (eth_da.nb_ports > 0) {
+ /* Iterate all port if devargs pf is range: "pf[0-1]vf[...]". */
+ for (p = 0; p < eth_da.nb_ports; p++) {
+ ret = mlx5_os_pci_probe_pf(pci_dev, ð_da,
+ eth_da.ports[p]);
+ if (ret)
+ break;
+ }
+ if (ret) {
+ DRV_LOG(ERR, "Probe of PCI device " PCI_PRI_FMT " "
+ "aborted due to proding failure of PF %u",
+ pci_dev->addr.domain, pci_dev->addr.bus,
+ pci_dev->addr.devid, pci_dev->addr.function,
+ eth_da.ports[p]);
+ mlx5_net_remove(&pci_dev->device);
+ }
+ } else {
+ ret = mlx5_os_pci_probe_pf(pci_dev, ð_da, 0);
+ }
+ return ret;
+}
+
+/* Probe a single SF device on auxiliary bus, no representor support. */
+static int
+mlx5_os_auxiliary_probe(struct rte_device *dev)
+{
+ struct rte_eth_devargs eth_da = { .nb_ports = 0 };
+ struct mlx5_dev_config config;
+ struct mlx5_dev_spawn_data spawn = { .pf_bond = -1 };
+ struct rte_auxiliary_device *adev = RTE_DEV_TO_AUXILIARY(dev);
+ struct rte_eth_dev *eth_dev;
+ int ret = 0;
+
+ /* Parse ethdev devargs. */
+ ret = mlx5_os_parse_eth_devargs(dev, ð_da);
+ if (ret != 0)
+ return ret;
+ /* Set default config data. */
+ mlx5_os_config_default(&config);
+ config.sf = 1;
+ /* Init spawn data. */
+ spawn.max_port = 1;
+ spawn.phys_port = 1;
+ spawn.phys_dev = mlx5_os_get_ibv_dev(dev);
+ if (spawn.phys_dev == NULL)
+ return -rte_errno;
+ ret = mlx5_auxiliary_get_ifindex(dev->name);
+ if (ret < 0) {
+ DRV_LOG(ERR, "failed to get ethdev ifindex: %s", dev->name);
+ return ret;
+ }
+ spawn.ifindex = ret;
+ spawn.numa_node = dev->numa_node;
+ /* Spawn device. */
+ eth_dev = mlx5_dev_spawn(dev, &spawn, &config, ð_da);
+ if (eth_dev == NULL)
+ return -rte_errno;
+ /* Post create. */
+ eth_dev->intr_handle = &adev->intr_handle;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC;
+ eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV;
+ eth_dev->data->numa_node = dev->numa_node;
+ }
+ rte_eth_dev_probing_finish(eth_dev);
+ return 0;
+}
+
+/**
+ * Net class driver callback to probe a device.
+ *
+ * This function probe PCI bus device(s) or a single SF on auxiliary bus.
+ *
+ * @param[in] dev
+ * Pointer to the generic device.
+ *
+ * @return
+ * 0 on success, the function cannot fail.
+ */
+int
+mlx5_os_net_probe(struct rte_device *dev)
+{
+ int ret;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ mlx5_pmd_socket_init();
+ ret = mlx5_init_once();
+ if (ret) {
+ DRV_LOG(ERR, "unable to init PMD global data: %s",
+ strerror(rte_errno));
+ return -rte_errno;
+ }
+ if (mlx5_dev_is_pci(dev))
+ return mlx5_os_pci_probe(RTE_DEV_TO_PCI(dev));
+ else
+ return mlx5_os_auxiliary_probe(dev);
+}
+
+static int
+mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
+{
+ char *env;
+ int value;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /* Get environment variable to store. */
+ env = getenv(MLX5_SHUT_UP_BF);
+ value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
+ if (config->dbnc == MLX5_ARG_UNSET)
+ setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
+ else
+ setenv(MLX5_SHUT_UP_BF,
+ config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
+ return value;
+}
+
+static void
+mlx5_restore_doorbell_mapping_env(int value)
+{
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ /* Restore the original environment variable state. */
+ if (value == MLX5_ARG_UNSET)
+ unsetenv(MLX5_SHUT_UP_BF);
+ else
+ setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
+}
+
+/**
+ * Extract pdn of PD object using DV API.
+ *
+ * @param[in] pd
+ * Pointer to the verbs PD object.
+ * @param[out] pdn
+ * Pointer to the PD object number variable.
+ *
+ * @return
+ * 0 on success, error value otherwise.
+ */
+int
+mlx5_os_get_pdn(void *pd, uint32_t *pdn)
+{
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ struct mlx5dv_obj obj;
+ struct mlx5dv_pd pd_info;
+ int ret = 0;
+
+ obj.pd.in = pd;
+ obj.pd.out = &pd_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
+ if (ret) {
+ DRV_LOG(DEBUG, "Fail to get PD object info");
+ return ret;
+ }
+ *pdn = pd_info.pdn;
+ return 0;
+#else
+ (void)pd;
+ (void)pdn;
+ return -ENOTSUP;
+#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
+}
+
+/**
+ * Function API to open IB device.
+ *
+ * This function calls the Linux glue APIs to open a device.
+ *
+ * @param[in] spawn
+ * Pointer to the IB device attributes (name, port, etc).
+ * @param[out] config
+ * Pointer to device configuration structure.
+ * @param[out] sh
+ * Pointer to shared context structure.
+ *
+ * @return
+ * 0 on success, a positive error value otherwise.
+ */
+int
+mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
+ const struct mlx5_dev_config *config,
+ struct mlx5_dev_ctx_shared *sh)
+{
+ int dbmap_env;
+ int err = 0;
+
+ pthread_mutex_init(&sh->txpp.mutex, NULL);
+ /*
+ * Configure environment variable "MLX5_BF_SHUT_UP"
+ * before the device creation. The rdma_core library
+ * checks the variable at device creation and
+ * stores the result internally.
+ */
+ dbmap_env = mlx5_config_doorbell_mapping_env(config);
+ /* Try to open IB device with DV first, then usual Verbs. */
+ errno = 0;
+ sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
+ if (sh->ctx) {
+ sh->devx = 1;
+ DRV_LOG(DEBUG, "DevX is supported");
+ /* The device is created, no need for environment. */
+ mlx5_restore_doorbell_mapping_env(dbmap_env);
+ } else {
+ /* The environment variable is still configured. */
+ sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
+ err = errno ? errno : ENODEV;
+ /*
+ * The environment variable is not needed anymore,
+ * all device creation attempts are completed.
+ */
+ mlx5_restore_doorbell_mapping_env(dbmap_env);
+ if (!sh->ctx)
+ return err;
+ DRV_LOG(DEBUG, "DevX is NOT supported");
+ err = 0;
+ }
+ if (!err && sh->ctx) {
+ /* Hint libmlx5 to use PMD allocator for data plane resources */
+ mlx5_glue->dv_set_context_attr(sh->ctx,
+ MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
+ (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
+ .alloc = &mlx5_alloc_verbs_buf,
+ .free = &mlx5_free_verbs_buf,
+ .data = sh,
+ }));
+ }
+ return err;
+}
+
+/**
+ * Install shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of single IB device.
+ *
+ * @param sh
+ * Pointer to mlx5_dev_ctx_shared object.
+ */
+void
+mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
+{
+ int ret;
+ int flags;
+
+ sh->intr_handle.fd = -1;
+ flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
+ ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
+ F_SETFL, flags | O_NONBLOCK);
+ if (ret) {
+ DRV_LOG(INFO, "failed to change file descriptor async event"
+ " queue");
+ } else {
+ sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
+ sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
+ if (rte_intr_callback_register(&sh->intr_handle,
+ mlx5_dev_interrupt_handler, sh)) {
+ DRV_LOG(INFO, "Fail to install the shared interrupt.");
+ sh->intr_handle.fd = -1;
+ }
+ }
+ if (sh->devx) {
+#ifdef HAVE_IBV_DEVX_ASYNC
+ sh->intr_handle_devx.fd = -1;
+ sh->devx_comp =
+ (void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
+ struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
+ if (!devx_comp) {
+ DRV_LOG(INFO, "failed to allocate devx_comp.");
+ return;
+ }
+ flags = fcntl(devx_comp->fd, F_GETFL);
+ ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
+ if (ret) {
+ DRV_LOG(INFO, "failed to change file descriptor"
+ " devx comp");
+ return;
+ }
+ sh->intr_handle_devx.fd = devx_comp->fd;
+ sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
+ if (rte_intr_callback_register(&sh->intr_handle_devx,
+ mlx5_dev_interrupt_handler_devx, sh)) {
+ DRV_LOG(INFO, "Fail to install the devx shared"
+ " interrupt.");
+ sh->intr_handle_devx.fd = -1;
+ }
+#endif /* HAVE_IBV_DEVX_ASYNC */
+ }
+}
+
+/**
+ * Uninstall shared asynchronous device events handler.
+ * This function is implemented to support event sharing
+ * between multiple ports of single IB device.
+ *
+ * @param dev
+ * Pointer to mlx5_dev_ctx_shared object.
+ */
+void
+mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
+{
+ if (sh->intr_handle.fd >= 0)
+ mlx5_intr_callback_unregister(&sh->intr_handle,
+ mlx5_dev_interrupt_handler, sh);
+#ifdef HAVE_IBV_DEVX_ASYNC
+ if (sh->intr_handle_devx.fd >= 0)
+ rte_intr_callback_unregister(&sh->intr_handle_devx,
+ mlx5_dev_interrupt_handler_devx, sh);
+ if (sh->devx_comp)
+ mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
+#endif
+}
+
+/**
+ * Read statistics by a named counter.
+ *
+ * @param[in] priv
+ * Pointer to the private device data structure.
+ * @param[in] ctr_name
+ * Pointer to the name of the statistic counter to read
+ * @param[out] stat
+ * Pointer to read statistic value.
+ * @return
+ * 0 on success and stat is valud, 1 if failed to read the value
+ * rte_errno is set.
+ *
+ */
+int
+mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
+ uint64_t *stat)
+{
+ int fd;
+
+ if (priv->sh) {
+ if (priv->q_counters != NULL &&
+ strcmp(ctr_name, "out_of_buffer") == 0)
+ return mlx5_devx_cmd_queue_counter_query
+ (priv->q_counters, 0, (uint32_t *)stat);
+ MKSTR(path, "%s/ports/%d/hw_counters/%s",
+ priv->sh->ibdev_path,
+ priv->dev_port,
+ ctr_name);
+ fd = open(path, O_RDONLY);
+ /*
+ * in switchdev the file location is not per port
+ * but rather in <ibdev_path>/hw_counters/<file_name>.
+ */
+ if (fd == -1) {
+ MKSTR(path1, "%s/hw_counters/%s",
+ priv->sh->ibdev_path,
+ ctr_name);
+ fd = open(path1, O_RDONLY);
+ }
+ if (fd != -1) {
+ char buf[21] = {'\0'};
+ ssize_t n = read(fd, buf, sizeof(buf));
+
+ close(fd);
+ if (n != -1) {
+ *stat = strtoull(buf, NULL, 10);
+ return 0;
+ }
+ }
+ }
+ *stat = 0;
+ return 1;
+}
+
+/**
+ * Set the reg_mr and dereg_mr call backs
+ *
+ * @param reg_mr_cb[out]
+ * Pointer to reg_mr func
+ * @param dereg_mr_cb[out]
+ * Pointer to dereg_mr func
+ *
+ */
+void
+mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
+ mlx5_dereg_mr_t *dereg_mr_cb)
+{
+ *reg_mr_cb = mlx5_mr_verbs_ops.reg_mr;
+ *dereg_mr_cb = mlx5_mr_verbs_ops.dereg_mr;
+}
+
+/**
+ * Remove a MAC address from device
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param index
+ * MAC address index.
+ */
+void
+mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int vf = priv->config.vf;
+
+ if (vf)
+ mlx5_nl_mac_addr_remove(priv->nl_socket_route,
+ mlx5_ifindex(dev), priv->mac_own,
+ &dev->data->mac_addrs[index], index);
+}
+
+/**
+ * Adds a MAC address to the device
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param mac_addr
+ * MAC address to register.
+ * @param index
+ * MAC address index.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise
+ */
+int
+mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
+ uint32_t index)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ const int vf = priv->config.vf;
+ int ret = 0;
+
+ if (vf)
+ ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
+ mlx5_ifindex(dev), priv->mac_own,
+ mac, index);
+ return ret;
+}
+
+/**
+ * Modify a VF MAC address
+ *
+ * @param priv
+ * Pointer to device private data.
+ * @param mac_addr
+ * MAC address to modify into.
+ * @param iface_idx
+ * Net device interface index
+ * @param vf_index
+ * VF index
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise
+ */
+int
+mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
+ unsigned int iface_idx,
+ struct rte_ether_addr *mac_addr,
+ int vf_index)
+{
+ return mlx5_nl_vf_mac_addr_modify
+ (priv->nl_socket_route, iface_idx, mac_addr, vf_index);
+}
+
+/**
+ * Set device promiscuous mode
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param enable
+ * 0 - promiscuous is disabled, otherwise - enabled
+ *
+ * @return
+ * 0 on success, a negative error value otherwise
+ */
+int
+mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ return mlx5_nl_promisc(priv->nl_socket_route,
+ mlx5_ifindex(dev), !!enable);
+}
+
+/**
+ * Set device promiscuous mode
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ * @param enable
+ * 0 - all multicase is disabled, otherwise - enabled
+ *
+ * @return
+ * 0 on success, a negative error value otherwise
+ */
+int
+mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ return mlx5_nl_allmulti(priv->nl_socket_route,
+ mlx5_ifindex(dev), !!enable);
+}
+
+/**
+ * Flush device MAC addresses
+ *
+ * @param dev
+ * Pointer to Ethernet device structure.
+ *
+ */
+void
+mlx5_os_mac_addr_flush(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ mlx5_nl_mac_addr_flush(priv->nl_socket_route, mlx5_ifindex(dev),
+ dev->data->mac_addrs,
+ MLX5_MAX_MAC_ADDRESSES, priv->mac_own);
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_os.h b/drivers/net/mlx5/freebsd/mlx5_os.h
new file mode 100644
index 0000000000..2991d37df2
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_os.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2015 6WIND S.A.
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_OS_H_
+#define RTE_PMD_MLX5_OS_H_
+
+#include <net/if.h>
+
+/* verb enumerations translations to local enums. */
+enum {
+ MLX5_FS_NAME_MAX = IBV_SYSFS_NAME_MAX + 1,
+ MLX5_FS_PATH_MAX = IBV_SYSFS_PATH_MAX + 1
+};
+
+/* Maximal data of sendmsg message(in bytes). */
+#define MLX5_SENDMSG_MAX 64
+
+#define MLX5_NAMESIZE IF_NAMESIZE
+
+int mlx5_auxiliary_get_ifindex(const char *sf_name);
+
+#endif /* RTE_PMD_MLX5_OS_H_ */
diff --git a/drivers/net/mlx5/freebsd/mlx5_socket.c b/drivers/net/mlx5/freebsd/mlx5_socket.c
new file mode 100644
index 0000000000..6356b66dc4
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_socket.c
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "rte_eal.h"
+#include "mlx5_utils.h"
+#include "mlx5.h"
+
+/* PMD socket service for tools. */
+
+#define MLX5_SOCKET_PATH "/var/tmp/dpdk_net_mlx5_%d"
+
+int server_socket; /* Unix socket for primary process. */
+struct rte_intr_handle server_intr_handle; /* Interrupt handler. */
+
+/**
+ * Handle server pmd socket interrupts.
+ */
+static void
+mlx5_pmd_socket_handle(void *cb __rte_unused)
+{
+ int conn_sock;
+ int ret;
+ struct cmsghdr *cmsg = NULL;
+ uint32_t data[MLX5_SENDMSG_MAX / sizeof(uint32_t)];
+ uint64_t flow_ptr = 0;
+ uint8_t buf[CMSG_SPACE(sizeof(int))] = { 0 };
+ struct iovec io = {
+ .iov_base = data,
+ .iov_len = sizeof(data),
+ };
+ struct msghdr msg = {
+ .msg_iov = &io,
+ .msg_iovlen = 1,
+ .msg_control = buf,
+ .msg_controllen = sizeof(buf),
+ };
+
+ uint32_t port_id;
+ int fd;
+ FILE *file = NULL;
+ struct rte_eth_dev *dev;
+ struct rte_flow_error err;
+ struct mlx5_flow_dump_req *dump_req;
+ struct mlx5_flow_dump_ack *dump_ack;
+
+ memset(data, 0, sizeof(data));
+ /* Accept the connection from the client. */
+ conn_sock = accept(server_socket, NULL, NULL);
+ if (conn_sock < 0) {
+ DRV_LOG(WARNING, "connection failed: %s", strerror(errno));
+ return;
+ }
+ ret = recvmsg(conn_sock, &msg, MSG_WAITALL);
+ if (ret != sizeof(struct mlx5_flow_dump_req)) {
+ DRV_LOG(WARNING, "wrong message received: %s",
+ strerror(errno));
+ goto error;
+ }
+
+ /* Receive file descriptor. */
+ cmsg = CMSG_FIRSTHDR(&msg);
+ if (cmsg == NULL || cmsg->cmsg_type != SCM_RIGHTS ||
+ cmsg->cmsg_len < sizeof(int)) {
+ DRV_LOG(WARNING, "invalid file descriptor message");
+ goto error;
+ }
+ memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+ file = fdopen(fd, "w");
+ if (!file) {
+ DRV_LOG(WARNING, "Failed to open file");
+ goto error;
+ }
+ /* Receive port number. */
+ if (msg.msg_iovlen != 1 || msg.msg_iov->iov_len < sizeof(uint16_t)) {
+ DRV_LOG(WARNING, "wrong port number message");
+ goto error;
+ }
+
+ dump_req = (struct mlx5_flow_dump_req *)msg.msg_iov->iov_base;
+ if (dump_req) {
+ port_id = dump_req->port_id;
+ flow_ptr = dump_req->flow_id;
+ } else {
+ DRV_LOG(WARNING, "Invalid message");
+ goto error;
+ }
+
+ if (!rte_eth_dev_is_valid_port(port_id)) {
+ DRV_LOG(WARNING, "Invalid port %u", port_id);
+ goto error;
+ }
+
+ /* Dump flow. */
+ dev = &rte_eth_devices[port_id];
+ if (flow_ptr == 0)
+ ret = mlx5_flow_dev_dump(dev, NULL, file, NULL);
+ else
+ ret = mlx5_flow_dev_dump(dev,
+ (struct rte_flow *)((uintptr_t)flow_ptr), file, &err);
+
+ /* Set-up the ancillary data and reply. */
+ msg.msg_controllen = 0;
+ msg.msg_control = NULL;
+ msg.msg_iovlen = 1;
+ msg.msg_iov = &io;
+ dump_ack = (struct mlx5_flow_dump_ack *)data;
+ dump_ack->rc = -ret;
+ io.iov_len = sizeof(struct mlx5_flow_dump_ack);
+ io.iov_base = dump_ack;
+ do {
+ ret = sendmsg(conn_sock, &msg, 0);
+ } while (ret < 0 && errno == EINTR);
+ if (ret < 0)
+ DRV_LOG(WARNING, "failed to send response %s",
+ strerror(errno));
+error:
+ if (conn_sock >= 0)
+ close(conn_sock);
+ if (file)
+ fclose(file);
+}
+
+/**
+ * Install interrupt handler.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @return
+ * 0 on success, a negative errno value otherwise.
+ */
+static int
+mlx5_pmd_interrupt_handler_install(void)
+{
+ MLX5_ASSERT(server_socket);
+ server_intr_handle.fd = server_socket;
+ server_intr_handle.type = RTE_INTR_HANDLE_EXT;
+ return rte_intr_callback_register(&server_intr_handle,
+ mlx5_pmd_socket_handle, NULL);
+}
+
+/**
+ * Uninstall interrupt handler.
+ */
+static void
+mlx5_pmd_interrupt_handler_uninstall(void)
+{
+ if (server_socket) {
+ mlx5_intr_callback_unregister(&server_intr_handle,
+ mlx5_pmd_socket_handle,
+ NULL);
+ }
+ server_intr_handle.fd = 0;
+ server_intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
+}
+
+/**
+ * Initialise the socket to communicate with the secondary process
+ *
+ * @param[in] dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, a negative value otherwise.
+ */
+int
+mlx5_pmd_socket_init(void)
+{
+ struct sockaddr_un sun = {
+ .sun_family = AF_UNIX,
+ };
+ int ret;
+ int flags;
+
+ MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
+ if (server_socket)
+ return 0;
+ /*
+ * Initialize the socket to communicate with the secondary
+ * process.
+ */
+ ret = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "Failed to open mlx5 socket: %s",
+ strerror(errno));
+ goto error;
+ }
+ server_socket = ret;
+ flags = fcntl(server_socket, F_GETFL, 0);
+ if (flags == -1)
+ goto error;
+ ret = fcntl(server_socket, F_SETFL, flags | O_NONBLOCK);
+ if (ret < 0)
+ goto error;
+ snprintf(sun.sun_path, sizeof(sun.sun_path), MLX5_SOCKET_PATH,
+ getpid());
+ remove(sun.sun_path);
+ ret = bind(server_socket, (const struct sockaddr *)&sun, sizeof(sun));
+ if (ret < 0) {
+ DRV_LOG(WARNING,
+ "cannot bind mlx5 socket: %s", strerror(errno));
+ goto close;
+ }
+ ret = listen(server_socket, 0);
+ if (ret < 0) {
+ DRV_LOG(WARNING, "cannot listen on mlx5 socket: %s",
+ strerror(errno));
+ goto close;
+ }
+ if (mlx5_pmd_interrupt_handler_install()) {
+ DRV_LOG(WARNING, "cannot register interrupt handler for mlx5 socket: %s",
+ strerror(errno));
+ goto close;
+ }
+ return 0;
+close:
+ remove(sun.sun_path);
+error:
+ claim_zero(close(server_socket));
+ server_socket = 0;
+ DRV_LOG(ERR, "Cannot initialize socket: %s", strerror(errno));
+ return -errno;
+}
+
+/**
+ * Un-Initialize the pmd socket
+ */
+RTE_FINI(mlx5_pmd_socket_uninit)
+{
+ if (!server_socket)
+ return;
+ mlx5_pmd_interrupt_handler_uninstall();
+ claim_zero(close(server_socket));
+ server_socket = 0;
+ MKSTR(path, MLX5_SOCKET_PATH, getpid());
+ claim_zero(remove(path));
+}
diff --git a/drivers/net/mlx5/freebsd/mlx5_verbs.c b/drivers/net/mlx5/freebsd/mlx5_verbs.c
new file mode 100644
index 0000000000..d4fa202ac4
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_verbs.c
@@ -0,0 +1,1208 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/queue.h>
+
+#include "mlx5_autoconf.h"
+
+#include <rte_mbuf.h>
+#include <rte_malloc.h>
+#include <ethdev_driver.h>
+#include <rte_common.h>
+
+#include <mlx5_glue.h>
+#include <mlx5_common.h>
+#include <mlx5_common_mr.h>
+#include <mlx5_verbs.h>
+#include <mlx5_rx.h>
+#include <mlx5_tx.h>
+#include <mlx5_utils.h>
+#include <mlx5_malloc.h>
+
+/**
+ * Register mr. Given protection domain pointer, pointer to addr and length
+ * register the memory region.
+ *
+ * @param[in] pd
+ * Pointer to protection domain context.
+ * @param[in] addr
+ * Pointer to memory start address.
+ * @param[in] length
+ * Length of the memory to register.
+ * @param[out] pmd_mr
+ * pmd_mr struct set with lkey, address, length and pointer to mr object
+ *
+ * @return
+ * 0 on successful registration, -1 otherwise
+ */
+static int
+mlx5_reg_mr(void *pd, void *addr, size_t length,
+ struct mlx5_pmd_mr *pmd_mr)
+{
+ return mlx5_common_verbs_reg_mr(pd, addr, length, pmd_mr);
+}
+
+/**
+ * Deregister mr. Given the mlx5 pmd MR - deregister the MR
+ *
+ * @param[in] pmd_mr
+ * pmd_mr struct set with lkey, address, length and pointer to mr object
+ *
+ */
+static void
+mlx5_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
+{
+ mlx5_common_verbs_dereg_mr(pmd_mr);
+}
+
+/* verbs operations. */
+const struct mlx5_mr_ops mlx5_mr_verbs_ops = {
+ .reg_mr = mlx5_reg_mr,
+ .dereg_mr = mlx5_dereg_mr,
+};
+
+/**
+ * Modify Rx WQ vlan stripping offload
+ *
+ * @param rxq_obj
+ * Rx queue object.
+ *
+ * @return 0 on success, non-0 otherwise
+ */
+static int
+mlx5_rxq_obj_modify_wq_vlan_strip(struct mlx5_rxq_obj *rxq_obj, int on)
+{
+ uint16_t vlan_offloads =
+ (on ? IBV_WQ_FLAGS_CVLAN_STRIPPING : 0) |
+ 0;
+ struct ibv_wq_attr mod;
+ mod = (struct ibv_wq_attr){
+ .attr_mask = IBV_WQ_ATTR_FLAGS,
+ .flags_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING,
+ .flags = vlan_offloads,
+ };
+
+ return mlx5_glue->modify_wq(rxq_obj->wq, &mod);
+}
+
+/**
+ * Modifies the attributes for the specified WQ.
+ *
+ * @param rxq_obj
+ * Verbs Rx queue object.
+ * @param type
+ * Type of change queue state.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_modify_wq(struct mlx5_rxq_obj *rxq_obj, uint8_t type)
+{
+ struct ibv_wq_attr mod = {
+ .attr_mask = IBV_WQ_ATTR_STATE,
+ .wq_state = (enum ibv_wq_state)type,
+ };
+
+ return mlx5_glue->modify_wq(rxq_obj->wq, &mod);
+}
+
+/**
+ * Modify QP using Verbs API.
+ *
+ * @param txq_obj
+ * Verbs Tx queue object.
+ * @param type
+ * Type of change queue state.
+ * @param dev_port
+ * IB device port number.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_modify_qp(struct mlx5_txq_obj *obj, enum mlx5_txq_modify_type type,
+ uint8_t dev_port)
+{
+ struct ibv_qp_attr mod = {
+ .qp_state = IBV_QPS_RESET,
+ .port_num = dev_port,
+ };
+ int attr_mask = (IBV_QP_STATE | IBV_QP_PORT);
+ int ret;
+
+ if (type != MLX5_TXQ_MOD_RST2RDY) {
+ ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to RESET %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ if (type == MLX5_TXQ_MOD_RDY2RST)
+ return 0;
+ }
+ if (type == MLX5_TXQ_MOD_ERR2RDY)
+ attr_mask = IBV_QP_STATE;
+ mod.qp_state = IBV_QPS_INIT;
+ ret = mlx5_glue->modify_qp(obj->qp, &mod, attr_mask);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ mod.qp_state = IBV_QPS_RTR;
+ ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ mod.qp_state = IBV_QPS_RTS;
+ ret = mlx5_glue->modify_qp(obj->qp, &mod, IBV_QP_STATE);
+ if (ret) {
+ DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s",
+ strerror(errno));
+ rte_errno = errno;
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * Create a CQ Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array.
+ *
+ * @return
+ * The Verbs CQ object initialized, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_cq *
+mlx5_rxq_ibv_cq_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_rxq_obj *rxq_obj = rxq_ctrl->obj;
+ unsigned int cqe_n = mlx5_rxq_cqe_num(rxq_data);
+ struct {
+ struct ibv_cq_init_attr_ex ibv;
+ struct mlx5dv_cq_init_attr mlx5;
+ } cq_attr;
+
+ cq_attr.ibv = (struct ibv_cq_init_attr_ex){
+ .cqe = cqe_n,
+ .channel = rxq_obj->ibv_channel,
+ .comp_mask = 0,
+ };
+ cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
+ .comp_mask = 0,
+ };
+ if (priv->config.cqe_comp && !rxq_data->hw_timestamp) {
+ cq_attr.mlx5.comp_mask |=
+ MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
+ rxq_data->byte_mask = UINT32_MAX;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ if (mlx5_rxq_mprq_enabled(rxq_data)) {
+ cq_attr.mlx5.cqe_comp_res_format =
+ MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX;
+ rxq_data->mcqe_format =
+ MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+ } else {
+ cq_attr.mlx5.cqe_comp_res_format =
+ MLX5DV_CQE_RES_FORMAT_HASH;
+ rxq_data->mcqe_format =
+ MLX5_CQE_RESP_FORMAT_HASH;
+ }
+#else
+ cq_attr.mlx5.cqe_comp_res_format = MLX5DV_CQE_RES_FORMAT_HASH;
+ rxq_data->mcqe_format = MLX5_CQE_RESP_FORMAT_HASH;
+#endif
+ /*
+ * For vectorized Rx, it must not be doubled in order to
+ * make cq_ci and rq_ci aligned.
+ */
+ if (mlx5_rxq_check_vec_support(rxq_data) < 0)
+ cq_attr.ibv.cqe *= 2;
+ } else if (priv->config.cqe_comp && rxq_data->hw_timestamp) {
+ DRV_LOG(DEBUG,
+ "Port %u Rx CQE compression is disabled for HW"
+ " timestamp.",
+ dev->data->port_id);
+ }
+#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
+ if (RTE_CACHE_LINE_SIZE == 128) {
+ cq_attr.mlx5.comp_mask |= MLX5DV_CQ_INIT_ATTR_MASK_FLAGS;
+ cq_attr.mlx5.flags |= MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD;
+ }
+#endif
+ return mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(priv->sh->ctx,
+ &cq_attr.ibv,
+ &cq_attr.mlx5));
+}
+
+/**
+ * Create a WQ Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array.
+ *
+ * @return
+ * The Verbs WQ object initialized, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_wq *
+mlx5_rxq_ibv_wq_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_rxq_obj *rxq_obj = rxq_ctrl->obj;
+ unsigned int wqe_n = 1 << rxq_data->elts_n;
+ struct {
+ struct ibv_wq_init_attr ibv;
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ struct mlx5dv_wq_init_attr mlx5;
+#endif
+ } wq_attr;
+
+ wq_attr.ibv = (struct ibv_wq_init_attr){
+ .wq_context = NULL, /* Could be useful in the future. */
+ .wq_type = IBV_WQT_RQ,
+ /* Max number of outstanding WRs. */
+ .max_wr = wqe_n >> rxq_data->sges_n,
+ /* Max number of scatter/gather elements in a WR. */
+ .max_sge = 1 << rxq_data->sges_n,
+ .pd = priv->sh->pd,
+ .cq = rxq_obj->ibv_cq,
+ .comp_mask = IBV_WQ_FLAGS_CVLAN_STRIPPING | 0,
+ .create_flags = (rxq_data->vlan_strip ?
+ IBV_WQ_FLAGS_CVLAN_STRIPPING : 0),
+ };
+ /* By default, FCS (CRC) is stripped by hardware. */
+ if (rxq_data->crc_present) {
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+ }
+ if (priv->config.hw_padding) {
+#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
+ wq_attr.ibv.create_flags |= IBV_WQ_FLAGS_PCI_WRITE_END_PADDING;
+ wq_attr.ibv.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
+#endif
+ }
+#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
+ wq_attr.mlx5 = (struct mlx5dv_wq_init_attr){
+ .comp_mask = 0,
+ };
+ if (mlx5_rxq_mprq_enabled(rxq_data)) {
+ struct mlx5dv_striding_rq_init_attr *mprq_attr =
+ &wq_attr.mlx5.striding_rq_attrs;
+
+ wq_attr.mlx5.comp_mask |= MLX5DV_WQ_INIT_ATTR_MASK_STRIDING_RQ;
+ *mprq_attr = (struct mlx5dv_striding_rq_init_attr){
+ .single_stride_log_num_of_bytes = rxq_data->strd_sz_n,
+ .single_wqe_log_num_of_strides = rxq_data->strd_num_n,
+ .two_byte_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT,
+ };
+ }
+ rxq_obj->wq = mlx5_glue->dv_create_wq(priv->sh->ctx, &wq_attr.ibv,
+ &wq_attr.mlx5);
+#else
+ rxq_obj->wq = mlx5_glue->create_wq(priv->sh->ctx, &wq_attr.ibv);
+#endif
+ if (rxq_obj->wq) {
+ /*
+ * Make sure number of WRs*SGEs match expectations since a queue
+ * cannot allocate more than "desc" buffers.
+ */
+ if (wq_attr.ibv.max_wr != (wqe_n >> rxq_data->sges_n) ||
+ wq_attr.ibv.max_sge != (1u << rxq_data->sges_n)) {
+ DRV_LOG(ERR,
+ "Port %u Rx queue %u requested %u*%u but got"
+ " %u*%u WRs*SGEs.",
+ dev->data->port_id, idx,
+ wqe_n >> rxq_data->sges_n,
+ (1 << rxq_data->sges_n),
+ wq_attr.ibv.max_wr, wq_attr.ibv.max_sge);
+ claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+ rxq_obj->wq = NULL;
+ rte_errno = EINVAL;
+ }
+ }
+ return rxq_obj->wq;
+}
+
+/**
+ * Create the Rx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Rx queue array.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rxq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
+ struct mlx5_rxq_obj *tmpl = rxq_ctrl->obj;
+ struct mlx5dv_cq cq_info;
+ struct mlx5dv_rwq rwq;
+ int ret = 0;
+ struct mlx5dv_obj obj;
+
+ MLX5_ASSERT(rxq_data);
+ MLX5_ASSERT(tmpl);
+ tmpl->rxq_ctrl = rxq_ctrl;
+ if (rxq_ctrl->irq) {
+ tmpl->ibv_channel =
+ mlx5_glue->create_comp_channel(priv->sh->ctx);
+ if (!tmpl->ibv_channel) {
+ DRV_LOG(ERR, "Port %u: comp channel creation failure.",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ tmpl->fd = ((struct ibv_comp_channel *)(tmpl->ibv_channel))->fd;
+ }
+ /* Create CQ using Verbs API. */
+ tmpl->ibv_cq = mlx5_rxq_ibv_cq_create(dev, idx);
+ if (!tmpl->ibv_cq) {
+ DRV_LOG(ERR, "Port %u Rx queue %u CQ creation failure.",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ obj.cq.in = tmpl->ibv_cq;
+ obj.cq.out = &cq_info;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ);
+ if (ret) {
+ rte_errno = ret;
+ goto error;
+ }
+ if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+ DRV_LOG(ERR,
+ "Port %u wrong MLX5_CQE_SIZE environment "
+ "variable value: it should be set to %u.",
+ dev->data->port_id, RTE_CACHE_LINE_SIZE);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ /* Fill the rings. */
+ rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
+ rxq_data->cq_db = cq_info.dbrec;
+ rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+ rxq_data->cq_uar = cq_info.cq_uar;
+ rxq_data->cqn = cq_info.cqn;
+ /* Create WQ (RQ) using Verbs API. */
+ tmpl->wq = mlx5_rxq_ibv_wq_create(dev, idx);
+ if (!tmpl->wq) {
+ DRV_LOG(ERR, "Port %u Rx queue %u WQ creation failure.",
+ dev->data->port_id, idx);
+ rte_errno = ENOMEM;
+ goto error;
+ }
+ /* Change queue state to ready. */
+ ret = mlx5_ibv_modify_wq(tmpl, IBV_WQS_RDY);
+ if (ret) {
+ DRV_LOG(ERR,
+ "Port %u Rx queue %u WQ state to IBV_WQS_RDY failed.",
+ dev->data->port_id, idx);
+ rte_errno = ret;
+ goto error;
+ }
+ obj.rwq.in = tmpl->wq;
+ obj.rwq.out = &rwq;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_RWQ);
+ if (ret) {
+ rte_errno = ret;
+ goto error;
+ }
+ rxq_data->wqes = rwq.buf;
+ rxq_data->rq_db = rwq.dbrec;
+ rxq_data->cq_arm_sn = 0;
+ mlx5_rxq_initialize(rxq_data);
+ rxq_data->cq_ci = 0;
+ dev->data->rx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+ rxq_ctrl->wqn = ((struct ibv_wq *)(tmpl->wq))->wq_num;
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (tmpl->wq)
+ claim_zero(mlx5_glue->destroy_wq(tmpl->wq));
+ if (tmpl->ibv_cq)
+ claim_zero(mlx5_glue->destroy_cq(tmpl->ibv_cq));
+ if (tmpl->ibv_channel)
+ claim_zero(mlx5_glue->destroy_comp_channel(tmpl->ibv_channel));
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Release an Rx verbs queue object.
+ *
+ * @param rxq_obj
+ * Verbs Rx queue object.
+ */
+static void
+mlx5_rxq_ibv_obj_release(struct mlx5_rxq_obj *rxq_obj)
+{
+ MLX5_ASSERT(rxq_obj);
+ MLX5_ASSERT(rxq_obj->wq);
+ MLX5_ASSERT(rxq_obj->ibv_cq);
+ claim_zero(mlx5_glue->destroy_wq(rxq_obj->wq));
+ claim_zero(mlx5_glue->destroy_cq(rxq_obj->ibv_cq));
+ if (rxq_obj->ibv_channel)
+ claim_zero(mlx5_glue->destroy_comp_channel
+ (rxq_obj->ibv_channel));
+}
+
+/**
+ * Get event for an Rx verbs queue object.
+ *
+ * @param rxq_obj
+ * Verbs Rx queue object.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rx_ibv_get_event(struct mlx5_rxq_obj *rxq_obj)
+{
+ struct ibv_cq *ev_cq;
+ void *ev_ctx;
+ int ret = mlx5_glue->get_cq_event(rxq_obj->ibv_channel,
+ &ev_cq, &ev_ctx);
+
+ if (ret < 0 || ev_cq != rxq_obj->ibv_cq)
+ goto exit;
+ mlx5_glue->ack_cq_events(rxq_obj->ibv_cq, 1);
+ return 0;
+exit:
+ if (ret < 0)
+ rte_errno = errno;
+ else
+ rte_errno = EINVAL;
+ return -rte_errno;
+}
+
+/**
+ * Creates a receive work queue as a filed of indirection table.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param log_n
+ * Log of number of queues in the array.
+ * @param ind_tbl
+ * Verbs indirection table object.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_ind_table_new(struct rte_eth_dev *dev, const unsigned int log_n,
+ struct mlx5_ind_table_obj *ind_tbl)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_wq *wq[1 << log_n];
+ unsigned int i, j;
+
+ MLX5_ASSERT(ind_tbl);
+ for (i = 0; i != ind_tbl->queues_n; ++i) {
+ struct mlx5_rxq_data *rxq = (*priv->rxqs)[ind_tbl->queues[i]];
+ struct mlx5_rxq_ctrl *rxq_ctrl =
+ container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+
+ wq[i] = rxq_ctrl->obj->wq;
+ }
+ MLX5_ASSERT(i > 0);
+ /* Finalise indirection table. */
+ for (j = 0; i != (unsigned int)(1 << log_n); ++j, ++i)
+ wq[i] = wq[j];
+ ind_tbl->ind_table = mlx5_glue->create_rwq_ind_table(priv->sh->ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = log_n,
+ .ind_tbl = wq,
+ .comp_mask = 0,
+ });
+ if (!ind_tbl->ind_table) {
+ rte_errno = errno;
+ return -rte_errno;
+ }
+ return 0;
+}
+
+/**
+ * Destroys the specified Indirection Table.
+ *
+ * @param ind_table
+ * Indirection table to release.
+ */
+static void
+mlx5_ibv_ind_table_destroy(struct mlx5_ind_table_obj *ind_tbl)
+{
+ claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl->ind_table));
+}
+
+/**
+ * Create an Rx Hash queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param hrxq
+ * Pointer to Rx Hash queue.
+ * @param tunnel
+ * Tunnel type.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_hrxq_new(struct rte_eth_dev *dev, struct mlx5_hrxq *hrxq,
+ int tunnel __rte_unused)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_qp *qp = NULL;
+ struct mlx5_ind_table_obj *ind_tbl = hrxq->ind_table;
+ const uint8_t *rss_key = hrxq->rss_key;
+ uint64_t hash_fields = hrxq->hash_fields;
+ int err;
+#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
+ struct mlx5dv_qp_init_attr qp_init_attr;
+
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ if (tunnel) {
+ qp_init_attr.comp_mask =
+ MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+ qp_init_attr.create_flags = MLX5DV_QP_CREATE_TUNNEL_OFFLOADS;
+ }
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ if (dev->data->dev_conf.lpbk_mode) {
+ /* Allow packet sent from NIC loop back w/o source MAC check. */
+ qp_init_attr.comp_mask |=
+ MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+ qp_init_attr.create_flags |=
+ MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
+ }
+#endif
+ qp = mlx5_glue->dv_create_qp
+ (priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask =
+ IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function =
+ IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = hrxq->rss_key_len,
+ .rx_hash_key =
+ (void *)(uintptr_t)rss_key,
+ .rx_hash_fields_mask = hash_fields,
+ },
+ .rwq_ind_tbl = ind_tbl->ind_table,
+ .pd = priv->sh->pd,
+ },
+ &qp_init_attr);
+#else
+ qp = mlx5_glue->create_qp_ex
+ (priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask =
+ IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function =
+ IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = hrxq->rss_key_len,
+ .rx_hash_key =
+ (void *)(uintptr_t)rss_key,
+ .rx_hash_fields_mask = hash_fields,
+ },
+ .rwq_ind_tbl = ind_tbl->ind_table,
+ .pd = priv->sh->pd,
+ });
+#endif
+ if (!qp) {
+ rte_errno = errno;
+ goto error;
+ }
+ hrxq->qp = qp;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+ if (!hrxq->action) {
+ rte_errno = errno;
+ goto error;
+ }
+#endif
+ return 0;
+error:
+ err = rte_errno; /* Save rte_errno before cleanup. */
+ if (qp)
+ claim_zero(mlx5_glue->destroy_qp(qp));
+ rte_errno = err; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/**
+ * Destroy a Verbs queue pair.
+ *
+ * @param hrxq
+ * Hash Rx queue to release its qp.
+ */
+static void
+mlx5_ibv_qp_destroy(struct mlx5_hrxq *hrxq)
+{
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+}
+
+/**
+ * Release a drop Rx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_rxq_ibv_obj_drop_release(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq;
+
+ if (rxq->wq)
+ claim_zero(mlx5_glue->destroy_wq(rxq->wq));
+ if (rxq->ibv_cq)
+ claim_zero(mlx5_glue->destroy_cq(rxq->ibv_cq));
+ mlx5_free(rxq);
+ priv->drop_queue.rxq = NULL;
+}
+
+/**
+ * Create a drop Rx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_rxq_ibv_obj_drop_create(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct ibv_context *ctx = priv->sh->ctx;
+ struct mlx5_rxq_obj *rxq = priv->drop_queue.rxq;
+
+ if (rxq)
+ return 0;
+ rxq = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*rxq), 0, SOCKET_ID_ANY);
+ if (!rxq) {
+ DRV_LOG(DEBUG, "Port %u cannot allocate drop Rx queue memory.",
+ dev->data->port_id);
+ rte_errno = ENOMEM;
+ return -rte_errno;
+ }
+ priv->drop_queue.rxq = rxq;
+ rxq->ibv_cq = mlx5_glue->create_cq(ctx, 1, NULL, NULL, 0);
+ if (!rxq->ibv_cq) {
+ DRV_LOG(DEBUG, "Port %u cannot allocate CQ for drop queue.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ rxq->wq = mlx5_glue->create_wq(ctx, &(struct ibv_wq_init_attr){
+ .wq_type = IBV_WQT_RQ,
+ .max_wr = 1,
+ .max_sge = 1,
+ .pd = priv->sh->pd,
+ .cq = rxq->ibv_cq,
+ });
+ if (!rxq->wq) {
+ DRV_LOG(DEBUG, "Port %u cannot allocate WQ for drop queue.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ priv->drop_queue.rxq = rxq;
+ return 0;
+error:
+ mlx5_rxq_ibv_obj_drop_release(dev);
+ return -rte_errno;
+}
+
+/**
+ * Create a Verbs drop action for Rx Hash queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_ibv_drop_action_create(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
+ struct ibv_rwq_ind_table *ind_tbl = NULL;
+ struct mlx5_rxq_obj *rxq;
+ int ret;
+
+ MLX5_ASSERT(hrxq && hrxq->ind_table);
+ ret = mlx5_rxq_ibv_obj_drop_create(dev);
+ if (ret < 0)
+ goto error;
+ rxq = priv->drop_queue.rxq;
+ ind_tbl = mlx5_glue->create_rwq_ind_table
+ (priv->sh->ctx,
+ &(struct ibv_rwq_ind_table_init_attr){
+ .log_ind_tbl_size = 0,
+ .ind_tbl = (struct ibv_wq **)&rxq->wq,
+ .comp_mask = 0,
+ });
+ if (!ind_tbl) {
+ DRV_LOG(DEBUG, "Port %u"
+ " cannot allocate indirection table for drop queue.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ hrxq->qp = mlx5_glue->create_qp_ex(priv->sh->ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask = IBV_QP_INIT_ATTR_PD |
+ IBV_QP_INIT_ATTR_IND_TABLE |
+ IBV_QP_INIT_ATTR_RX_HASH,
+ .rx_hash_conf = (struct ibv_rx_hash_conf){
+ .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
+ .rx_hash_key_len = MLX5_RSS_HASH_KEY_LEN,
+ .rx_hash_key = rss_hash_default_key,
+ .rx_hash_fields_mask = 0,
+ },
+ .rwq_ind_tbl = ind_tbl,
+ .pd = priv->sh->pd
+ });
+ if (!hrxq->qp) {
+ DRV_LOG(DEBUG, "Port %u cannot allocate QP for drop queue.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ hrxq->action = mlx5_glue->dv_create_flow_action_dest_ibv_qp(hrxq->qp);
+ if (!hrxq->action) {
+ rte_errno = errno;
+ goto error;
+ }
+#endif
+ hrxq->ind_table->ind_table = ind_tbl;
+ return 0;
+error:
+ if (hrxq->qp)
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+ if (ind_tbl)
+ claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl));
+ if (priv->drop_queue.rxq)
+ mlx5_rxq_ibv_obj_drop_release(dev);
+ return -rte_errno;
+}
+
+/**
+ * Release a drop hash Rx queue.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+static void
+mlx5_ibv_drop_action_destroy(struct rte_eth_dev *dev)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_hrxq *hrxq = priv->drop_queue.hrxq;
+ struct ibv_rwq_ind_table *ind_tbl = hrxq->ind_table->ind_table;
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ claim_zero(mlx5_glue->destroy_flow_action(hrxq->action));
+#endif
+ claim_zero(mlx5_glue->destroy_qp(hrxq->qp));
+ claim_zero(mlx5_glue->destroy_rwq_ind_table(ind_tbl));
+ mlx5_rxq_ibv_obj_drop_release(dev);
+}
+
+/**
+ * Create a QP Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array.
+ *
+ * @return
+ * The QP Verbs object, NULL otherwise and rte_errno is set.
+ */
+static struct ibv_qp *
+mlx5_txq_ibv_qp_create(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq_data, struct mlx5_txq_ctrl, txq);
+ struct ibv_qp *qp_obj = NULL;
+ struct ibv_qp_init_attr_ex qp_attr = { 0 };
+ const int desc = 1 << txq_data->elts_n;
+
+ MLX5_ASSERT(txq_ctrl->obj->cq);
+ /* CQ to be associated with the send queue. */
+ qp_attr.send_cq = txq_ctrl->obj->cq;
+ /* CQ to be associated with the receive queue. */
+ qp_attr.recv_cq = txq_ctrl->obj->cq;
+ /* Max number of outstanding WRs. */
+ qp_attr.cap.max_send_wr = ((priv->sh->device_attr.max_qp_wr < desc) ?
+ priv->sh->device_attr.max_qp_wr : desc);
+ /*
+ * Max number of scatter/gather elements in a WR, must be 1 to prevent
+ * libmlx5 from trying to affect must be 1 to prevent libmlx5 from
+ * trying to affect too much memory. TX gather is not impacted by the
+ * device_attr.max_sge limit and will still work properly.
+ */
+ qp_attr.cap.max_send_sge = 1;
+ qp_attr.qp_type = IBV_QPT_RAW_PACKET,
+ /* Do *NOT* enable this, completions events are managed per Tx burst. */
+ qp_attr.sq_sig_all = 0;
+ qp_attr.pd = priv->sh->pd;
+ qp_attr.comp_mask = IBV_QP_INIT_ATTR_PD;
+ if (txq_data->inlen_send)
+ qp_attr.cap.max_inline_data = txq_ctrl->max_inline_data;
+ if (txq_data->tso_en) {
+ qp_attr.max_tso_header = txq_ctrl->max_tso_header;
+ qp_attr.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER;
+ }
+ qp_obj = mlx5_glue->create_qp_ex(priv->sh->ctx, &qp_attr);
+ if (qp_obj == NULL) {
+ DRV_LOG(ERR, "Port %u Tx queue %u QP creation failure.",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ }
+ return qp_obj;
+}
+
+/**
+ * Create the Tx queue Verbs object.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ * @param idx
+ * Queue index in DPDK Tx queue array.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_txq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx)
+{
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_txq_data *txq_data = (*priv->txqs)[idx];
+ struct mlx5_txq_ctrl *txq_ctrl =
+ container_of(txq_data, struct mlx5_txq_ctrl, txq);
+ struct mlx5_txq_obj *txq_obj = txq_ctrl->obj;
+ unsigned int cqe_n;
+ struct mlx5dv_qp qp;
+ struct mlx5dv_cq cq_info;
+ struct mlx5dv_obj obj;
+ const int desc = 1 << txq_data->elts_n;
+ int ret = 0;
+
+ MLX5_ASSERT(txq_data);
+ MLX5_ASSERT(txq_obj);
+ txq_obj->txq_ctrl = txq_ctrl;
+ if (mlx5_getenv_int("MLX5_ENABLE_CQE_COMPRESSION")) {
+ DRV_LOG(ERR, "Port %u MLX5_ENABLE_CQE_COMPRESSION "
+ "must never be set.", dev->data->port_id);
+ rte_errno = EINVAL;
+ return -rte_errno;
+ }
+ cqe_n = desc / MLX5_TX_COMP_THRESH +
+ 1 + MLX5_TX_COMP_THRESH_INLINE_DIV;
+ txq_obj->cq = mlx5_glue->create_cq(priv->sh->ctx, cqe_n, NULL, NULL, 0);
+ if (txq_obj->cq == NULL) {
+ DRV_LOG(ERR, "Port %u Tx queue %u CQ creation failure.",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ txq_obj->qp = mlx5_txq_ibv_qp_create(dev, idx);
+ if (txq_obj->qp == NULL) {
+ rte_errno = errno;
+ goto error;
+ }
+ ret = mlx5_ibv_modify_qp(txq_obj, MLX5_TXQ_MOD_RST2RDY,
+ (uint8_t)priv->dev_port);
+ if (ret) {
+ DRV_LOG(ERR, "Port %u Tx queue %u QP state modifying failed.",
+ dev->data->port_id, idx);
+ rte_errno = errno;
+ goto error;
+ }
+ qp.comp_mask = MLX5DV_QP_MASK_UAR_MMAP_OFFSET;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /* If using DevX, need additional mask to read tisn value. */
+ if (priv->sh->devx && !priv->sh->tdn)
+ qp.comp_mask |= MLX5DV_QP_MASK_RAW_QP_HANDLES;
+#endif
+ obj.cq.in = txq_obj->cq;
+ obj.cq.out = &cq_info;
+ obj.qp.in = txq_obj->qp;
+ obj.qp.out = &qp;
+ ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_QP);
+ if (ret != 0) {
+ rte_errno = errno;
+ goto error;
+ }
+ if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+ DRV_LOG(ERR,
+ "Port %u wrong MLX5_CQE_SIZE environment variable"
+ " value: it should be set to %u.",
+ dev->data->port_id, RTE_CACHE_LINE_SIZE);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ txq_data->cqe_n = log2above(cq_info.cqe_cnt);
+ txq_data->cqe_s = 1 << txq_data->cqe_n;
+ txq_data->cqe_m = txq_data->cqe_s - 1;
+ txq_data->qp_num_8s = ((struct ibv_qp *)txq_obj->qp)->qp_num << 8;
+ txq_data->wqes = qp.sq.buf;
+ txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
+ txq_data->wqe_s = 1 << txq_data->wqe_n;
+ txq_data->wqe_m = txq_data->wqe_s - 1;
+ txq_data->wqes_end = txq_data->wqes + txq_data->wqe_s;
+ txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
+ txq_data->cq_db = cq_info.dbrec;
+ txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf;
+ txq_data->cq_ci = 0;
+ txq_data->cq_pi = 0;
+ txq_data->wqe_ci = 0;
+ txq_data->wqe_pi = 0;
+ txq_data->wqe_comp = 0;
+ txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV;
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+ /*
+ * If using DevX need to query and store TIS transport domain value.
+ * This is done once per port.
+ * Will use this value on Rx, when creating matching TIR.
+ */
+ if (priv->sh->devx && !priv->sh->tdn) {
+ ret = mlx5_devx_cmd_qp_query_tis_td(txq_obj->qp, qp.tisn,
+ &priv->sh->tdn);
+ if (ret) {
+ DRV_LOG(ERR, "Fail to query port %u Tx queue %u QP TIS "
+ "transport domain.", dev->data->port_id, idx);
+ rte_errno = EINVAL;
+ goto error;
+ } else {
+ DRV_LOG(DEBUG, "Port %u Tx queue %u TIS number %d "
+ "transport domain %d.", dev->data->port_id,
+ idx, qp.tisn, priv->sh->tdn);
+ }
+ }
+#endif
+ txq_ctrl->bf_reg = qp.bf.reg;
+ if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
+ txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
+ DRV_LOG(DEBUG, "Port %u: uar_mmap_offset 0x%" PRIx64 ".",
+ dev->data->port_id, txq_ctrl->uar_mmap_offset);
+ } else {
+ DRV_LOG(ERR,
+ "Port %u failed to retrieve UAR info, invalid"
+ " libmlx5.so",
+ dev->data->port_id);
+ rte_errno = EINVAL;
+ goto error;
+ }
+ txq_uar_init(txq_ctrl);
+ dev->data->tx_queue_state[idx] = RTE_ETH_QUEUE_STATE_STARTED;
+ return 0;
+error:
+ ret = rte_errno; /* Save rte_errno before cleanup. */
+ if (txq_obj->cq)
+ claim_zero(mlx5_glue->destroy_cq(txq_obj->cq));
+ if (txq_obj->qp)
+ claim_zero(mlx5_glue->destroy_qp(txq_obj->qp));
+ rte_errno = ret; /* Restore rte_errno. */
+ return -rte_errno;
+}
+
+/*
+ * Create the dummy QP with minimal resources for loopback.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ *
+ * @return
+ * 0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_rxq_ibv_obj_dummy_lb_create(struct rte_eth_dev *dev)
+{
+#if defined(HAVE_IBV_DEVICE_TUNNEL_SUPPORT) && defined(HAVE_IBV_FLOW_DV_SUPPORT)
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+ struct ibv_context *ctx = sh->ctx;
+ struct mlx5dv_qp_init_attr qp_init_attr = {0};
+ struct {
+ struct ibv_cq_init_attr_ex ibv;
+ struct mlx5dv_cq_init_attr mlx5;
+ } cq_attr = {{0}};
+
+ if (dev->data->dev_conf.lpbk_mode) {
+ /* Allow packet sent from NIC loop back w/o source MAC check. */
+ qp_init_attr.comp_mask |=
+ MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
+ qp_init_attr.create_flags |=
+ MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC;
+ } else {
+ return 0;
+ }
+ /* Only need to check refcnt, 0 after "sh" is allocated. */
+ if (!!(__atomic_fetch_add(&sh->self_lb.refcnt, 1, __ATOMIC_RELAXED))) {
+ MLX5_ASSERT(sh->self_lb.ibv_cq && sh->self_lb.qp);
+ priv->lb_used = 1;
+ return 0;
+ }
+ cq_attr.ibv = (struct ibv_cq_init_attr_ex){
+ .cqe = 1,
+ .channel = NULL,
+ .comp_mask = 0,
+ };
+ cq_attr.mlx5 = (struct mlx5dv_cq_init_attr){
+ .comp_mask = 0,
+ };
+ /* Only CQ is needed, no WQ(RQ) is required in this case. */
+ sh->self_lb.ibv_cq = mlx5_glue->cq_ex_to_cq(mlx5_glue->dv_create_cq(ctx,
+ &cq_attr.ibv,
+ &cq_attr.mlx5));
+ if (!sh->self_lb.ibv_cq) {
+ DRV_LOG(ERR, "Port %u cannot allocate CQ for loopback.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ sh->self_lb.qp = mlx5_glue->dv_create_qp(ctx,
+ &(struct ibv_qp_init_attr_ex){
+ .qp_type = IBV_QPT_RAW_PACKET,
+ .comp_mask = IBV_QP_INIT_ATTR_PD,
+ .pd = sh->pd,
+ .send_cq = sh->self_lb.ibv_cq,
+ .recv_cq = sh->self_lb.ibv_cq,
+ .cap.max_recv_wr = 1,
+ },
+ &qp_init_attr);
+ if (!sh->self_lb.qp) {
+ DRV_LOG(DEBUG, "Port %u cannot allocate QP for loopback.",
+ dev->data->port_id);
+ rte_errno = errno;
+ goto error;
+ }
+ priv->lb_used = 1;
+ return 0;
+error:
+ if (sh->self_lb.ibv_cq) {
+ claim_zero(mlx5_glue->destroy_cq(sh->self_lb.ibv_cq));
+ sh->self_lb.ibv_cq = NULL;
+ }
+ (void)__atomic_sub_fetch(&sh->self_lb.refcnt, 1, __ATOMIC_RELAXED);
+ return -rte_errno;
+#else
+ RTE_SET_USED(dev);
+ return 0;
+#endif
+}
+
+/*
+ * Release the dummy queue resources for loopback.
+ *
+ * @param dev
+ * Pointer to Ethernet device.
+ */
+void
+mlx5_rxq_ibv_obj_dummy_lb_release(struct rte_eth_dev *dev)
+{
+#if defined(HAVE_IBV_DEVICE_TUNNEL_SUPPORT) && defined(HAVE_IBV_FLOW_DV_SUPPORT)
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_dev_ctx_shared *sh = priv->sh;
+
+ if (!priv->lb_used)
+ return;
+ MLX5_ASSERT(__atomic_load_n(&sh->self_lb.refcnt, __ATOMIC_RELAXED));
+ if (!(__atomic_sub_fetch(&sh->self_lb.refcnt, 1, __ATOMIC_RELAXED))) {
+ if (sh->self_lb.qp) {
+ claim_zero(mlx5_glue->destroy_qp(sh->self_lb.qp));
+ sh->self_lb.qp = NULL;
+ }
+ if (sh->self_lb.ibv_cq) {
+ claim_zero(mlx5_glue->destroy_cq(sh->self_lb.ibv_cq));
+ sh->self_lb.ibv_cq = NULL;
+ }
+ }
+ priv->lb_used = 0;
+#else
+ RTE_SET_USED(dev);
+ return;
+#endif
+}
+
+/**
+ * Release an Tx verbs queue object.
+ *
+ * @param txq_obj
+ * Verbs Tx queue object..
+ */
+void
+mlx5_txq_ibv_obj_release(struct mlx5_txq_obj *txq_obj)
+{
+ MLX5_ASSERT(txq_obj);
+ claim_zero(mlx5_glue->destroy_qp(txq_obj->qp));
+ claim_zero(mlx5_glue->destroy_cq(txq_obj->cq));
+}
+
+struct mlx5_obj_ops ibv_obj_ops = {
+ .rxq_obj_modify_vlan_strip = mlx5_rxq_obj_modify_wq_vlan_strip,
+ .rxq_obj_new = mlx5_rxq_ibv_obj_new,
+ .rxq_event_get = mlx5_rx_ibv_get_event,
+ .rxq_obj_modify = mlx5_ibv_modify_wq,
+ .rxq_obj_release = mlx5_rxq_ibv_obj_release,
+ .ind_table_new = mlx5_ibv_ind_table_new,
+ .ind_table_destroy = mlx5_ibv_ind_table_destroy,
+ .hrxq_new = mlx5_ibv_hrxq_new,
+ .hrxq_destroy = mlx5_ibv_qp_destroy,
+ .drop_action_create = mlx5_ibv_drop_action_create,
+ .drop_action_destroy = mlx5_ibv_drop_action_destroy,
+ .txq_obj_new = mlx5_txq_ibv_obj_new,
+ .txq_obj_modify = mlx5_ibv_modify_qp,
+ .txq_obj_release = mlx5_txq_ibv_obj_release,
+ .lb_dummy_queue_create = NULL,
+ .lb_dummy_queue_release = NULL,
+};
diff --git a/drivers/net/mlx5/freebsd/mlx5_verbs.h b/drivers/net/mlx5/freebsd/mlx5_verbs.h
new file mode 100644
index 0000000000..f7e8e2fe98
--- /dev/null
+++ b/drivers/net/mlx5/freebsd/mlx5_verbs.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2020 Mellanox Technologies, Ltd
+ */
+
+#ifndef RTE_PMD_MLX5_VERBS_H_
+#define RTE_PMD_MLX5_VERBS_H_
+
+#include "mlx5.h"
+
+int mlx5_txq_ibv_obj_new(struct rte_eth_dev *dev, uint16_t idx);
+void mlx5_txq_ibv_obj_release(struct mlx5_txq_obj *txq_obj);
+int mlx5_rxq_ibv_obj_dummy_lb_create(struct rte_eth_dev *dev);
+void mlx5_rxq_ibv_obj_dummy_lb_release(struct rte_eth_dev *dev);
+
+/* Verbs ops struct */
+extern const struct mlx5_mr_ops mlx5_mr_verbs_ops;
+extern struct mlx5_obj_ops ibv_obj_ops;
+#endif /* RTE_PMD_MLX5_VERBS_H_ */
--
2.30.2
next prev parent reply other threads:[~2021-09-27 14:58 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-09-27 13:34 [dpdk-dev] [PATCH 00/19] MLX5 FreeBSD support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 01/19] common/mlx5: stub for FreeBSD Srikanth Kaka
2021-09-27 13:34 ` Srikanth Kaka [this message]
2021-09-27 13:34 ` [dpdk-dev] [PATCH 03/19] common/mlx5: disabling auxiliary bus support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 04/19] net/mlx5: " Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 05/19] net/mlx5: modified PCI probe to work on FreeBSD Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 06/19] common/mlx5: define PF_INET socket Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 07/19] net/mlx5: use the newly defined INET socket Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 08/19] common/mlx5: derive PCI addr in FreeBSD Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 09/19] common/mlx5: get interface name Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 10/19] net/mlx5: fix socket MAC request Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 11/19] net/mlx5: removing port representator support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 12/19] net/mlx5: Added procedure to detect link state Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 13/19] net/mlx5: added placeholder for VLAN vmwa Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 14/19] net/mlx5: added stats support Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 15/19] net/mlx5: making flow control DPDK callback invalid Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 16/19] net/mlx5: making module DPDK callbacks invalid Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 17/19] common/mlx5: fixed missing dependency in mlx5_glue.h Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 18/19] net/mlx5: fixed compilation warnings Srikanth Kaka
2021-09-27 13:34 ` [dpdk-dev] [PATCH 19/19] mlx5: Added meson support for FreeBSD Srikanth Kaka
2021-09-29 12:20 ` [dpdk-dev] [PATCH 00/19] MLX5 FreeBSD support Thomas Monjalon
2021-09-29 15:56 ` Srikanth K
2021-09-29 16:20 ` Thomas Monjalon
2021-09-30 16:27 ` Srikanth K
2021-09-30 16:55 ` Thomas Monjalon
2021-10-01 11:35 ` Srikanth K
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210927133450.10653-3-srikanth.k@oneconvergence.com \
--to=srikanth.k@oneconvergence.com \
--cc=avelu@juniper.net \
--cc=dev@dpdk.org \
--cc=matan@nvidia.com \
--cc=vag.singh@oneconvergence.com \
--cc=viacheslavo@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).