DPDK patches and discussions
* [dpdk-dev] [PATCH v2 1/4] event/cnxk: add Rx adapter support
@ 2021-05-24 12:22 pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 2/4] event/cnxk: add Rx adapter fastpath ops pbhagavatula
                   ` (3 more replies)
  0 siblings, 4 replies; 93+ messages in thread
From: pbhagavatula @ 2021-05-24 12:22 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 This patch set depends on
 http://patches.dpdk.org/project/dpdk/list/?series=15515

 doc/guides/eventdevs/cnxk.rst            |   4 +
 drivers/event/cnxk/cn10k_eventdev.c      |  76 +++++++++++
 drivers/event/cnxk/cn10k_worker.h        |   4 +
 drivers/event/cnxk/cn9k_eventdev.c       |  82 ++++++++++++
 drivers/event/cnxk/cn9k_worker.h         |   4 +
 drivers/event/cnxk/cnxk_eventdev.h       |  21 +++
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 157 +++++++++++++++++++++++
 7 files changed, 348 insertions(+)

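 Illustrative usage (not part of the patch): with this series applied, an
 application drives the adapter through the generic
 rte_event_eth_rx_adapter API and the PMD takes the internal-port path
 underneath. A minimal sketch, assuming event device 0, ethdev port 0 and
 adapter id 0, with error handling elided:

	#include <rte_event_eth_rx_adapter.h>

	static int
	setup_rx_adapter(struct rte_event_port_conf *port_conf)
	{
		struct rte_event_eth_rx_adapter_queue_conf qconf = {0};
		int rc;

		rc = rte_event_eth_rx_adapter_create(0 /* adapter id */,
						     0 /* event dev */,
						     port_conf);
		if (rc)
			return rc;

		/* Deliver packets as atomic ETHDEV events on event queue 0. */
		qconf.ev.queue_id = 0;
		qconf.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
		qconf.ev.event_type = RTE_EVENT_TYPE_ETHDEV;

		/* rx_queue_id == -1: map all Rx queues of the port (N:1). */
		rc = rte_event_eth_rx_adapter_queue_add(0, 0 /* ethdev */,
							-1, &qconf);
		if (rc)
			return rc;

		return rte_event_eth_rx_adapter_start(0);
	}
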
diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 36da3800cc..03dfcbd6a8 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -39,6 +39,10 @@ Features of the OCTEON cnxk SSO PMD are:
   time granularity of 2.5us on CN9K and 1us on CN10K.
 - Up to 256 TIM rings a.k.a event timer adapters.
 - Up to 8 rings traversed in parallel.
+- HW managed packets enqueued from ethdev to eventdev, exposed through the
+  event eth Rx adapter.
+- N:1 ethernet device Rx queue to event queue mapping.
+- Full Rx offload support defined through ethdev queue configuration.

 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index bf4052c76c..66040df060 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -407,6 +407,76 @@ cn10k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn10k));
 }

+static int
+cn10k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn10k_sso_set_lookup_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		ws->lookup_mem = lookup_mem;
+	}
+}
+
+static int
+cn10k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	void *lookup_mem;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+
+	lookup_mem = ((struct cn10k_eth_rxq *)eth_dev->data->rx_queues[0])
+			     ->lookup_mem;
+	cn10k_sso_set_lookup_mem(event_dev, lookup_mem);
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -420,6 +490,12 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.port_unlink = cn10k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,

+	.eth_rx_adapter_caps_get = cn10k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn10k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn10k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,

 	.dump = cnxk_sso_dump,
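
 The caps callback keys purely off the ethdev driver name: cnxk ethdevs are
 granted the internal-port capabilities, anything else falls back to
 RTE_EVENT_ETH_RX_ADAPTER_SW_CAP. An application can discover which path it
 got via the generic query; a short sketch (ids assumed as above):

	static bool
	rx_adapter_has_internal_port(uint8_t evdev_id, uint16_t eth_port)
	{
		uint32_t caps = 0;

		if (rte_event_eth_rx_adapter_caps_get(evdev_id, eth_port,
						      &caps))
			return false;

		/* HW enqueues packets directly to the SSO; no Rx adapter
		 * service core is required for this port.
		 */
		return !!(caps & RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT);
	}
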
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 2f093a8dd5..085857bccf 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN10K_WORKER_H__
 #define __CN10K_WORKER_H__

+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"

+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
 /* SSO Operations */

 static __rte_always_inline uint8_t
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 0684417eab..8e6bf54df9 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -481,6 +481,82 @@ cn9k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn9k));
 }

+static int
+cn9k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn9k_sso_set_lookup_mem(const struct rte_eventdev *event_dev, void *lookup_mem)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			dws->lookup_mem = lookup_mem;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			ws->lookup_mem = lookup_mem;
+		}
+	}
+}
+
+static int
+cn9k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	void *lookup_mem;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+
+	lookup_mem = ((struct cn9k_eth_rxq *)eth_dev->data->rx_queues[0])
+			     ->lookup_mem;
+	cn9k_sso_set_lookup_mem(event_dev, lookup_mem);
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -494,6 +570,12 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.port_unlink = cn9k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,

+	.eth_rx_adapter_caps_get = cn9k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn9k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn9k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,

 	.dump = cnxk_sso_dump,
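
 cn9k differs from cn10k in supporting a dual-workslot mode: each event
 port owns two HW work slots (ws_state[2], "Ping and Pong") and alternates
 between them via the vws bit, which is why lookup_mem has to be planted in
 both the single and dual port structures here. The toggle itself shows up
 in the fastpath ops of patch 2/4, roughly:

	gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
					&dws->ws_state[!dws->vws], ev,
					flags, dws->lookup_mem);
	dws->vws = !dws->vws;	/* ping <-> pong */
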
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 38fca08fb6..f5a4401465 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN9K_WORKER_H__
 #define __CN9K_WORKER_H__

+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"

+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
 /* SSO Operations */

 static __rte_always_inline uint8_t
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 668e51d62a..6e0bb8ac5c 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -6,6 +6,8 @@
 #define __CNXK_EVENTDEV_H__

 #include <rte_devargs.h>
+#include <rte_ethdev.h>
+#include <rte_event_eth_rx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -81,7 +83,10 @@ struct cnxk_sso_evdev {
 	uint64_t nb_xaq_cfg;
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
+	uint64_t rx_offloads;
 	uint64_t adptr_xae_cnt;
+	uint16_t rx_adptr_pool_cnt;
+	uint64_t *rx_adptr_pools;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -108,6 +113,7 @@ struct cnxk_sso_evdev {
 struct cn10k_sso_hws {
 	/* Get Work Fastpath data */
 	CN10K_SSO_HWS_OPS;
+	void *lookup_mem;
 	uint32_t gw_wdata;
 	uint8_t swtag_req;
 	uint8_t hws_id;
@@ -132,6 +138,7 @@ struct cn10k_sso_hws {
 struct cn9k_sso_hws {
 	/* Get Work Fastpath data */
 	CN9K_SSO_HWS_OPS;
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t hws_id;
 	/* Add Work Fastpath data */
@@ -148,6 +155,7 @@ struct cn9k_sso_hws_state {
 struct cn9k_sso_hws_dual {
 	/* Get Work Fastpath data */
 	struct cn9k_sso_hws_state ws_state[2]; /* Ping and Pong */
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t vws; /* Ping pong bit */
 	uint8_t hws_id;
@@ -250,4 +258,17 @@ int cnxk_sso_xstats_reset(struct rte_eventdev *event_dev,
 /* CN9K */
 void cn9k_sso_set_rsrc(void *arg);

+/* Common adapter ops */
+int cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf);
+int cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t rx_queue_id);
+int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev);
+int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev);
+
 #endif /* __CNXK_EVENTDEV_H__ */
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 89a1d82c14..8de7b6f895 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -2,6 +2,7 @@
  * Copyright(C) 2021 Marvell.
  */

+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"

 void
@@ -11,6 +12,32 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 	int i;

 	switch (event_type) {
+	case RTE_EVENT_TYPE_ETHDEV: {
+		struct cnxk_eth_rxq_sp *rxq = data;
+		uint64_t *old_ptr;
+
+		for (i = 0; i < dev->rx_adptr_pool_cnt; i++) {
+			if ((uint64_t)rxq->qconf.mp == dev->rx_adptr_pools[i])
+				return;
+		}
+
+		dev->rx_adptr_pool_cnt++;
+		old_ptr = dev->rx_adptr_pools;
+		dev->rx_adptr_pools = rte_realloc(
+			dev->rx_adptr_pools,
+			sizeof(uint64_t) * dev->rx_adptr_pool_cnt, 0);
+		if (dev->rx_adptr_pools == NULL) {
+			dev->adptr_xae_cnt += rxq->qconf.mp->size;
+			dev->rx_adptr_pools = old_ptr;
+			dev->rx_adptr_pool_cnt--;
+			return;
+		}
+		dev->rx_adptr_pools[dev->rx_adptr_pool_cnt - 1] =
+			(uint64_t)rxq->qconf.mp;
+
+		dev->adptr_xae_cnt += rxq->qconf.mp->size;
+		break;
+	}
 	case RTE_EVENT_TYPE_TIMER: {
 		struct cnxk_tim_ring *timr = data;
 		uint16_t *old_ring_ptr;
@@ -65,3 +92,133 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		break;
 	}
 }
+
+static int
+cnxk_sso_rxq_enable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id,
+		    uint16_t port_id, const struct rte_event *ev,
+		    uint8_t custom_flowid)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 1;
+	rq->tt = ev->sched_type;
+	rq->hwgrp = ev->queue_id;
+	rq->flow_tag_width = 20;
+	rq->wqe_skip = 1;
+	rq->tag_mask = (port_id & 0xF) << 20;
+	rq->tag_mask |= (((port_id >> 4) & 0xF) | (RTE_EVENT_TYPE_ETHDEV << 4))
+			<< 24;
+
+	if (custom_flowid) {
+		rq->flow_tag_width = 0;
+		rq->tag_mask |= ev->flow_id;
+	}
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+static int
+cnxk_sso_rxq_disable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 0;
+	rq->flow_tag_width = 32;
+	rq->tag_mask = 0;
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+int
+cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t port = eth_dev->data->port_id;
+	struct cnxk_eth_rxq_sp *rxq_sp;
+	int i, rc = 0;
+
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			rxq_sp = eth_dev->data->rx_queues[i];
+			rxq_sp = rxq_sp - 1;
+			cnxk_sso_updt_xae_cnt(dev, rxq_sp,
+					      RTE_EVENT_TYPE_ETHDEV);
+			rc = cnxk_sso_xae_reconfigure(
+				(struct rte_eventdev *)(uintptr_t)event_dev);
+			rc |= cnxk_sso_rxq_enable(
+				cnxk_eth_dev, i, port, &queue_conf->ev,
+				!!(queue_conf->rx_queue_flags &
+				   RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+		}
+	} else {
+		rxq_sp = eth_dev->data->rx_queues[rx_queue_id];
+		rxq_sp = rxq_sp - 1;
+		cnxk_sso_updt_xae_cnt(dev, rxq_sp, RTE_EVENT_TYPE_ETHDEV);
+		rc = cnxk_sso_xae_reconfigure(
+			(struct rte_eventdev *)(uintptr_t)event_dev);
+		rc |= cnxk_sso_rxq_enable(
+			cnxk_eth_dev, (uint16_t)rx_queue_id, port,
+			&queue_conf->ev,
+			!!(queue_conf->rx_queue_flags &
+			   RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+	}
+
+	if (rc < 0) {
+		plt_err("Failed to configure Rx adapter port=%d, q=%d", port,
+			queue_conf->ev.queue_id);
+		return rc;
+	}
+
+	dev->rx_offloads |= cnxk_eth_dev->rx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	struct cnxk_eth_dev *dev = eth_dev->data->dev_private;
+	int i, rc = 0;
+
+	RTE_SET_USED(event_dev);
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
+			rc = cnxk_sso_rxq_disable(dev, i);
+	} else {
+		rc = cnxk_sso_rxq_disable(dev, (uint16_t)rx_queue_id);
+	}
+
+	if (rc < 0)
+		plt_err("Failed to clear Rx adapter config port=%d, q=%d",
+			eth_dev->data->port_id, rx_queue_id);
+
+	return rc;
+}
+
+int
+cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			  const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			 const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
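
 For readers decoding cnxk_sso_rxq_enable(): with no custom flow id the RQ
 is programmed with a 20-bit flow tag, and the rest of the SSO tag is built
 from the ethdev port id and the event type. My reading of the shifts
 above, as a worked example:

	/*
	 * Resulting tag layout (flow_tag_width = 20):
	 *
	 *   [31:28] event type: RTE_EVENT_TYPE_ETHDEV (0x2)
	 *   [27:24] sub event:  high nibble of ethdev port id
	 *   [23:20] low nibble of ethdev port id
	 *   [19:0]  flow tag supplied by NIX (e.g. RSS hash bits)
	 *
	 * e.g. port_id = 0x12:
	 *   (0x12 & 0xF) << 20                       = 0x00200000
	 *   (((0x12 >> 4) & 0xF) | (0x2 << 4)) << 24 = 0x21000000
	 *   tag_mask                                 = 0x21200000
	 */
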
--
2.17.1



* [dpdk-dev] [PATCH v2 2/4] event/cnxk: add Rx adapter fastpath ops
  2021-05-24 12:22 [dpdk-dev] [PATCH v2 1/4] event/cnxk: add Rx adapter support pbhagavatula
@ 2021-05-24 12:23 ` pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 3/4] event/cnxk: add Tx adapter support pbhagavatula
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-05-24 12:23 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter fastpath operations.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c | 115 ++++++++-
 drivers/event/cnxk/cn10k_worker.c   | 164 +++++++++----
 drivers/event/cnxk/cn10k_worker.h   |  91 +++++--
 drivers/event/cnxk/cn9k_eventdev.c  | 254 ++++++++++++++++++-
 drivers/event/cnxk/cn9k_worker.c    | 364 +++++++++++++++++++---------
 drivers/event/cnxk/cn9k_worker.h    | 158 +++++++++---
 drivers/event/cnxk/meson.build      |   7 +
 7 files changed, 931 insertions(+), 222 deletions(-)

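 The dequeue specialization below uses DPDK's usual X-macro trick:
 NIX_RX_FASTPATH_MODES lists every Rx offload combination, an R() macro
 stamps out one dequeue function per entry, and a [2][2][2][2] table
 indexed by !!(flag) bits selects the variant at runtime, so each copy is
 compiled with constant flags. A reduced, self-contained sketch of the
 pattern (flag and function names here are invented, not the driver's):

	/* Two stand-in offload bits. */
	#define OFF_RSS	  (1U << 0)
	#define OFF_PTYPE (1U << 1)

	/* One M() entry per flag combination: name, [ptype][rss], flags. */
	#define MODES                                                    \
		M(none,      0, 0, 0)                                    \
		M(rss,       0, 1, OFF_RSS)                              \
		M(ptype,     1, 0, OFF_PTYPE)                            \
		M(ptype_rss, 1, 1, OFF_PTYPE | OFF_RSS)

	/* Stamp out one specialized function per mode. */
	#define M(name, f1, f0, flags)                                   \
		static unsigned int deq_##name(void) { return (flags); }
	MODES
	#undef M

	/* Pick the specialization matching the enabled offloads. */
	static unsigned int (*pick(unsigned int offloads))(void)
	{
		static unsigned int (*const tbl[2][2])(void) = {
	#define M(name, f1, f0, flags) [f1][f0] = deq_##name,
			MODES
	#undef M
		};

		return tbl[!!(offloads & OFF_PTYPE)][!!(offloads & OFF_RSS)];
	}
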
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 66040df060..b1ad5b2878 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -247,17 +247,120 @@ static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	const event_dequeue_t sso_hws_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_tmo_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_tmo_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_tmo_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_seg_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn10k_sso_hws_tmo_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
 
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn10k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn10k_sso_hws_deq;
-	event_dev->dequeue_burst = cn10k_sso_hws_deq_burst;
-	if (dev->is_timeout_deq) {
-		event_dev->dequeue = cn10k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn10k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 }
 
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index e2aa534c64..4365aec992 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -60,56 +60,118 @@ cn10k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 	return 1;
 }
 
-uint16_t __rte_hot
-cn10k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
-		return 1;
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);               \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn10k_sso_hws_deq_##name(port, ev, timeout_ticks);      \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);               \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn10k_sso_hws_tmo_deq_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);               \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		return cn10k_sso_hws_get_work(                                 \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn10k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);               \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn10k_sso_hws_tmo_deq_seg_##name(port, ev,              \
+							timeout_ticks);        \
 	}
 
-	return cn10k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-			uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
-		return ret;
-	}
-
-	ret = cn10k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn10k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 085857bccf..ad320d2dc0 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -83,20 +83,40 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 		cn10k_sso_hws_fwd_group(ws, ev, grp);
 }
 
+static __rte_always_inline void
+cn10k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		  const uint32_t tag, const uint32_t flags,
+		  const void *const lookup_mem)
+{
+	union mbuf_initializer mbuf_init = {
+		.fields = {.data_off = RTE_PKTMBUF_HEADROOM,
+			   .refcnt = 1,
+			   .nb_segs = 1,
+			   .port = port_id},
+	};
+
+	cn10k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			      (struct rte_mbuf *)mbuf, lookup_mem,
+			      mbuf_init.value, flags);
+}
+
 static __rte_always_inline uint16_t
-cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
+cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       const uint32_t flags, void *lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
 		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		: [wdata] "+r"(gw.get_work)
+		"sub %[mbuf], %H[wdata], #0x80				\n"
+		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
 		: [gw_loc] "r"(ws->getwrk_op)
 		: "memory");
 #else
@@ -104,11 +124,25 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
 	do {
 		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, flags,
+					  lookup_mem);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -123,6 +157,7 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -133,19 +168,34 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=			\n"
 		     "done%=:	dmb ld					\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80		\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_wqe_op)
 		     : "memory");
 #else
 	do {
 		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -164,16 +214,29 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 					       const struct rte_event ev[],
 					       uint16_t nb_events);
 
-uint16_t __rte_hot cn10k_sso_hws_deq(void *port, struct rte_event *ev,
-				     uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					   uint16_t nb_events,
-					   uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_tmo_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
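
 A note on the new "sub ..., #0x80" in the get_work asm paths above: SSO
 hands back a pointer to the WQE/CQE, and for Rx adapter traffic the CQE
 sits right after the mbuf header (hence the fixed 0x80), so the mbuf is
 recovered by stepping back sizeof(struct rte_mbuf) (128 bytes), exactly
 as the C fallback spells out:

	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
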
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 8e6bf54df9..16acea4cda 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -252,17 +252,179 @@ static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	/* Single WS modes */
+	const event_dequeue_t sso_hws_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_tmo_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_tmo_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_tmo_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_seg_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_tmo_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	/* Dual WS modes */
+	const event_dequeue_t sso_hws_dual_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_dual_tmo_deq[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_tmo_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_tmo_deq_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_tmo_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_dual_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_deq_seg_burst[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_dual_tmo_deq_seg[2][2][2][2] = {
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_tmo_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_tmo_deq_seg_burst[2][2][2][2] =
+		{
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	[f3][f2][f1][f0] = cn9k_sso_hws_dual_tmo_deq_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
 
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn9k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn9k_sso_hws_deq;
-	event_dev->dequeue_burst = cn9k_sso_hws_deq_burst;
-	if (dev->deq_tmo_ns) {
-		event_dev->dequeue = cn9k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 
 	if (dev->dual_ws) {
@@ -272,14 +434,82 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		event_dev->enqueue_forward_burst =
 			cn9k_sso_hws_dual_enq_fwd_burst;
 
-		event_dev->dequeue = cn9k_sso_hws_dual_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_dual_deq_burst;
-		if (dev->deq_tmo_ns) {
-			event_dev->dequeue = cn9k_sso_hws_dual_tmo_deq;
-			event_dev->dequeue_burst =
-				cn9k_sso_hws_dual_tmo_deq_burst;
+		if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+			event_dev->dequeue = sso_hws_dual_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_tmo_deq_seg
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_tmo_deq_seg_burst
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_RSS_F)];
+			}
+		} else {
+			event_dev->dequeue = sso_hws_dual_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_tmo_deq
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_tmo_deq_burst
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_RSS_F)];
+			}
 		}
 	}
+
+	rte_mb();
 }
 
 static void *
diff --git a/drivers/event/cnxk/cn9k_worker.c b/drivers/event/cnxk/cn9k_worker.c
index 9ceacc98dd..0f031a5fa3 100644
--- a/drivers/event/cnxk/cn9k_worker.c
+++ b/drivers/event/cnxk/cn9k_worker.c
@@ -60,59 +60,121 @@ cn9k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 	return 1;
 }
 
-uint16_t __rte_hot
-cn9k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return 1;
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		return cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_deq_##name(port, ev, timeout_ticks);       \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_tmo_deq_##name(port, ev, timeout_ticks);   \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		return cn9k_sso_hws_get_work(                                  \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);   \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_tmo_deq_seg_##name(port, ev,               \
+						       timeout_ticks);         \
 	}
 
-	return cn9k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-		       uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn9k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			   uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
+NIX_RX_FASTPATH_MODES
+#undef R
 
 /* Dual ws ops. */
 
@@ -172,65 +234,145 @@ cn9k_sso_hws_dual_enq_fwd_burst(void *port, const struct rte_event ev[],
 	return 1;
 }
 
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t gw;
-
-	RTE_SET_USED(timeout_ticks);
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return 1;
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],      \
+						&dws->ws_state[!dws->vws], ev, \
+						flags, dws->lookup_mem);       \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_dual_deq_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],     \
+						 &dws->ws_state[!dws->vws],    \
+						 ev, flags, dws->lookup_mem);  \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem);                              \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_dual_tmo_deq_##name(port, ev,              \
+							timeout_ticks);        \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+                                                                               \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+                                                                               \
+		gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],      \
+						&dws->ws_state[!dws->vws], ev, \
+						flags, dws->lookup_mem);       \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_dual_deq_seg_##name(port, ev,              \
+							timeout_ticks);        \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+                                                                               \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+                                                                               \
+		ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],     \
+						 &dws->ws_state[!dws->vws],    \
+						 ev, flags, dws->lookup_mem);  \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem);                              \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+                                                                               \
+		return ret;                                                    \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+                                                                               \
+		return cn9k_sso_hws_dual_tmo_deq_seg_##name(port, ev,          \
+							    timeout_ticks);    \
 	}
 
-	gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					&dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	return gw;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-			  uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					 &dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {
-		ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-						 &dws->ws_state[!dws->vws], ev);
-		dws->vws = !dws->vws;
-	}
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq_burst(void *port, struct rte_event ev[],
-				uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_tmo_deq(port, ev, timeout_ticks);
-}
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index f5a4401465..1fde652ff8 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -128,17 +128,38 @@ cn9k_sso_hws_dual_forward_event(struct cn9k_sso_hws_dual *dws,
 	}
 }
 
+static __rte_always_inline void
+cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		 const uint32_t tag, const uint32_t flags,
+		 const void *const lookup_mem)
+{
+	union mbuf_initializer mbuf_init = {
+		.fields = {.data_off = RTE_PKTMBUF_HEADROOM,
+			   .refcnt = 1,
+			   .nb_segs = 1,
+			   .port = port_id},
+	};
+
+	cn9k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			     (struct rte_mbuf *)mbuf, lookup_mem,
+			     mbuf_init.value, flags);
+}
+
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 			   struct cn9k_sso_hws_state *ws_pair,
-			   struct rte_event *ev)
+			   struct rte_event *ev, const uint32_t flags,
+			   const void *const lookup_mem)
 {
 	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -147,7 +168,10 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	str %[gw], [%[pong]]		\n"
 		     "		dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op),
 		       [gw] "r"(set_gw), [pong] "r"(ws_pair->getwrk_op));
 #else
@@ -156,12 +180,26 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		gw.u64[0] = plt_read64(ws->tag_op);
 	gw.u64[1] = plt_read64(ws->wqp_op);
 	plt_write64(set_gw, ws_pair->getwrk_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -169,16 +207,21 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 }
 
 static __rte_always_inline uint16_t
-cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
+cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
+		      const uint32_t flags, const void *const lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 	plt_write64(BIT_ULL(16) | /* wait for work. */
 			    1,	  /* Use Mask set 0. */
 		    ws->getwrk_op);
+
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
@@ -190,7 +233,10 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -198,12 +244,26 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -218,6 +278,7 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -230,7 +291,9 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -238,12 +301,25 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -274,28 +350,54 @@ uint16_t __rte_hot cn9k_sso_hws_dual_enq_fwd_burst(void *port,
 						   const struct rte_event ev[],
 						   uint16_t nb_events);
 
-uint16_t __rte_hot cn9k_sso_hws_deq(void *port, struct rte_event *ev,
-				    uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					  uint16_t nb_events,
-					  uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-					      uint16_t nb_events,
-					      uint64_t timeout_ticks);
-
-uint16_t __rte_hot cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-					     uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_burst(void *port,
-						   struct rte_event ev[],
-						   uint16_t nb_events,
-						   uint64_t timeout_ticks);
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_tmo_deq_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
+
+#define R(name, f3, f2, f1, f0, flags)                                         \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index 87bb9f76a9..0a3bcffd64 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -8,6 +8,13 @@ if not is_linux or not dpdk_conf.get('RTE_ARCH_64')
     subdir_done()
 endif
 
+extra_flags = ['-Wno-strict-aliasing']
+foreach flag: extra_flags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
 sources = files(
         'cn9k_eventdev.c',
         'cn9k_worker.c',
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 3/4] event/cnxk: add Tx adapter support
  2021-05-24 12:22 [dpdk-dev] [PATCH v2 1/4] event/cnxk: add Rx adapter support pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 2/4] event/cnxk: add Rx adapter fastpath ops pbhagavatula
@ 2021-05-24 12:23 ` pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 4/4] event/cnxk: add Tx adapter fastpath ops pbhagavatula
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  3 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-05-24 12:23 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter.
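
For context, a minimal application-side sketch of wiring up the
adapter through the generic eventdev API (the adapter id, the
app_setup_tx_adapter() name and the error handling are illustrative,
not part of this patch):

	#include <rte_event_eth_tx_adapter.h>

	static int
	app_setup_tx_adapter(uint8_t evdev_id, uint16_t eth_port,
			     struct rte_event_port_conf *port_conf)
	{
		const uint8_t id = 0; /* adapter instance chosen by app */
		int rc;

		rc = rte_event_eth_tx_adapter_create(id, evdev_id, port_conf);
		if (rc < 0)
			return rc;
		/* A queue id of -1 adds all Tx queues of the port. */
		rc = rte_event_eth_tx_adapter_queue_add(id, eth_port, -1);
		if (rc < 0)
			return rc;
		return rte_event_eth_tx_adapter_start(id);
	}

As the PMD reports RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT, no
service core is required; the queue_add call above lands in
cnxk_sso_tx_adapter_queue_add() added below.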

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/eventdevs/cnxk.rst            |   4 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  90 +++++++++++++++++
 drivers/event/cnxk/cn9k_eventdev.c       | 117 +++++++++++++++++++++++
 drivers/event/cnxk/cnxk_eventdev.h       |  22 ++++-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 106 ++++++++++++++++++++
 5 files changed, 335 insertions(+), 4 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 03dfcbd6a8..502bac17e0 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -42,7 +42,9 @@ Features of the OCTEON cnxk SSO PMD are:
 - HW managed packets enqueued from ethdev to eventdev exposed through event eth
   RX adapter.
 - N:1 ethernet device Rx queue to Event queue mapping.
-- Full Rx offload support defined through ethdev queue configuration.
+- Lock-free Tx from event eth Tx adapter using ``DEV_TX_OFFLOAD_MT_LOCKFREE``
+  capability while maintaining receive packet order.
+- Full Rx/Tx offload support defined through ethdev queue configuration.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index b1ad5b2878..99d2b7a8ba 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -243,6 +243,39 @@ cn10k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		void *ws_cookie;
+
+		ws_cookie = cnxk_sso_hws_get_cookie(ws);
+		ws_cookie = rte_realloc_socket(
+			ws_cookie,
+			sizeof(struct cnxk_sso_hws_cookie) +
+				sizeof(struct cn10k_sso_hws) +
+				(sizeof(uint64_t) * (dev->max_port_id + 1) *
+				 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+		if (ws_cookie == NULL)
+			return -ENOMEM;
+		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
+		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+		       sizeof(uint64_t) * (dev->max_port_id + 1) *
+			       RTE_MAX_QUEUES_PER_PORT);
+		event_dev->data->ports[i] = ws;
+	}
+
+	return 0;
+}
+
 static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -482,6 +515,10 @@ cn10k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn10k_sso_hws_reset,
 			    cn10k_sso_hws_flush_events);
 	if (rc < 0)
@@ -580,6 +617,55 @@ cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn10k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -599,6 +685,10 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn10k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn10k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn10k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 16acea4cda..2f071f19ea 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -248,6 +248,66 @@ cn9k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(dws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws_dual) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			dws = RTE_PTR_ADD(ws_cookie,
+					  sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = dws;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(ws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			ws = RTE_PTR_ADD(ws_cookie,
+					 sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = ws;
+		}
+	}
+	rte_mb();
+
+	return 0;
+}
+
 static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -683,6 +743,10 @@ cn9k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn9k_sso_hws_reset,
 			    cn9k_sso_hws_flush_events);
 	if (rc < 0)
@@ -787,6 +851,55 @@ cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn9k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn9k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -806,6 +919,10 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn9k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn9k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn9k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 6e0bb8ac5c..57c3327aa0 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -8,6 +8,7 @@
 #include <rte_devargs.h>
 #include <rte_ethdev.h>
 #include <rte_event_eth_rx_adapter.h>
+#include <rte_event_eth_tx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -84,9 +85,12 @@ struct cnxk_sso_evdev {
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
 	uint64_t rx_offloads;
+	uint64_t tx_offloads;
 	uint64_t adptr_xae_cnt;
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
+	uint64_t *tx_adptr_data;
+	uint16_t max_port_id;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -121,8 +125,10 @@ struct cn10k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
+	/* Tx Fastpath data */
+	uint64_t base __rte_cache_aligned;
 	uintptr_t lmt_base;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 /* CN9K HWS ops */
@@ -145,7 +151,9 @@ struct cn9k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
+	/* Tx Fastpath data */
+	uint64_t base __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cn9k_sso_hws_state {
@@ -163,7 +171,9 @@ struct cn9k_sso_hws_dual {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base[2];
+	/* Tx Fastpath data */
+	uint64_t base[2] __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cnxk_sso_hws_cookie {
@@ -270,5 +280,11 @@ int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev);
 int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 			     const struct rte_eth_dev *eth_dev);
+int cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
+int cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
 
 #endif /* __CNXK_EVENTDEV_H__ */
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 8de7b6f895..d4d07c793f 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -5,6 +5,8 @@
 #include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 
+#define CNXK_SSO_SQB_LIMIT (0x180)
+
 void
 cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		      uint32_t event_type)
@@ -222,3 +224,107 @@ cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 
 	return 0;
 }
+
+static int
+cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
+{
+	uint16_t sqb_limit;
+
+	sqb_limit = RTE_MIN(nb_sqb_bufs, sq->nb_sqb_bufs);
+	return roc_npa_aura_limit_modify(sq->aura_handle, sqb_limit);
+}
+
+static int
+cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
+			    uint16_t eth_port_id, uint16_t tx_queue_id,
+			    void *txq)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_port_id = dev->max_port_id;
+	uint64_t *txq_data = dev->tx_adptr_data;
+
+	if (txq_data == NULL || eth_port_id > max_port_id) {
+		max_port_id = RTE_MAX(max_port_id, eth_port_id);
+		txq_data = rte_realloc_socket(
+			txq_data,
+			(sizeof(uint64_t) * (max_port_id + 1) *
+			 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+		if (txq_data == NULL)
+			return -ENOMEM;
+	}
+
+	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
+		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
+	dev->max_port_id = max_port_id;
+	dev->tx_adptr_data = txq_data;
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	struct roc_nix_sq *sq;
+	int i, ret;
+	void *txq;
+
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			txq = eth_dev->data->tx_queues[i];
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i, txq);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		txq = eth_dev->data->tx_queues[tx_queue_id];
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
+		if (ret < 0)
+			return ret;
+	}
+
+	dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct roc_nix_sq *sq;
+	int i, ret;
+
+	RTE_SET_USED(event_dev);
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i,
+				NULL);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 4/4] event/cnxk: add Tx adapter fastpath ops
  2021-05-24 12:22 [dpdk-dev] [PATCH v2 1/4] event/cnxk: add Rx adapter support pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 2/4] event/cnxk: add Rx adapter fastpath ops pbhagavatula
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 3/4] event/cnxk: add Tx adapter support pbhagavatula
@ 2021-05-24 12:23 ` pbhagavatula
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  3 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-05-24 12:23 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter fastpath operations.
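
With the internal-port capability, applications transmit by calling
rte_event_eth_tx_adapter_enqueue() directly, which resolves to the
txa_enqueue handlers installed by this patch. A hedged worker-loop
fragment (evdev_id, port_id, the mbuf m and the Tx queue number are
placeholders):

	struct rte_event ev = { .event = 0 };

	/* Select the destination Tx queue for this mbuf. */
	rte_event_eth_tx_adapter_txq_set(m, 0);
	ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
	ev.mbuf = m;
	(void)rte_event_eth_tx_adapter_enqueue(evdev_id, port_id, &ev, 1, 0);

The handler performs the LMT submission and, for ordered events, waits
until the work slot reaches the head of the flow before issuing the
store.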

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c | 35 ++++++++++++
 drivers/event/cnxk/cn10k_worker.c   | 32 +++++++++++
 drivers/event/cnxk/cn10k_worker.h   | 67 ++++++++++++++++++++++
 drivers/event/cnxk/cn9k_eventdev.c  | 76 +++++++++++++++++++++++++
 drivers/event/cnxk/cn9k_worker.c    | 60 ++++++++++++++++++++
 drivers/event/cnxk/cn9k_worker.h    | 87 +++++++++++++++++++++++++++++
 6 files changed, 357 insertions(+)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 99d2b7a8ba..817cb08480 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -336,6 +336,22 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 	};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue sso_hws_tx_adptr_enq[2][2][2][2][2] = {
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_##name,
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	const event_tx_adapter_enqueue sso_hws_tx_adptr_enq_seg[2][2][2][2][2] =
+		{
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
@@ -395,6 +411,25 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
 		}
 	}
+
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 }
 
 static void
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 4365aec992..fb26e17034 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -175,3 +175,35 @@ cn10k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 NIX_RX_FASTPATH_MODES
 #undef R
+
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint64_t cmd[sz];                                              \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn10k_sso_hws *ws = port;                               \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index ad320d2dc0..b3f71202ad 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn10k_ethdev.h"
 #include "cn10k_rx.h"
+#include "cn10k_tx.h"
 
 /* SSO Operations */
 
@@ -239,4 +240,70 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn10k_eth_txq *
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn10k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       uint64_t *cmd,
+		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       const uint32_t flags)
+{
+	const struct cn10k_eth_txq *txq;
+	struct rte_mbuf *m = ev->mbuf;
+	uint16_t ref_cnt = m->refcnt;
+	uintptr_t lmt_addr;
+	uint16_t lmt_id;
+	uintptr_t pa;
+
+	lmt_addr = ws->lmt_base;
+	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
+	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F)
+		cn10k_nix_xmit_prepare_tso(m, flags);
+
+	cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags);
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw =
+			cn10k_nix_prepare_mseg(m, (uint64_t *)lmt_addr, flags);
+		pa = txq->io_addr | ((segdw - 1) << 4);
+	} else {
+		pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
+	}
+	if (!ev->sched_type)
+		cnxk_sso_hws_head_wait(ws->base + SSOW_LF_GWS_TAG);
+
+	roc_lmt_submit_steorl(lmt_id, pa);
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_TAG,
+				 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_seg_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 2f071f19ea..a1206dcb61 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -427,6 +427,38 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 		};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue sso_hws_tx_adptr_enq[2][2][2][2][2] = {
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_##name,
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	const event_tx_adapter_enqueue sso_hws_tx_adptr_enq_seg[2][2][2][2][2] =
+		{
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq[2][2][2][2][2] = {
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq_seg[2][2][2][2][2] = {
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	[f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
@@ -487,6 +519,23 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		}
 	}
 
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
 	if (dev->dual_ws) {
 		event_dev->enqueue = cn9k_sso_hws_dual_enq;
 		event_dev->enqueue_burst = cn9k_sso_hws_dual_enq_burst;
@@ -567,8 +616,35 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 						    NIX_RX_OFFLOAD_RSS_F)];
 			}
 		}
+
+		if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+			/* [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM]
+			 */
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq_seg
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		} else {
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		}
 	}
 
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 	rte_mb();
 }
 
diff --git a/drivers/event/cnxk/cn9k_worker.c b/drivers/event/cnxk/cn9k_worker.c
index 0f031a5fa3..0ffeeeb93a 100644
--- a/drivers/event/cnxk/cn9k_worker.c
+++ b/drivers/event/cnxk/cn9k_worker.c
@@ -376,3 +376,63 @@ cn9k_sso_hws_dual_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 NIX_RX_FASTPATH_MODES
 #undef R
+
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint64_t cmd[sz];                                              \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws *ws = port;                                \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+		uint64_t cmd[sz];                                              \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}                                                                      \
+                                                                               \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+                                                                               \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 1fde652ff8..9ffb8df5b8 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn9k_ethdev.h"
 #include "cn9k_rx.h"
+#include "cn9k_tx.h"
 
 /* SSO Operations */
 
@@ -400,4 +401,90 @@ NIX_RX_FASTPATH_MODES
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn9k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline void
+cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
+			 uint64_t *cmd, const uint32_t flags)
+{
+	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_xmit_prepare(m, cmd, flags);
+}
+
+static __rte_always_inline uint16_t
+cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
+		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		      const uint32_t flags)
+{
+	struct rte_mbuf *m = ev->mbuf;
+	const struct cn9k_eth_txq *txq;
+	uint16_t ref_cnt = m->refcnt;
+
+	/* Perform header writes before barrier for TSO */
+	cn9k_nix_xmit_prepare_tso(m, flags);
+	/* Let's commit any changes to the packet here when fast free is
+	 * set, as no further changes will be made to the mbuf.
+	 * When fast free is not set, both cn9k_nix_prepare_mseg() and
+	 * cn9k_nix_xmit_prepare() have a barrier after the refcnt update.
+	 */
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+		rte_io_wmb();
+	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
+	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		if (!ev->sched_type) {
+			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
+						       txq->io_addr, segdw);
+		} else {
+			cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr,
+					       segdw);
+		}
+	} else {
+		if (!ev->sched_type) {
+			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
+						  txq->io_addr, flags);
+		} else {
+			cn9k_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr,
+					  flags);
+		}
+	}
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(base + SSOW_LF_GWS_TAG,
+				 base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f4, f3, f2, f1, f0, sz, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine
  2021-05-24 12:22 [dpdk-dev] [PATCH v2 1/4] event/cnxk: add Rx adapter support pbhagavatula
                   ` (2 preceding siblings ...)
  2021-05-24 12:23 ` [dpdk-dev] [PATCH v2 4/4] event/cnxk: add Tx adapter fastpath ops pbhagavatula
@ 2021-06-19 11:01 ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
                     ` (12 more replies)
  3 siblings, 13 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a multi-segment Rx vector routine: form the primary mbufs using
the vector path and switch to the scalar path when extracting segments.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 Depends-on: http://patches.dpdk.org/project/dpdk/list/?series=17394
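
 A minimal sketch of the per-CQE segment decode that the scalar tail
 performs, based on the SG word layout visible in the diff below
 (segment count in bits 48:49, 16-bit segment lengths packed from bit
 0); the struct and function names are illustrative, not the driver's
 API:

	#include <stdint.h>

	struct seg_info {
		uint16_t nb_segs;
		uint16_t len[3];
	};

	static struct seg_info
	decode_sg_word(uint64_t sg)
	{
		struct seg_info s = {0};
		int i;

		s.nb_segs = (sg >> 48) & 0x3;
		for (i = 0; i < s.nb_segs && i < 3; i++) {
			s.len[i] = sg & 0xFFFF; /* data_len of segment i */
			sg >>= 16;
		}
		return s;
	}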

 drivers/net/cnxk/cn10k_rx.c          | 31 +++++++++++------
 drivers/net/cnxk/cn10k_rx.h          | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn10k_rx_vec_mseg.c | 17 ++++++++++
 drivers/net/cnxk/cn9k_rx.c           | 31 +++++++++++------
 drivers/net/cnxk/cn9k_rx.h           | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn9k_rx_vec_mseg.c  | 18 ++++++++++
 drivers/net/cnxk/meson.build         |  2 ++
 7 files changed, 157 insertions(+), 44 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_rx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_rx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 5c956c06b4..3a9fd71309 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 1cc37cbaa0..5926ff7f46 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -119,8 +119,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) + ((rx->desc_sizem1 + 1) << 1));
@@ -195,15 +202,14 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags = nix_update_match_id(rx->match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -481,16 +487,34 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled; build the mseg list for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -645,6 +669,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_##name(      \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
new file mode 100644
index 0000000000..04d1e46c82
--- /dev/null
+++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
+					  (flags) | NIX_RX_MULTI_SEG_F);       \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index 0acedd0a1f..d293d4eac3 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 10ef5c6905..5ae9e8195c 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -120,8 +120,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) +
@@ -198,15 +205,14 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -484,16 +490,34 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled; build the mseg list for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -647,6 +671,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn9k_rx_vec_mseg.c b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
new file mode 100644
index 0000000000..e46d8a4749
--- /dev/null
+++ b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
+						 (flags) |                     \
+							 NIX_RX_MULTI_SEG_F);  \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
index 2071d0dcb2..aa8c7253fb 100644
--- a/drivers/net/cnxk/meson.build
+++ b/drivers/net/cnxk/meson.build
@@ -23,6 +23,7 @@ sources += files('cn9k_ethdev.c',
 		 'cn9k_rx.c',
 		 'cn9k_rx_mseg.c',
 		 'cn9k_rx_vec.c',
+		 'cn9k_rx_vec_mseg.c',
 		 'cn9k_tx.c',
 		 'cn9k_tx_mseg.c',
 		 'cn9k_tx_vec.c')
@@ -32,6 +33,7 @@ sources += files('cn10k_ethdev.c',
 		 'cn10k_rx.c',
 		 'cn10k_rx_mseg.c',
 		 'cn10k_rx_vec.c',
+		 'cn10k_rx_vec_mseg.c',
 		 'cn10k_tx.c',
 		 'cn10k_tx_mseg.c',
 		 'cn10k_tx_vec.c')
--
2.17.1
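
A note on the burst-function selection above: each Rx offload flag
contributes one [0|1] dimension to the lookup table, so the
specialized routine is found with a single indexed load, and the
mseg/vector variants are simply parallel tables. A reduced two-flag
sketch with illustrative names:

	#include <stdint.h>

	typedef uint16_t (*rx_burst_t)(void *q, void **pkts, uint16_t n);

	static rx_burst_t
	pick(rx_burst_t tbl[2][2], uint64_t flags, uint64_t f_cksum,
	     uint64_t f_rss)
	{
		/* !! maps any set bit to index 1 */
		return tbl[!!(flags & f_cksum)][!!(flags & f_rss)];
	}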


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 02/13] net/cnxk: enable ptp processing in vector Rx
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 03/13] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
                     ` (11 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Rx burst function: use the vector
path for processing mbufs and switch to the scalar path only when
extracting the timestamp.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
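
 A note on the latest-timestamp selection in the vector path below:
 res is a 4-bit per-lane match mask, and the Rx queue timestamp is
 taken from the highest-indexed matching packet, i.e. the newest in
 the burst. A minimal standalone sketch (names illustrative):

	#include <stdint.h>

	static uint64_t
	latest_tstamp(const uint64_t ts[4], uint8_t res)
	{
		if (!res)
			return 0; /* no PTP packet in this burst */
		/* __builtin_clz() takes a 32-bit value, so the index of
		 * the highest set bit is 31 - clz(res).
		 */
		return ts[31 - __builtin_clz(res)];
	}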
 drivers/net/cnxk/cn10k_ethdev.c |   1 -
 drivers/net/cnxk/cn10k_rx.c     |   5 +-
 drivers/net/cnxk/cn10k_rx.h     | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_rx_vec.c |   3 -
 drivers/net/cnxk/cn9k_ethdev.c  |   1 -
 drivers/net/cnxk/cn9k_rx.c      |   5 +-
 drivers/net/cnxk/cn9k_rx.h      | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_rx_vec.c  |   3 -
 drivers/net/cnxk/cnxk_ethdev.h  |  19 ++---
 9 files changed, 232 insertions(+), 53 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index b079edbd35..7caec6cf14 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -301,7 +301,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 3a9fd71309..69e767ac3d 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -75,10 +75,7 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 5926ff7f46..abdd58e888 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -109,7 +109,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -125,8 +125,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -207,7 +209,7 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -272,8 +274,9 @@ cn10k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				      flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -469,6 +472,99 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15*/
+				0,			     /* pktlen 16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from big-endian to CPU byte order. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert from vector to scalar mask. */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -496,17 +592,17 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn10k_rx_vec.c b/drivers/net/cnxk/cn10k_rx_vec.c
index 65ffa97841..93528a44f9 100644
--- a/drivers/net/cnxk/cn10k_rx_vec.c
+++ b/drivers/net/cnxk/cn10k_rx_vec.c
@@ -11,9 +11,6 @@
 					       struct rte_mbuf **rx_pkts,      \
 					       uint16_t pkts)                  \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
 						  (flags));		       \
 	}
diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index 107a540915..cb302b75d8 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -309,7 +309,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index d293d4eac3..7d9f1bd61f 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -75,10 +75,7 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 5ae9e8195c..dd3e5d3c7e 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -110,7 +110,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -126,8 +126,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -210,7 +212,7 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -275,8 +277,9 @@ cn9k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				     flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -472,6 +475,99 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15*/
+				0,			     /* pktlen 16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from big-endian to CPU byte order. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert from vector to scalar mask. */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -499,17 +595,17 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn9k_rx_vec.c b/drivers/net/cnxk/cn9k_rx_vec.c
index e61c2225c6..ef5f771ef7 100644
--- a/drivers/net/cnxk/cn9k_rx_vec.c
+++ b/drivers/net/cnxk/cn9k_rx_vec.c
@@ -9,9 +9,6 @@
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
 						 (flags));                     \
 	}
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 67b1f42531..4eead03905 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -136,13 +136,12 @@ struct cnxk_eth_qconf {
 };
 
 struct cnxk_timesync_info {
+	uint8_t rx_ready;
+	uint64_t rx_tstamp;
 	uint64_t rx_tstamp_dynflag;
+	int tstamp_dynfield_offset;
 	rte_iova_t tx_tstamp_iova;
 	uint64_t *tx_tstamp;
-	uint64_t rx_tstamp;
-	int tstamp_dynfield_offset;
-	uint8_t tx_ready;
-	uint8_t rx_ready;
 } __plt_cache_aligned;
 
 struct cnxk_eth_dev {
@@ -465,13 +464,15 @@ cnxk_nix_timestamp_dynfield(struct rte_mbuf *mbuf,
 
 static __rte_always_inline void
 cnxk_nix_mbuf_to_tstamp(struct rte_mbuf *mbuf,
-			struct cnxk_timesync_info *tstamp, bool ts_enable,
+			struct cnxk_timesync_info *tstamp,
+			const uint8_t ts_enable, const uint8_t mseg_enable,
 			uint64_t *tstamp_ptr)
 {
-	if (ts_enable &&
-	    (mbuf->data_off ==
-	     RTE_PKTMBUF_HEADROOM + CNXK_NIX_TIMESYNC_RX_OFFSET)) {
-		mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+	if (ts_enable) {
+		if (!mseg_enable) {
+			mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+			mbuf->data_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+		}
 
 		/* Reading the rx timestamp inserted by CGX, viz at
 		 * starting of the packet data.
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 03/13] net/cnxk: enable VLAN processing in vector Tx
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 04/13] net/cnxk: enable ptp " pbhagavatula
                     ` (10 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable VLAN offload in vector Tx burst function.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
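
 A scalar sketch of the send-ext W1 construction the NEON code below
 performs per packet pair: the TCIs land at fixed offsets (outer at
 bit 8, inner at bit 32) and the insert-enable bits (48 for
 VLAN0/outer, 49 for VLAN1/inner) are set only when the matching Tx
 ol_flag is present. The flag stand-ins are illustrative; the bit
 positions are taken from the diff:

	#include <stdint.h>

	#define SK_TX_VLAN (1ULL << 0) /* stand-in for PKT_TX_VLAN */
	#define SK_TX_QINQ (1ULL << 1) /* stand-in for PKT_TX_QINQ */

	static uint64_t
	build_ext_w1(uint64_t ol_flags, uint16_t tci_outer,
		     uint16_t tci_inner)
	{
		uint64_t w1 = ((uint64_t)tci_outer << 8) |
			      ((uint64_t)tci_inner << 32);

		if (ol_flags & SK_TX_VLAN)
			w1 |= 1ULL << 49; /* enable VLAN1 (inner) insert */
		if (ol_flags & SK_TX_QINQ)
			w1 |= 1ULL << 48; /* enable VLAN0 (outer) insert */
		return w1;
	}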
 drivers/net/cnxk/cn10k_tx.c     |   3 +-
 drivers/net/cnxk/cn10k_tx.h     | 125 +++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_tx_vec.c |   3 +-
 drivers/net/cnxk/cn9k_tx.c      |   3 +-
 drivers/net/cnxk/cn9k_tx.h      | 128 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   3 +-
 6 files changed, 227 insertions(+), 38 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 18694dc704..05bc163a40 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -69,8 +69,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8b1446f25c..1e16978584 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -62,9 +62,14 @@ cn10k_nix_tx_ext_subs(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 {
-	RTE_SET_USED(flags);
-	/* We can pack up to 4 packets per LMTLINE if there are no offloads. */
-	return 4 << ROC_LMT_LINES_PER_CORE_LOG2;
+	return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
+	       << ROC_LMT_LINES_PER_CORE_LOG2;
+}
+
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line(const uint16_t flags)
+{
+	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
 }
 
 static __rte_always_inline uint64_t
@@ -98,10 +103,9 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
-	const uint64_t dw_m1 = 0x7;
+	const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
 	uint64_t data;
 
-	RTE_SET_USED(flags);
 	/* This will be moved to addr area */
 	data = dw_m1;
 	/* 15 vector sizes for single seg */
@@ -690,11 +694,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -720,6 +727,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
 	sgdesc23_w0 = sgdesc01_w0;
 
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+	}
+
 	/* Get LMT base address and LMT ID as lcore id */
 	ROC_LMT_BASE_ID_GET(laddr, lmt_id);
 	left = pkts;
@@ -738,6 +753,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1303,6 +1325,52 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for vlan. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN 0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1381,16 +1449,41 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		/* Store the prepared send desc to LMT lines */
-		vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
-		lnum += 1;
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+			lnum += 1;
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			lnum += 1;
+		} else {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
+			lnum += 1;
+		}
 
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 7453f3bc98..beb5c649bb 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index b802606075..4b43cdaff9 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -68,8 +68,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 1899d6670f..d5715bb52d 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -552,10 +552,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -585,8 +588,19 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	senddesc23_w0 = senddesc01_w0;
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-	sgdesc23_w0 = sgdesc01_w0;
+
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
+		sgdesc23_w0 = sgdesc01_w0;
+	} else {
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sgdesc23_w0 = sgdesc01_w0;
+	}
 
 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
@@ -597,6 +611,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1162,6 +1183,52 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for vlan. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN 0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1247,17 +1314,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		do {
-			vst1q_u64(lmt_addr, cmd0[0]);
-			vst1q_u64(lmt_addr + 2, cmd1[0]);
-			vst1q_u64(lmt_addr + 4, cmd0[1]);
-			vst1q_u64(lmt_addr + 6, cmd1[1]);
-			vst1q_u64(lmt_addr + 8, cmd0[2]);
-			vst1q_u64(lmt_addr + 10, cmd1[2]);
-			vst1q_u64(lmt_addr + 12, cmd0[3]);
-			vst1q_u64(lmt_addr + 14, cmd1[3]);
-			lmt_status = roc_lmt_submit_ldeor(io_addr);
-		} while (lmt_status == 0);
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* With ext header in the command we can no longer send
+			 * all 4 packets together since the LMTLINE is 128 bytes.
+			 * Split and Tx twice.
+			 */
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd2[0]);
+				vst1q_u64(lmt_addr + 4, cmd1[0]);
+				vst1q_u64(lmt_addr + 6, cmd0[1]);
+				vst1q_u64(lmt_addr + 8, cmd2[1]);
+				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+
+			do {
+				vst1q_u64(lmt_addr, cmd0[2]);
+				vst1q_u64(lmt_addr + 2, cmd2[2]);
+				vst1q_u64(lmt_addr + 4, cmd1[2]);
+				vst1q_u64(lmt_addr + 6, cmd0[3]);
+				vst1q_u64(lmt_addr + 8, cmd2[3]);
+				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		} else {
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd1[0]);
+				vst1q_u64(lmt_addr + 4, cmd0[1]);
+				vst1q_u64(lmt_addr + 6, cmd1[1]);
+				vst1q_u64(lmt_addr + 8, cmd0[2]);
+				vst1q_u64(lmt_addr + 10, cmd1[2]);
+				vst1q_u64(lmt_addr + 12, cmd0[3]);
+				vst1q_u64(lmt_addr + 14, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		}
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index a6e7c9e542..5842facb58 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 04/13] net/cnxk: enable ptp processing in vector Tx
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 03/13] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 05/13] net/cnxk: enable TSO " pbhagavatula
                     ` (9 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Tx burst function. Since we can
no longer use a single LMT line for a burst of 4, split the LMT
line into two and transmit twice.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
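
 A sketch of the descriptor-size arithmetic behind the split: a
 128-byte LMT line holds 16 8-byte dwords, and each packet needs a
 2-dword send header plus a 2-dword SG descriptor, plus 2 more dwords
 each for the ext header and the send-mem (timestamp) descriptor when
 enabled, so only two such packets fit per line and a burst of four
 takes two submissions. Illustrative arithmetic only, not driver code:

	static unsigned int
	pkts_per_lmt_line(int need_ext_hdr, int need_tstamp)
	{
		/* dwords per packet: hdr(2) + sg(2) [+ ext(2)] [+ mem(2)] */
		unsigned int dw = 4 + (need_ext_hdr ? 2 : 0) +
				  (need_tstamp ? 2 : 0);

		return 16 / dw; /* 128-byte LMT line = 16 dwords */
	}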
 drivers/net/cnxk/cn10k_tx.c     |   4 +-
 drivers/net/cnxk/cn10k_tx.h     | 109 +++++++++++++++++++++++++++-----
 drivers/net/cnxk/cn10k_tx_vec.c |   5 +-
 drivers/net/cnxk/cn9k_tx.c      |   4 +-
 drivers/net/cnxk/cn9k_tx.h      | 105 ++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   5 +-
 6 files changed, 192 insertions(+), 40 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 05bc163a40..c4c3e65704 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,9 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 1e16978584..d5812c5c28 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -69,7 +69,9 @@ cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
 {
-	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
+	return (flags & NIX_TX_NEED_EXT_HDR) ?
+			     ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
+			     8;
 }
 
 static __rte_always_inline uint64_t
@@ -695,13 +697,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -733,6 +737,12 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w0 = sendext01_w0;
 		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
 		sendext23_w1 = sendext01_w1;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	}
 
 	/* Get LMT base address and LMT ID as lcore id */
@@ -760,6 +770,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB. */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1371,6 +1392,44 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if a timestamp is requested and generate an
+			 * inverted mask, as we need not make any changes to
+			 * the default cmd value.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1458,19 +1517,39 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 		if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* Store the prepared send desc to LMT lines */
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
-			lnum += 1;
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
+			} else {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			}
 			lnum += 1;
 		} else {
 			/* Store the prepared send desc to LMT lines */
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index beb5c649bb..0b4a4c7bae 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index 4b43cdaff9..c32681ed44 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,9 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index d5715bb52d..bfb34abb23 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -553,12 +553,14 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -597,6 +599,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w1 = sendext01_w1;
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
 		sgdesc23_w0 = sgdesc01_w0;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	} else {
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
 		sgdesc23_w0 = sgdesc01_w0;
@@ -618,6 +626,17 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB. */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1229,6 +1248,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if timestamp is requested and generate an
+			 * inverted mask as we need not make any changes to
+			 * the default cmd value.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1327,22 +1384,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * Split and Tx twice.
 			 */
 			do {
-				vst1q_u64(lmt_addr, cmd0[0]);
-				vst1q_u64(lmt_addr + 2, cmd2[0]);
-				vst1q_u64(lmt_addr + 4, cmd1[0]);
-				vst1q_u64(lmt_addr + 6, cmd0[1]);
-				vst1q_u64(lmt_addr + 8, cmd2[1]);
-				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd3[0]);
+					vst1q_u64(lmt_addr + 8, cmd0[1]);
+					vst1q_u64(lmt_addr + 10, cmd2[1]);
+					vst1q_u64(lmt_addr + 12, cmd1[1]);
+					vst1q_u64(lmt_addr + 14, cmd3[1]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd0[1]);
+					vst1q_u64(lmt_addr + 8, cmd2[1]);
+					vst1q_u64(lmt_addr + 10, cmd1[1]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 
 			do {
-				vst1q_u64(lmt_addr, cmd0[2]);
-				vst1q_u64(lmt_addr + 2, cmd2[2]);
-				vst1q_u64(lmt_addr + 4, cmd1[2]);
-				vst1q_u64(lmt_addr + 6, cmd0[3]);
-				vst1q_u64(lmt_addr + 8, cmd2[3]);
-				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd3[2]);
+					vst1q_u64(lmt_addr + 8, cmd0[3]);
+					vst1q_u64(lmt_addr + 10, cmd2[3]);
+					vst1q_u64(lmt_addr + 12, cmd1[3]);
+					vst1q_u64(lmt_addr + 14, cmd3[3]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd0[3]);
+					vst1q_u64(lmt_addr + 8, cmd2[3]);
+					vst1q_u64(lmt_addr + 10, cmd1[3]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 		} else {
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 5842facb58..9ade66db2b 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 05/13] net/cnxk: enable TSO processing in vector Tx
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (2 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 04/13] net/cnxk: enable ptp " pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 06/13] net/cnxk: add multi seg Tx vector routine pbhagavatula
                     ` (8 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable TSO offload in vector Tx burst function.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
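 Reviewer note (not part of the commit): cn10k_nix_prepare_tso() added
 below picks the LSO start byte with a branchless two-way select on
 w1->il3type. A minimal standalone sketch of the same trick, where
 select64() and the sample values are purely illustrative:

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* mask is all-ones when cond is non-zero and all-zeroes otherwise,
	 * so (mask & a) | (~mask & b) picks 'a' or 'b' without a branch.
	 * The patch uses '+' instead of '|'; the two are equivalent here
	 * because one masked operand is always zero.
	 */
	static uint64_t
	select64(int cond, uint64_t a, uint64_t b)
	{
		uint64_t mask = -(uint64_t)!!cond;

		return (mask & a) | (~mask & b);
	}

	int
	main(void)
	{
		/* e.g. outer vs inner L4 pointer offset for tunnel TSO */
		printf("%" PRIu64 "\n", select64(1, 34, 84)); /* prints 34 */
		printf("%" PRIu64 "\n", select64(0, 34, 84)); /* prints 84 */
		return 0;
	}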
 drivers/net/cnxk/cn10k_tx.c     |  2 +-
 drivers/net/cnxk/cn10k_tx.h     | 97 +++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn10k_tx_vec.c |  5 +-
 drivers/net/cnxk/cn9k_tx.c      |  2 +-
 drivers/net/cnxk/cn9k_tx.h      | 94 ++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn9k_tx_vec.c  |  5 +-
 6 files changed, 199 insertions(+), 6 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index c4c3e65704..d06879163f 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,7 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index d5812c5c28..cea7c6cd34 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -689,6 +689,46 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		      union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		      const uint64_t flags, const uint64_t lso_tun_fmt)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel tso */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+		uint8_t shift = is_udp_tun ? 32 : 0;
+
+		shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
+		shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+
+		/* Update format for UDP tunneled packet */
+		w0->lso_format = (lso_tun_fmt >> shift);
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -723,6 +763,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
 
 	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
@@ -781,6 +826,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1430,6 +1482,51 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			const uint64_t lso_fmt = txq->lso_tun_fmt;
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn10k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 0b4a4c7bae..34e3737501 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&			       \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index c32681ed44..735e21cc60 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,7 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index bfb34abb23..2adff45705 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -545,6 +545,43 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		     union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		     uint64_t flags)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel tso */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+		w0->lso_format += is_udp_tun ? 2 : 6;
+
+		w0->lso_format += !!(ol_flags & PKT_TX_OUTER_IPV6) << 1;
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -580,6 +617,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn9k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
+
 	/* Lets commit any changes in the packet here as no further changes
 	 * to the packet will be done unless no fast free is enabled.
 	 */
@@ -637,6 +680,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1286,6 +1336,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn9k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 9ade66db2b..56a3e2514a 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 06/13] net/cnxk: add multi seg Tx vector routine
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (3 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 05/13] net/cnxk: enable TSO " pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 07/13] event/cnxk: add Rx adapter support pbhagavatula
                     ` (7 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add multi segment Tx vector routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
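 Reviewer note (not part of the commit): NIX_SEGDW_MAGIC introduced below
 packs the nb_segs -> SG-dwords lookup table into a single 64-bit
 immediate, 4 bits per entry, so the conversion is a shift and a mask
 instead of a memory load. A small standalone check; the assert harness
 is illustrative only:

	#include <assert.h>
	#include <stdint.h>

	/* Entry x of the packed table lives in bits [4x+3:4x]. */
	#define NIX_SEGDW_MAGIC		0x76654432210ULL
	#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)

	int
	main(void)
	{
		/* Decode a few entries of the packed table. */
		assert(NIX_NB_SEGS_TO_SEGDW(1) == 1);
		assert(NIX_NB_SEGS_TO_SEGDW(3) == 2);
		assert(NIX_NB_SEGS_TO_SEGDW(4) == 3);
		assert(NIX_NB_SEGS_TO_SEGDW(10) == 7);
		return 0;
	}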
 drivers/net/cnxk/cn10k_tx.c          |  20 +-
 drivers/net/cnxk/cn10k_tx.h          | 388 +++++++++++++++++++++++++--
 drivers/net/cnxk/cn10k_tx_vec_mseg.c |  24 ++
 drivers/net/cnxk/cn9k_tx.c           |  20 +-
 drivers/net/cnxk/cn9k_tx.h           | 272 ++++++++++++++++++-
 drivers/net/cnxk/cn9k_tx_vec_mseg.c  |  24 ++
 drivers/net/cnxk/meson.build         |   6 +-
 7 files changed, 709 insertions(+), 45 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_tx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_tx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index d06879163f..1f30bab59a 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,13 +67,23 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena)
+	const eth_tx_burst_t nix_eth_tx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_xmit_pkts_vec_mseg_##name,
+
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	if (dev->scalar_ena) {
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
-	else
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+	} else {
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
-
-	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_vec_burst_mseg);
+	}
 
 	rte_mb();
 }
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index cea7c6cd34..b25b20dcb2 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -42,6 +42,13 @@
 		}                                                              \
 	} while (0)
 
+/* Encoded number of segments to number of dwords macro: each value of nb_segs
+ * is encoded as 4 bits.
+ */
+#define NIX_SEGDW_MAGIC 0x76654432210ULL
+
+#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
+
 #define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
 	(void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
 
@@ -102,6 +109,14 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 	return data;
 }
 
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
+{
+	return ((flags & NIX_TX_NEED_EXT_HDR) ?
+			      (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
+			      4);
+}
+
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
@@ -729,7 +744,244 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline void
+cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
+				union nix_send_hdr_w0_u *sh,
+				union nix_send_sg_s *sg, const uint32_t flags)
+{
+	struct rte_mbuf *m_next;
+	uint64_t *slist, sg_u;
+	uint16_t nb_segs;
+	int i = 1;
+
+	sh->total = m->pkt_len;
+	/* Clear sg->u header before use */
+	sg->u &= 0xFC00000000000000;
+	sg_u = sg->u;
+	slist = &cmd[0];
+
+	sg_u = sg_u | ((uint64_t)m->data_len);
+
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+
+	m = m_next;
+	/* Fill mbuf segments */
+	do {
+		m_next = m->next;
+		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
+		*slist = rte_mbuf_data_iova(m);
+		/* Set invert df if buffer is not to be freed by H/W */
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			/* Mark mempool object as "put" since it is freed by NIX
+			 */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (!(sg_u & (1ULL << (i + 55))))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		slist++;
+		i++;
+		nb_segs--;
+		if (i > 2 && nb_segs) {
+			i = 0;
+			/* Next SG subdesc */
+			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
+			sg->u = sg_u;
+			sg->segs = 3;
+			sg = (union nix_send_sg_s *)slist;
+			sg_u = sg->u;
+			slist++;
+		}
+		m = m_next;
+	} while (nb_segs);
+
+	sg->u = sg_u;
+	sg->segs = i;
+}
+
+static __rte_always_inline void
+cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
+			   uint64x2_t *cmd1, const uint8_t segdw,
+			   const uint32_t flags)
+{
+	union nix_send_hdr_w0_u sh;
+	union nix_send_sg_s sg;
+
+	if (m->nb_segs == 1) {
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+			sg.u = vgetq_lane_u64(cmd1[0], 0);
+			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
+			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+		}
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		sg.u = vgetq_lane_u64(cmd1[0], 0);
+		if (!(sg.u & (1ULL << 55)))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		return;
+	}
+
+	sh.u = vgetq_lane_u64(cmd0[0], 0);
+	sg.u = vgetq_lane_u64(cmd1[0], 0);
+
+	cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
+
+	sh.sizem1 = segdw - 1;
+	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
+	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+}
+
 #define NIX_DESCS_PER_LOOP 4
+
+static __rte_always_inline uint8_t
+cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
+			       uint64x2_t *cmd1, uint64x2_t *cmd2,
+			       uint64x2_t *cmd3, uint8_t *segdw,
+			       uint64_t *lmt_addr, __uint128_t *data128,
+			       uint8_t *shift, const uint16_t flags)
+{
+	uint8_t j, off, lmt_used;
+
+	if (!(flags & NIX_TX_NEED_EXT_HDR) &&
+	    !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+		/* No segments in 4 consecutive packets. */
+		if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
+				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+			vst1q_u64(lmt_addr, cmd0[0]);
+			vst1q_u64(lmt_addr + 2, cmd1[0]);
+			vst1q_u64(lmt_addr + 4, cmd0[1]);
+			vst1q_u64(lmt_addr + 6, cmd1[1]);
+			vst1q_u64(lmt_addr + 8, cmd0[2]);
+			vst1q_u64(lmt_addr + 10, cmd1[2]);
+			vst1q_u64(lmt_addr + 12, cmd0[3]);
+			vst1q_u64(lmt_addr + 14, cmd1[3]);
+
+			*data128 |= ((__uint128_t)7) << *shift;
+			*shift += 3;
+
+			return 1;
+		}
+	}
+
+	lmt_used = 0;
+	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
+		/* Fit consecutive packets in same LMTLINE. */
+		if ((segdw[j] + segdw[j + 1]) <= 8) {
+			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				/* TSTAMP takes 4 each, no segs. */
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				vst1q_u64(lmt_addr + 6, cmd3[j]);
+
+				vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
+				vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				/* EXT header takes 3 each, space for 2 segs. */
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				off = segdw[j] - 3;
+				off <<= 1;
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
+							   lmt_addr + 12 + off,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
+			} else {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 4,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				off = segdw[j] - 2;
+				off <<= 1;
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
+							   lmt_addr + 8 + off,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
+			}
+			*data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
+				    << *shift;
+			*shift += 3;
+			j += 2;
+		} else {
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				off = segdw[j] - 4;
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+			} else {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 4,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+			}
+			*data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
+			*shift += 3;
+			j++;
+		}
+		lmt_used++;
+		lmt_addr += 16;
+	}
+
+	return lmt_used;
+}
+
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
@@ -738,7 +990,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
 		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
-	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
+	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
@@ -746,6 +998,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sendmem01_w0, sendmem23_w0;
 	uint64x2_t sendmem01_w1, sendmem23_w1;
+	uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -754,7 +1007,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t ltypes01, ltypes23;
 	uint64x2_t xtmp128, ytmp128;
 	uint64x2_t xmask01, xmask23;
-	uint8_t lnum;
+	uint8_t lnum, shift;
+	union wdata {
+		__uint128_t data128;
+		uint64_t data[2];
+	} wd;
 
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
@@ -798,8 +1055,43 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
 			      cn10k_nix_pkts_per_vec_brst(flags) :
 			      left;
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		wd.data128 = 0;
+		shift = 16;
+	}
 	lnum = 0;
+
 	for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint8_t j;
+
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
+				struct rte_mbuf *m = tx_pkts[j];
+				/* Get dwords based on nb_segs. */
+				segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
+				/* Add dwords based on offloads. */
+				segdw[j] += 1 + /* SEND HDR */
+					    !!(flags & NIX_TX_NEED_EXT_HDR) +
+					    !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+			}
+
+			/* Check if there are enough LMTLINES for this loop */
+			if (lnum + 4 > 32) {
+				uint8_t ldwords_con = 0, lneeded = 0;
+				for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
+					ldwords_con += segdw[j];
+					if (ldwords_con > 8) {
+						lneeded += 1;
+						ldwords_con = segdw[j];
+					}
+				}
+				lneeded += 1;
+				if (lnum + lneeded > 32) {
+					burst = i;
+					break;
+				}
+			}
+		}
 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
 		senddesc01_w0 =
 			vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
@@ -1527,7 +1819,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w0 = vld1q_u64(sx_w0 + 2);
 		}
 
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
+		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
 			xmask23 = xmask01;
@@ -1567,7 +1860,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 					(void **)&mbuf3, 1, 0);
 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
-		} else {
+		} else if (!(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
 			mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1612,7 +1905,19 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
 		}
 
-		if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint8_t j;
+
+			segdw[4] = 8;
+			j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
+							  cmd2, cmd3, segdw,
+							  (uint64_t *)
+							  LMT_OFF(laddr, lnum,
+								  0),
+							  &wd.data128, &shift,
+							  flags);
+			lnum += j;
+		} else if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* Store the prepared send desc to LMT lines */
 			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
@@ -1664,34 +1969,55 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
+	if (flags & NIX_TX_MULTI_SEG_F)
+		wd.data[0] >>= 16;
+
 	/* Trigger LMTST */
 	if (lnum > 16) {
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= (15ULL << 12);
-		data |= (uint64_t)lmt_id;
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[0] & 0x7) << 4;
+		wd.data[0] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[0] <<= 16;
+
+		wd.data[0] |= (15ULL << 12);
+		wd.data[0] |= (uint64_t)lmt_id;
 
 		/* STEOR0 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[0], pa);
 
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= ((uint64_t)(lnum - 17)) << 12;
-		data |= (uint64_t)(lmt_id + 16);
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[1] & 0x7) << 4;
+		wd.data[1] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[1] <<= 16;
+
+		wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
+		wd.data[1] |= (uint64_t)(lmt_id + 16);
 
 		/* STEOR1 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[1], pa);
 	} else if (lnum) {
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= ((uint64_t)(lnum - 1)) << 12;
-		data |= lmt_id;
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[0] & 0x7) << 4;
+		wd.data[0] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[0] <<= 16;
+
+		wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
+		wd.data[0] |= lmt_id;
 
 		/* STEOR0 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[0], pa);
 	}
 
 	left -= burst;
@@ -1699,9 +2025,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (left)
 		goto again;
 
-	if (unlikely(scalar))
-		pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar, cmd,
-					    flags);
+	if (unlikely(scalar)) {
+		if (flags & NIX_TX_MULTI_SEG_F)
+			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
+							 scalar, cmd, flags);
+		else
+			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
+						    cmd, flags);
+	}
 
 	return pkts;
 }
@@ -1866,7 +2197,10 @@ T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum,	1, 1, 1, 1, 1, 1,	8,	\
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
-		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 
 NIX_TX_FASTPATH_MODES
 #undef T
diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
new file mode 100644
index 0000000000..1fad81dbad
--- /dev/null
+++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_ethdev.h"
+#include "cn10k_tx.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		uint64_t cmd[sz];                                              \
+									       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
+			return 0;                                              \
+		return cn10k_nix_xmit_pkts_vector(                             \
+			tx_queue, tx_pkts, pkts, cmd,                          \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index 735e21cc60..763f9a14fd 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,13 +66,23 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena)
+	const eth_tx_burst_t nix_eth_tx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)			       \
+	[f5][f4][f3][f2][f1][f0] = cn9k_nix_xmit_pkts_vec_mseg_##name,
+
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	if (dev->scalar_ena) {
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
-	else
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+	} else {
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
-
-	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_vec_burst_mseg);
+	}
 
 	rte_mb();
 }
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 2adff45705..42b54a378e 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -582,7 +582,238 @@ cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline uint8_t
+cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
+			       union nix_send_hdr_w0_u *sh,
+			       union nix_send_sg_s *sg, const uint32_t flags)
+{
+	struct rte_mbuf *m_next;
+	uint64_t *slist, sg_u;
+	uint16_t nb_segs;
+	uint64_t segdw;
+	int i = 1;
+
+	sh->total = m->pkt_len;
+	/* Clear sg->u header before use */
+	sg->u &= 0xFC00000000000000;
+	sg_u = sg->u;
+	slist = &cmd[0];
+
+	sg_u = sg_u | ((uint64_t)m->data_len);
+
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+
+	m = m_next;
+	/* Fill mbuf segments */
+	do {
+		m_next = m->next;
+		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
+		*slist = rte_mbuf_data_iova(m);
+		/* Set invert df if buffer is not to be freed by H/W */
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			/* Mark mempool object as "put" since it is freed by NIX
+			 */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (!(sg_u & (1ULL << (i + 55))))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		slist++;
+		i++;
+		nb_segs--;
+		if (i > 2 && nb_segs) {
+			i = 0;
+			/* Next SG subdesc */
+			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
+			sg->u = sg_u;
+			sg->segs = 3;
+			sg = (union nix_send_sg_s *)slist;
+			sg_u = sg->u;
+			slist++;
+		}
+		m = m_next;
+	} while (nb_segs);
+
+	sg->u = sg_u;
+	sg->segs = i;
+	segdw = (uint64_t *)slist - (uint64_t *)&cmd[0];
+
+	segdw += 2;
+	/* Round up extra dwords to a multiple of 2 */
+	segdw = (segdw >> 1) + (segdw & 0x1);
+	/* Default dwords */
+	segdw += 1 + !!(flags & NIX_TX_NEED_EXT_HDR) +
+		 !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+	sh->sizem1 = segdw - 1;
+
+	return segdw;
+}
+
+static __rte_always_inline uint8_t
+cn9k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
+			  uint64x2_t *cmd1, const uint32_t flags)
+{
+	union nix_send_hdr_w0_u sh;
+	union nix_send_sg_s sg;
+	uint8_t ret;
+
+	if (m->nb_segs == 1) {
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+			sg.u = vgetq_lane_u64(cmd1[0], 0);
+			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
+			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+		}
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		sg.u = vgetq_lane_u64(cmd1[0], 0);
+		if (!(sg.u & (1ULL << 55)))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		return 2 + !!(flags & NIX_TX_NEED_EXT_HDR) +
+		       !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	sh.u = vgetq_lane_u64(cmd0[0], 0);
+	sg.u = vgetq_lane_u64(cmd1[0], 0);
+
+	ret = cn9k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
+
+	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
+	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+	return ret;
+}
+
 #define NIX_DESCS_PER_LOOP 4
+
+static __rte_always_inline void
+cn9k_nix_xmit_pkts_mseg_vector(uint64x2_t *cmd0, uint64x2_t *cmd1,
+			       uint64x2_t *cmd2, uint64x2_t *cmd3,
+			       uint8_t *segdw,
+			       uint64_t slist[][CNXK_NIX_TX_MSEG_SG_DWORDS - 2],
+			       uint64_t *lmt_addr, rte_iova_t io_addr,
+			       const uint32_t flags)
+{
+	uint64_t lmt_status;
+	uint8_t j, off;
+
+	if (!(flags & NIX_TX_NEED_EXT_HDR) &&
+	    !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+		/* No segments in 4 consecutive packets. */
+		if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd1[0]);
+				vst1q_u64(lmt_addr + 4, cmd0[1]);
+				vst1q_u64(lmt_addr + 6, cmd1[1]);
+				vst1q_u64(lmt_addr + 8, cmd0[2]);
+				vst1q_u64(lmt_addr + 10, cmd1[2]);
+				vst1q_u64(lmt_addr + 12, cmd0[3]);
+				vst1q_u64(lmt_addr + 14, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+
+			return;
+		}
+	}
+
+	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
+		/* Fit consecutive packets in same LMTLINE. */
+		if ((segdw[j] + segdw[j + 1]) <= 8) {
+again0:
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 4;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+
+				vst1q_u64(lmt_addr + 8 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 12 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 14 + off,
+						slist[j + 1], segdw[j + 1] - 4);
+				off += ((segdw[j + 1] - 4) << 1);
+				vst1q_u64(lmt_addr + 14 + off, cmd3[j + 1]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 3;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 12 + off,
+						slist[j + 1], segdw[j + 1] - 3);
+			} else {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 2;
+				roc_lmt_mov_seg(lmt_addr + 4, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 8 + off,
+						slist[j + 1], segdw[j + 1] - 2);
+			}
+			lmt_status = roc_lmt_submit_ldeor(io_addr);
+			if (lmt_status == 0)
+				goto again0;
+			j += 2;
+		} else {
+again1:
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 4;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 3;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+			} else {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 2;
+				roc_lmt_mov_seg(lmt_addr + 4, slist[j], off);
+			}
+			lmt_status = roc_lmt_submit_ldeor(io_addr);
+			if (lmt_status == 0)
+				goto again1;
+			j += 1;
+		}
+	}
+}
+
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			  uint16_t pkts, uint64_t *cmd, const uint16_t flags)
@@ -1380,7 +1611,8 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w0 = vld1q_u64(sx_w0 + 2);
 		}
 
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
+		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
 			xmask23 = xmask01;
@@ -1424,7 +1656,7 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * cnxk_nix_prefree_seg are written before LMTST.
 			 */
 			rte_io_wmb();
-		} else {
+		} else if (!(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
 			mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1472,7 +1704,27 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
 		}
 
-		if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint64_t seg_list[NIX_DESCS_PER_LOOP]
+					 [CNXK_NIX_TX_MSEG_SG_DWORDS - 2];
+			uint8_t j, segdw[NIX_DESCS_PER_LOOP + 1];
+
+			/* Build mseg list for each packet individually. */
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
+				segdw[j] = cn9k_nix_prepare_mseg_vec(tx_pkts[j],
+							seg_list[j], &cmd0[j],
+							&cmd1[j], flags);
+			segdw[4] = 8;
+
+			/* Commit all changes to mbuf before LMTST. */
+			if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+				rte_io_wmb();
+
+			cn9k_nix_xmit_pkts_mseg_vector(cmd0, cmd1, cmd2, cmd3,
+						       segdw, seg_list,
+						       lmt_addr, io_addr,
+						       flags);
+		} else if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* With ext header in the command we can no longer send
 			 * all 4 packets together since LMTLINE is 128bytes.
 			 * Split and Tx twice.
@@ -1534,9 +1786,14 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
-	if (unlikely(pkts_left))
-		pkts += cn9k_nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd,
-					   flags);
+	if (unlikely(pkts_left)) {
+		if (flags & NIX_TX_MULTI_SEG_F)
+			pkts += cn9k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
+							pkts_left, cmd, flags);
+		else
+			pkts += cn9k_nix_xmit_pkts(tx_queue, tx_pkts, pkts_left,
+						   cmd, flags);
+	}
 
 	return pkts;
 }
@@ -1701,6 +1958,9 @@ T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum,	1, 1, 1, 1, 1, 1,	8,	       \
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_##name(       \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_mseg_##name(  \
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn9k_tx_vec_mseg.c b/drivers/net/cnxk/cn9k_tx_vec_mseg.c
new file mode 100644
index 0000000000..0256efd45a
--- /dev/null
+++ b/drivers/net/cnxk/cn9k_tx_vec_mseg.c
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_ethdev.h"
+#include "cn9k_tx.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_mseg_##name(  \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		uint64_t cmd[sz];                                              \
+									       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
+			return 0;                                              \
+		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
+						 (flags) |                     \
+							 NIX_TX_MULTI_SEG_F);  \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
index aa8c7253fb..361f7ce849 100644
--- a/drivers/net/cnxk/meson.build
+++ b/drivers/net/cnxk/meson.build
@@ -26,7 +26,8 @@ sources += files('cn9k_ethdev.c',
 		 'cn9k_rx_vec_mseg.c',
 		 'cn9k_tx.c',
 		 'cn9k_tx_mseg.c',
-		 'cn9k_tx_vec.c')
+		 'cn9k_tx_vec.c',
+		 'cn9k_tx_vec_mseg.c')
 # CN10K
 sources += files('cn10k_ethdev.c',
 		 'cn10k_rte_flow.c',
@@ -36,7 +37,8 @@ sources += files('cn10k_ethdev.c',
 		 'cn10k_rx_vec_mseg.c',
 		 'cn10k_tx.c',
 		 'cn10k_tx_mseg.c',
-		 'cn10k_tx_vec.c')
+		 'cn10k_tx_vec.c',
+		 'cn10k_tx_vec_mseg.c')
 
 deps += ['bus_pci', 'cryptodev', 'eventdev', 'security']
 deps += ['common_cnxk', 'mempool_cnxk']
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 07/13] event/cnxk: add Rx adapter support
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (4 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 06/13] net/cnxk: add multi seg Tx vector routine pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 08/13] event/cnxk: add Rx adapter fastpath ops pbhagavatula
                     ` (6 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao, Ray Kinsella,
	Neil Horman
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter.
Resize the cn10k workslot fastpath structure to fit in a 64B cacheline.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
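 Reviewer note (not part of the commit): applications consume this support
 through the generic event eth Rx adapter API. A minimal usage sketch --
 the helper name, IDs and error handling below are illustrative only:

	#include <string.h>
	#include <rte_event_eth_rx_adapter.h>
	#include <rte_eventdev.h>

	static int
	setup_rx_adapter(uint8_t adptr_id, uint8_t evdev_id,
			 uint16_t eth_port, uint8_t ev_queue,
			 struct rte_event_port_conf *pconf)
	{
		struct rte_event_eth_rx_adapter_queue_conf qconf;
		int rc;

		rc = rte_event_eth_rx_adapter_create(adptr_id, evdev_id, pconf);
		if (rc)
			return rc;

		memset(&qconf, 0, sizeof(qconf));
		qconf.ev.queue_id = ev_queue;
		qconf.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
		/* rx_queue_id of -1 maps all Rx queues of the port. */
		rc = rte_event_eth_rx_adapter_queue_add(adptr_id, eth_port, -1,
							&qconf);
		if (rc)
			return rc;

		return rte_event_eth_rx_adapter_start(adptr_id);
	}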
 doc/guides/eventdevs/cnxk.rst            |  28 ++++
 doc/guides/rel_notes/release_21_08.rst   |   5 +
 drivers/common/cnxk/roc_nix.h            |   3 +
 drivers/common/cnxk/roc_nix_fc.c         |  78 ++++++++++
 drivers/common/cnxk/roc_nix_priv.h       |   3 +-
 drivers/common/cnxk/version.map          |   1 +
 drivers/event/cnxk/cn10k_eventdev.c      | 107 +++++++++++---
 drivers/event/cnxk/cn10k_worker.c        |   7 +-
 drivers/event/cnxk/cn10k_worker.h        |  32 +++--
 drivers/event/cnxk/cn9k_eventdev.c       |  89 ++++++++++++
 drivers/event/cnxk/cn9k_worker.h         |   4 +
 drivers/event/cnxk/cnxk_eventdev.c       |   2 +
 drivers/event/cnxk/cnxk_eventdev.h       |  43 ++++--
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 176 +++++++++++++++++++++++
 drivers/event/cnxk/meson.build           |   9 +-
 15 files changed, 540 insertions(+), 47 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 36da3800cc..b7e82c1273 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -39,6 +39,10 @@ Features of the OCTEON cnxk SSO PMD are:
   time granularity of 2.5us on CN9K and 1us on CN10K.
 - Up to 256 TIM rings a.k.a event timer adapters.
 - Up to 8 rings traversed in parallel.
+- HW managed packets enqueued from ethdev to eventdev exposed through event eth
+  RX adapter.
+- N:1 ethernet device Rx queue to Event queue mapping.
+- Full Rx offload support defined through ethdev queue configuration.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
@@ -93,6 +97,15 @@ Runtime Config Options
 
     -a 0002:0e:00.0,qos=[1-50-50-50]
 
+- ``Force Rx Back pressure``
+
+   Force Rx back pressure when the same mempool is used across the ethernet
+   devices connected to the event device.
+
+   For example::
+
+      -a 0002:0e:00.0,force_rx_bp=1
+
 - ``TIM disable NPA``
 
   By default chunks are allocated from NPA then TIM can automatically free
@@ -160,3 +173,18 @@ Debugging Options
    +---+------------+-------------------------------------------------------+
    | 2 | TIM        | --log-level='pmd\.event\.cnxk\.timer,8'               |
    +---+------------+-------------------------------------------------------+
+
+Limitations
+-----------
+
+Rx adapter support
+~~~~~~~~~~~~~~~~~~
+
+Using the same mempool for all the ethernet device ports connected to the
+event device would cause back pressure to be asserted only on the first
+ethernet device.
+Back pressure is therefore automatically disabled when the same mempool is
+used for all the ethernet devices connected to the event device; applications
+can use the `force_rx_bp=1` device argument to override this behavior.
+Using a unique mempool per ethernet device is recommended when they are
+connected to the event device.
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 31e49e1a56..3892c8017a 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -60,6 +60,11 @@ New Features
   * Added net/cnxk driver which provides the support for the integrated ethernet
     device.
 
+* **Added support for Marvell CN10K and CN9K event Rx adapters.**
+
+  * Added Rx adapter support for event/cnxk when the ethernet device requested is
+    net/cnxk.
+
 
 Removed Items
 -------------
diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
index bb69027956..76613fe84e 100644
--- a/drivers/common/cnxk/roc_nix.h
+++ b/drivers/common/cnxk/roc_nix.h
@@ -514,6 +514,9 @@ int __roc_api roc_nix_fc_mode_set(struct roc_nix *roc_nix,
 
 enum roc_nix_fc_mode __roc_api roc_nix_fc_mode_get(struct roc_nix *roc_nix);
 
+void __roc_api roc_nix_fc_npa_bp_cfg(struct roc_nix *roc_nix, uint64_t pool_id,
+				     uint8_t ena, uint8_t force);
+
 /* NPC */
 int __roc_api roc_nix_npc_promisc_ena_dis(struct roc_nix *roc_nix, int enable);
 
diff --git a/drivers/common/cnxk/roc_nix_fc.c b/drivers/common/cnxk/roc_nix_fc.c
index 47be8aa3f8..f17eba4169 100644
--- a/drivers/common/cnxk/roc_nix_fc.c
+++ b/drivers/common/cnxk/roc_nix_fc.c
@@ -249,3 +249,81 @@ roc_nix_fc_mode_set(struct roc_nix *roc_nix, enum roc_nix_fc_mode mode)
 exit:
 	return rc;
 }
+
+void
+roc_nix_fc_npa_bp_cfg(struct roc_nix *roc_nix, uint64_t pool_id, uint8_t ena,
+		      uint8_t force)
+{
+	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
+	struct npa_lf *lf = idev_npa_obj_get();
+	struct npa_aq_enq_req *req;
+	struct npa_aq_enq_rsp *rsp;
+	struct mbox *mbox;
+	uint32_t limit;
+	int rc;
+
+	if (roc_nix_is_sdp(roc_nix))
+		return;
+
+	if (!lf)
+		return;
+	mbox = lf->mbox;
+
+	req = mbox_alloc_msg_npa_aq_enq(mbox);
+	if (req == NULL)
+		return;
+
+	req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+	req->ctype = NPA_AQ_CTYPE_AURA;
+	req->op = NPA_AQ_INSTOP_READ;
+
+	rc = mbox_process_msg(mbox, (void *)&rsp);
+	if (rc)
+		return;
+
+	limit = rsp->aura.limit;
+	/* BP is already enabled. */
+	if (rsp->aura.bp_ena) {
+		/* If BP ids don't match and force is not set, disable BP. */
+		if ((rsp->aura.nix0_bpid != nix->bpid[0]) && !force) {
+			req = mbox_alloc_msg_npa_aq_enq(mbox);
+			if (req == NULL)
+				return;
+
+			req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+			req->ctype = NPA_AQ_CTYPE_AURA;
+			req->op = NPA_AQ_INSTOP_WRITE;
+
+			req->aura.bp_ena = 0;
+			req->aura_mask.bp_ena = ~(req->aura_mask.bp_ena);
+
+			mbox_process(mbox);
+		}
+		return;
+	}
+
+	/* BP was previously enabled but is now disabled, skip. */
+	if (rsp->aura.bp)
+		return;
+
+	req = mbox_alloc_msg_npa_aq_enq(mbox);
+	if (req == NULL)
+		return;
+
+	req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+	req->ctype = NPA_AQ_CTYPE_AURA;
+	req->op = NPA_AQ_INSTOP_WRITE;
+
+	if (ena) {
+		req->aura.nix0_bpid = nix->bpid[0];
+		req->aura_mask.nix0_bpid = ~(req->aura_mask.nix0_bpid);
+		req->aura.bp = NIX_RQ_AURA_THRESH(
+			limit > 128 ? 256 : limit); /* 95% of size */
+		req->aura_mask.bp = ~(req->aura_mask.bp);
+	}
+
+	req->aura.bp_ena = !!ena;
+	req->aura_mask.bp_ena = ~(req->aura_mask.bp_ena);
+
+	mbox_process(mbox);
+}
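
As a side note on the threshold arithmetic, the standalone sketch below
reproduces the NIX_RQ_AURA_THRESH computation (macro copied from
roc_nix_priv.h; the printed values are the exact integer results):

#include <stdint.h>
#include <stdio.h>

#define NIX_RQ_AURA_THRESH(x) (((x) * 95) / 100)

int
main(void)
{
	uint32_t limits[] = {64, 128, 1024};
	uint32_t i;

	for (i = 0; i < 3; i++) {
		uint32_t limit = limits[i];
		/* Auras larger than 128 use a capped base of 256. */
		uint32_t bp = NIX_RQ_AURA_THRESH(limit > 128 ? 256 : limit);

		printf("limit=%u -> aura.bp=%u\n", limit, bp);
		/* 64 -> 60, 128 -> 121, 1024 -> 243 */
	}
	return 0;
}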
diff --git a/drivers/common/cnxk/roc_nix_priv.h b/drivers/common/cnxk/roc_nix_priv.h
index d9c32df442..9dc0c88a6f 100644
--- a/drivers/common/cnxk/roc_nix_priv.h
+++ b/drivers/common/cnxk/roc_nix_priv.h
@@ -16,7 +16,8 @@
 #define NIX_SQB_LOWER_THRESH ((uint16_t)70)
 
 /* Apply BP/DROP when CQ is 95% full */
-#define NIX_CQ_THRESH_LEVEL (5 * 256 / 100)
+#define NIX_CQ_THRESH_LEVEL	(5 * 256 / 100)
+#define NIX_RQ_AURA_THRESH(x)	(((x) * 95) / 100)
 
 /* IRQ triggered when NIX_LF_CINTX_CNT[QCOUNT] crosses this value */
 #define CQ_CQE_THRESH_DEFAULT	0x1ULL
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8a5c839e57..cb1ce4b6fc 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -29,6 +29,7 @@ INTERNAL {
 	roc_nix_fc_config_set;
 	roc_nix_fc_mode_set;
 	roc_nix_fc_mode_get;
+	roc_nix_fc_npa_bp_cfg;
 	roc_nix_get_base_chan;
 	roc_nix_get_pf;
 	roc_nix_get_pf_func;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index bf4052c76c..2060c8fe84 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -6,18 +6,6 @@
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
-static void
-cn10k_init_hws_ops(struct cn10k_sso_hws *ws, uintptr_t base)
-{
-	ws->tag_wqe_op = base + SSOW_LF_GWS_WQE0;
-	ws->getwrk_op = base + SSOW_LF_GWS_OP_GET_WORK0;
-	ws->updt_wqe_op = base + SSOW_LF_GWS_OP_UPD_WQP_GRP1;
-	ws->swtag_norm_op = base + SSOW_LF_GWS_OP_SWTAG_NORM;
-	ws->swtag_untag_op = base + SSOW_LF_GWS_OP_SWTAG_UNTAG;
-	ws->swtag_flush_op = base + SSOW_LF_GWS_OP_SWTAG_FLUSH;
-	ws->swtag_desched_op = base + SSOW_LF_GWS_OP_SWTAG_DESCHED;
-}
-
 static uint32_t
 cn10k_sso_gw_mode_wdata(struct cnxk_sso_evdev *dev)
 {
@@ -56,7 +44,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
-	cn10k_init_hws_ops(ws, ws->base);
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -135,13 +122,14 @@ cn10k_sso_hws_flush_events(void *hws, uint8_t queue_id, uintptr_t base,
 	cq_ds_cnt &= 0x3FFF3FFF0000;
 
 	while (aq_cnt || cq_ds_cnt || ds_cnt) {
-		plt_write64(req, ws->getwrk_op);
+		plt_write64(req, ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 		cn10k_sso_hws_get_work_empty(ws, &ev);
 		if (fn != NULL && ev.u64 != 0)
 			fn(arg, ev);
 		if (ev.sched_type != SSO_TT_EMPTY)
-			cnxk_sso_hws_swtag_flush(ws->tag_wqe_op,
-						 ws->swtag_flush_op);
+			cnxk_sso_hws_swtag_flush(
+				ws->base + SSOW_LF_GWS_WQE0,
+				ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 		do {
 			val = plt_read64(ws->base + SSOW_LF_GWS_PENDSTATE);
 		} while (val & BIT_ULL(56));
@@ -205,9 +193,11 @@ cn10k_sso_hws_reset(void *arg, void *hws)
 
 	if (CNXK_TT_FROM_TAG(plt_read64(base + SSOW_LF_GWS_PRF_WQE0)) !=
 	    SSO_TT_EMPTY) {
-		plt_write64(BIT_ULL(16) | 1, ws->getwrk_op);
+		plt_write64(BIT_ULL(16) | 1,
+			    ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 		do {
-			roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+			roc_load_pair(gw.u64[0], gw.u64[1],
+				      ws->base + SSOW_LF_GWS_WQE0);
 		} while (gw.u64[0] & BIT_ULL(63));
 		pend_tt = CNXK_TT_FROM_TAG(plt_read64(base + SSOW_LF_GWS_WQE0));
 		if (pend_tt != SSO_TT_EMPTY) { /* Work was pending */
@@ -407,6 +397,80 @@ cn10k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn10k));
 }
 
+static int
+cn10k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem,
+		       void *tstmp_info)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		ws->lookup_mem = lookup_mem;
+		ws->tstamp = tstmp_info;
+	}
+}
+
+static int
+cn10k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cn10k_eth_rxq *rxq;
+	void *lookup_mem;
+	void *tstmp_info;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 8);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+	rxq = eth_dev->data->rx_queues[0];
+	lookup_mem = rxq->lookup_mem;
+	tstmp_info = rxq->tstamp;
+	cn10k_sso_set_priv_mem(event_dev, lookup_mem, tstmp_info);
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 8);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -420,6 +484,12 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.port_unlink = cn10k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,
 
+	.eth_rx_adapter_caps_get = cn10k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn10k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn10k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
@@ -502,6 +572,7 @@ RTE_PMD_REGISTER_PCI_TABLE(event_cn10k, cn10k_pci_sso_map);
 RTE_PMD_REGISTER_KMOD_DEP(event_cn10k, "vfio-pci");
 RTE_PMD_REGISTER_PARAM_STRING(event_cn10k, CNXK_SSO_XAE_CNT "=<int>"
 			      CNXK_SSO_GGRP_QOS "=<string>"
+			      CNXK_SSO_FORCE_BP "=1"
 			      CN10K_SSO_GW_MODE "=<int>"
 			      CNXK_TIM_DISABLE_NPA "=1"
 			      CNXK_TIM_CHNK_SLOTS "=<int>"
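
For context, a hedged application-side sketch of consuming these ops: it
creates an Rx adapter on the event device and binds every Rx queue of one
port. Because the PMD advertises RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT,
no adapter service core is needed. The helper name and numeric values are
illustrative, not part of this patch.

#include <rte_event_eth_rx_adapter.h>
#include <rte_eventdev.h>

static int
rx_adapter_setup(uint8_t evdev_id, uint16_t eth_port, uint8_t ev_queue)
{
	struct rte_event_eth_rx_adapter_queue_conf qconf = {
		.ev.queue_id = ev_queue,
		.ev.sched_type = RTE_SCHED_TYPE_ATOMIC,
		.ev.event_type = RTE_EVENT_TYPE_ETHDEV,
	};
	struct rte_event_port_conf pconf = {
		.new_event_threshold = 4096,
		.dequeue_depth = 1,
		.enqueue_depth = 1,
	};
	const uint8_t id = 0;
	int rc;

	rc = rte_event_eth_rx_adapter_create(id, evdev_id, &pconf);
	if (rc)
		return rc;
	/* rx_queue_id of -1 binds all Rx queues of the port, taking the
	 * rx_queue_id < 0 path in cnxk_sso_rx_adapter_queue_add().
	 */
	rc = rte_event_eth_rx_adapter_queue_add(id, eth_port, -1, &qconf);
	if (rc)
		return rc;
	return rte_event_eth_rx_adapter_start(id);
}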
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index e2aa534c64..5dbae275ba 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -18,7 +18,8 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
 		cn10k_sso_hws_forward_event(ws, ev);
 		break;
 	case RTE_EVENT_OP_RELEASE:
-		cnxk_sso_hws_swtag_flush(ws->tag_wqe_op, ws->swtag_flush_op);
+		cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_WQE0,
+					 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 		break;
 	default:
 		return 0;
@@ -69,7 +70,7 @@ cn10k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
 
 	if (ws->swtag_req) {
 		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
+		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
 		return 1;
 	}
 
@@ -94,7 +95,7 @@ cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
 
 	if (ws->swtag_req) {
 		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
+		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
 		return ret;
 	}
 
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 2f093a8dd5..c7250bf9e7 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN10K_WORKER_H__
 #define __CN10K_WORKER_H__
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
 /* SSO Operations */
 
 static __rte_always_inline uint8_t
@@ -31,7 +35,8 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 {
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
-	const uint8_t cur_tt = CNXK_TT_FROM_TAG(plt_read64(ws->tag_wqe_op));
+	const uint8_t cur_tt =
+		CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
 
 	/* CNXK model
 	 * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
@@ -43,9 +48,11 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 
 	if (new_tt == SSO_TT_UNTAGGED) {
 		if (cur_tt != SSO_TT_UNTAGGED)
-			cnxk_sso_hws_swtag_untag(ws->swtag_untag_op);
+			cnxk_sso_hws_swtag_untag(ws->base +
+						 SSOW_LF_GWS_OP_SWTAG_UNTAG);
 	} else {
-		cnxk_sso_hws_swtag_norm(tag, new_tt, ws->swtag_norm_op);
+		cnxk_sso_hws_swtag_norm(tag, new_tt,
+					ws->base + SSOW_LF_GWS_OP_SWTAG_NORM);
 	}
 	ws->swtag_req = 1;
 }
@@ -57,8 +64,9 @@ cn10k_sso_hws_fwd_group(struct cn10k_sso_hws *ws, const struct rte_event *ev,
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
 
-	plt_write64(ev->u64, ws->updt_wqe_op);
-	cnxk_sso_hws_swtag_desched(tag, new_tt, grp, ws->swtag_desched_op);
+	plt_write64(ev->u64, ws->base + SSOW_LF_GWS_OP_UPD_WQP_GRP1);
+	cnxk_sso_hws_swtag_desched(tag, new_tt, grp,
+				   ws->base + SSOW_LF_GWS_OP_SWTAG_DESCHED);
 }
 
 static __rte_always_inline void
@@ -68,7 +76,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 	const uint8_t grp = ev->queue_id;
 
 	/* Group hasn't changed, Use SWTAG to forward the event */
-	if (CNXK_GRP_FROM_TAG(plt_read64(ws->tag_wqe_op)) == grp)
+	if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
 		cn10k_sso_hws_fwd_swtag(ws, ev);
 	else
 		/*
@@ -93,12 +101,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		PLT_CPU_FEATURE_PREAMBLE
 		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
 		: [wdata] "+r"(gw.get_work)
-		: [gw_loc] "r"(ws->getwrk_op)
+		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
-	plt_write64(gw.u64[0], ws->getwrk_op);
+	plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 	do {
-		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+		roc_load_pair(gw.u64[0], gw.u64[1],
+			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
 #endif
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
@@ -130,11 +139,12 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		     "		tbnz %[tag], 63, rty%=			\n"
 		     "done%=:	dmb ld					\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
-		     : [tag_loc] "r"(ws->tag_wqe_op)
+		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
 #else
 	do {
-		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+		roc_load_pair(gw.u64[0], gw.u64[1],
+			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
 #endif
 
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 0684417eab..072800c243 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -481,6 +481,88 @@ cn9k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn9k));
 }
 
+static int
+cn9k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 9);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn9k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem,
+		      void *tstmp_info)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			dws->lookup_mem = lookup_mem;
+			dws->tstamp = tstmp_info;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			ws->lookup_mem = lookup_mem;
+			ws->tstamp = tstmp_info;
+		}
+	}
+}
+
+static int
+cn9k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cn9k_eth_rxq *rxq;
+	void *lookup_mem;
+	void *tstmp_info;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+
+	rxq = eth_dev->data->rx_queues[0];
+	lookup_mem = rxq->lookup_mem;
+	tstmp_info = rxq->tstamp;
+	cn9k_sso_set_priv_mem(event_dev, lookup_mem, tstmp_info);
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -494,6 +576,12 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.port_unlink = cn9k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,
 
+	.eth_rx_adapter_caps_get = cn9k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn9k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn9k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
@@ -571,6 +659,7 @@ RTE_PMD_REGISTER_PCI_TABLE(event_cn9k, cn9k_pci_sso_map);
 RTE_PMD_REGISTER_KMOD_DEP(event_cn9k, "vfio-pci");
 RTE_PMD_REGISTER_PARAM_STRING(event_cn9k, CNXK_SSO_XAE_CNT "=<int>"
 			      CNXK_SSO_GGRP_QOS "=<string>"
+			      CNXK_SSO_FORCE_BP "=1"
 			      CN9K_SSO_SINGLE_WS "=1"
 			      CNXK_TIM_DISABLE_NPA "=1"
 			      CNXK_TIM_CHNK_SLOTS "=<int>"
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 38fca08fb6..f5a4401465 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN9K_WORKER_H__
 #define __CN9K_WORKER_H__
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
 /* SSO Operations */
 
 static __rte_always_inline uint8_t
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index 7189ee3a79..cfd7fb971c 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -571,6 +571,8 @@ cnxk_sso_parse_devargs(struct cnxk_sso_evdev *dev, struct rte_devargs *devargs)
 			   &dev->xae_cnt);
 	rte_kvargs_process(kvlist, CNXK_SSO_GGRP_QOS, &parse_sso_kvargs_dict,
 			   dev);
+	rte_kvargs_process(kvlist, CNXK_SSO_FORCE_BP, &parse_kvargs_value,
+			   &dev->force_ena_bp);
 	rte_kvargs_process(kvlist, CN9K_SSO_SINGLE_WS, &parse_kvargs_value,
 			   &single_ws);
 	rte_kvargs_process(kvlist, CN10K_SSO_GW_MODE, &parse_kvargs_value,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 668e51d62a..b65d725f55 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -6,6 +6,8 @@
 #define __CNXK_EVENTDEV_H__
 
 #include <rte_devargs.h>
+#include <rte_ethdev.h>
+#include <rte_event_eth_rx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -18,6 +20,7 @@
 
 #define CNXK_SSO_XAE_CNT   "xae_cnt"
 #define CNXK_SSO_GGRP_QOS  "qos"
+#define CNXK_SSO_FORCE_BP  "force_rx_bp"
 #define CN9K_SSO_SINGLE_WS "single_ws"
 #define CN10K_SSO_GW_MODE  "gw_mode"
 
@@ -81,7 +84,10 @@ struct cnxk_sso_evdev {
 	uint64_t nb_xaq_cfg;
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
+	uint64_t rx_offloads;
 	uint64_t adptr_xae_cnt;
+	uint16_t rx_adptr_pool_cnt;
+	uint64_t *rx_adptr_pools;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -89,25 +95,18 @@ struct cnxk_sso_evdev {
 	uint32_t xae_cnt;
 	uint8_t qos_queue_cnt;
 	struct cnxk_sso_qos *qos_parse_data;
+	uint8_t force_ena_bp;
 	/* CN9K */
 	uint8_t dual_ws;
 	/* CN10K */
 	uint8_t gw_mode;
 } __rte_cache_aligned;
 
-/* CN10K HWS ops */
-#define CN10K_SSO_HWS_OPS                                                      \
-	uintptr_t swtag_desched_op;                                            \
-	uintptr_t swtag_flush_op;                                              \
-	uintptr_t swtag_untag_op;                                              \
-	uintptr_t swtag_norm_op;                                               \
-	uintptr_t updt_wqe_op;                                                 \
-	uintptr_t tag_wqe_op;                                                  \
-	uintptr_t getwrk_op
-
 struct cn10k_sso_hws {
-	/* Get Work Fastpath data */
-	CN10K_SSO_HWS_OPS;
+	uint64_t base;
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint32_t gw_wdata;
 	uint8_t swtag_req;
 	uint8_t hws_id;
@@ -115,7 +114,6 @@ struct cn10k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
 	uintptr_t lmt_base;
 } __rte_cache_aligned;
 
@@ -132,6 +130,9 @@ struct cn10k_sso_hws {
 struct cn9k_sso_hws {
 	/* Get Work Fastpath data */
 	CN9K_SSO_HWS_OPS;
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t hws_id;
 	/* Add Work Fastpath data */
@@ -148,6 +149,9 @@ struct cn9k_sso_hws_state {
 struct cn9k_sso_hws_dual {
 	/* Get Work Fastpath data */
 	struct cn9k_sso_hws_state ws_state[2]; /* Ping and Pong */
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t vws; /* Ping pong bit */
 	uint8_t hws_id;
@@ -250,4 +254,17 @@ int cnxk_sso_xstats_reset(struct rte_eventdev *event_dev,
 /* CN9K */
 void cn9k_sso_set_rsrc(void *arg);
 
+/* Common adapter ops */
+int cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf);
+int cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t rx_queue_id);
+int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev);
+int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev);
+
 #endif /* __CNXK_EVENTDEV_H__ */
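
The claim in the commit message that the reworked cn10k work slot keeps its
get-work fastpath fields within one 64B cacheline can be sanity-checked at
compile time. The stand-in struct below only mirrors the leading fields of
cn10k_sso_hws as declared above; it is a sketch, not the real type.

#include <assert.h>
#include <stdint.h>

/* Mirrors the leading (get-work fastpath) fields of cn10k_sso_hws. */
struct hws_fastpath_mirror {
	uint64_t base;
	void *tstamp;	/* struct cnxk_timesync_info * */
	void *lookup_mem;
	uint32_t gw_wdata;
	uint8_t swtag_req;
	uint8_t hws_id;
};

/* 8 + 8 + 8 + 4 + 1 + 1 = 30 bytes, padded to 32: one cacheline. */
static_assert(sizeof(struct hws_fastpath_mirror) <= 64,
	      "fastpath fields must fit a 64B cacheline");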
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 89a1d82c14..24bfd985e7 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -2,6 +2,7 @@
  * Copyright(C) 2021 Marvell.
  */
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 
 void
@@ -11,6 +12,32 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 	int i;
 
 	switch (event_type) {
+	case RTE_EVENT_TYPE_ETHDEV: {
+		struct cnxk_eth_rxq_sp *rxq = data;
+		uint64_t *old_ptr;
+
+		for (i = 0; i < dev->rx_adptr_pool_cnt; i++) {
+			if ((uint64_t)rxq->qconf.mp == dev->rx_adptr_pools[i])
+				return;
+		}
+
+		dev->rx_adptr_pool_cnt++;
+		old_ptr = dev->rx_adptr_pools;
+		dev->rx_adptr_pools = rte_realloc(
+			dev->rx_adptr_pools,
+			sizeof(uint64_t) * dev->rx_adptr_pool_cnt, 0);
+		if (dev->rx_adptr_pools == NULL) {
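+			/* Tracking array grow failed: keep the old array
+			 * but still account this pool's XAE contribution.
+			 */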
+			dev->adptr_xae_cnt += rxq->qconf.mp->size;
+			dev->rx_adptr_pools = old_ptr;
+			dev->rx_adptr_pool_cnt--;
+			return;
+		}
+		dev->rx_adptr_pools[dev->rx_adptr_pool_cnt - 1] =
+			(uint64_t)rxq->qconf.mp;
+
+		dev->adptr_xae_cnt += rxq->qconf.mp->size;
+		break;
+	}
 	case RTE_EVENT_TYPE_TIMER: {
 		struct cnxk_tim_ring *timr = data;
 		uint16_t *old_ring_ptr;
@@ -65,3 +92,152 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		break;
 	}
 }
+
+static int
+cnxk_sso_rxq_enable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id,
+		    uint16_t port_id, const struct rte_event *ev,
+		    uint8_t custom_flowid)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 1;
+	rq->tt = ev->sched_type;
+	rq->hwgrp = ev->queue_id;
+	rq->flow_tag_width = 20;
+	rq->wqe_skip = 1;
+	rq->tag_mask = (port_id & 0xF) << 20;
+	rq->tag_mask |= (((port_id >> 4) & 0xF) | (RTE_EVENT_TYPE_ETHDEV << 4))
+			<< 24;
+
+	if (custom_flowid) {
+		rq->flow_tag_width = 0;
+		rq->tag_mask |= ev->flow_id;
+	}
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+static int
+cnxk_sso_rxq_disable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 0;
+	rq->flow_tag_width = 32;
+	rq->tag_mask = 0;
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+int
+cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t port = eth_dev->data->port_id;
+	struct cnxk_eth_rxq_sp *rxq_sp;
+	int i, rc = 0;
+
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			rxq_sp = eth_dev->data->rx_queues[i];
+			rxq_sp = rxq_sp - 1;
+			cnxk_sso_updt_xae_cnt(dev, rxq_sp,
+					      RTE_EVENT_TYPE_ETHDEV);
+			rc = cnxk_sso_xae_reconfigure(
+				(struct rte_eventdev *)(uintptr_t)event_dev);
+			rc |= cnxk_sso_rxq_enable(
+				cnxk_eth_dev, i, port, &queue_conf->ev,
+				!!(queue_conf->rx_queue_flags &
+				RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+			roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+					      rxq_sp->qconf.mp->pool_id, true,
+					      dev->force_ena_bp);
+		}
+	} else {
+		rxq_sp = eth_dev->data->rx_queues[rx_queue_id];
+		rxq_sp = rxq_sp - 1;
+		cnxk_sso_updt_xae_cnt(dev, rxq_sp, RTE_EVENT_TYPE_ETHDEV);
+		rc = cnxk_sso_xae_reconfigure(
+			(struct rte_eventdev *)(uintptr_t)event_dev);
+		rc |= cnxk_sso_rxq_enable(
+			cnxk_eth_dev, (uint16_t)rx_queue_id, port,
+			&queue_conf->ev,
+			!!(queue_conf->rx_queue_flags &
+			   RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+		roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+				      rxq_sp->qconf.mp->pool_id, true,
+				      dev->force_ena_bp);
+	}
+
+	if (rc < 0) {
+		plt_err("Failed to configure Rx adapter port=%d, q=%d", port,
+			queue_conf->ev.queue_id);
+		return rc;
+	}
+
+	dev->rx_offloads |= cnxk_eth_dev->rx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	struct cnxk_eth_rxq_sp *rxq_sp;
+	int i, rc = 0;
+
+	RTE_SET_USED(event_dev);
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			rxq_sp = eth_dev->data->rx_queues[i];
+			rxq_sp = rxq_sp - 1;
+			rc = cnxk_sso_rxq_disable(cnxk_eth_dev, i);
+			roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+					      rxq_sp->qconf.mp->pool_id, false,
+					      dev->force_ena_bp);
+		}
+	} else {
+		rxq_sp = eth_dev->data->rx_queues[rx_queue_id];
+		rxq_sp = rxq_sp - 1;
+		rc = cnxk_sso_rxq_disable(cnxk_eth_dev, (uint16_t)rx_queue_id);
+		roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+				      rxq_sp->qconf.mp->pool_id, false,
+				      dev->force_ena_bp);
+	}
+
+	if (rc < 0)
+		plt_err("Failed to clear Rx adapter config port=%d, q=%d",
+			eth_dev->data->port_id, rx_queue_id);
+
+	return rc;
+}
+
+int
+cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			  const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			 const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
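
To make the tag_mask layout in cnxk_sso_rxq_enable() easier to follow, here
is a standalone round-trip sketch of the encoding (bit positions inferred
from the shifts above; RTE_EVENT_TYPE_ETHDEV is 0, so the type nibble is
zero in practice):

#include <assert.h>
#include <stdint.h>

#define EV_TYPE_ETHDEV 0x0 /* RTE_EVENT_TYPE_ETHDEV */

int
main(void)
{
	uint16_t port = 0x5A; /* only the low 8 bits are encoded */
	/* bits [23:20] = port[3:0], [27:24] = port[7:4], [31:28] = type */
	uint32_t tag = ((uint32_t)(port & 0xF) << 20) |
		       ((uint32_t)(((port >> 4) & 0xF) |
				   (EV_TYPE_ETHDEV << 4)) << 24);
	uint16_t dport = ((tag >> 20) & 0xF) | (((tag >> 24) & 0xF) << 4);

	assert(dport == port && (tag >> 28) == EV_TYPE_ETHDEV);
	return 0;
}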
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index 87bb9f76a9..eda562f5b5 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -21,4 +21,11 @@ sources = files(
         'cnxk_tim_worker.c',
 )
 
-deps += ['bus_pci', 'common_cnxk']
+extra_flags = ['-flax-vector-conversions', '-Wno-strict-aliasing']
+foreach flag: extra_flags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
+deps += ['bus_pci', 'common_cnxk', 'net_cnxk']
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 08/13] event/cnxk: add Rx adapter fastpath ops
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (5 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 07/13] event/cnxk: add Rx adapter support pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 09/13] event/cnxk: add Tx adapter support pbhagavatula
                     ` (5 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter fastpath operations.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c           | 136 +++++++-
 drivers/event/cnxk/cn10k_worker.c             |  54 ----
 drivers/event/cnxk/cn10k_worker.h             |  97 +++++-
 drivers/event/cnxk/cn10k_worker_deq.c         |  44 +++
 drivers/event/cnxk/cn10k_worker_deq_burst.c   |  29 ++
 drivers/event/cnxk/cn10k_worker_deq_tmo.c     |  72 +++++
 drivers/event/cnxk/cn9k_eventdev.c            | 305 +++++++++++++++++-
 drivers/event/cnxk/cn9k_worker.c              | 117 -------
 drivers/event/cnxk/cn9k_worker.h              | 174 ++++++++--
 drivers/event/cnxk/cn9k_worker_deq.c          |  44 +++
 drivers/event/cnxk/cn9k_worker_deq_burst.c    |  29 ++
 drivers/event/cnxk/cn9k_worker_deq_tmo.c      |  72 +++++
 drivers/event/cnxk/cn9k_worker_dual_deq.c     |  53 +++
 .../event/cnxk/cn9k_worker_dual_deq_burst.c   |  30 ++
 drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c |  89 +++++
 drivers/event/cnxk/cnxk_eventdev.h            |   1 +
 drivers/event/cnxk/meson.build                |   9 +
 17 files changed, 1124 insertions(+), 231 deletions(-)
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq_tmo.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq_tmo.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 2060c8fe84..ba7d95fff7 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -237,17 +237,141 @@ static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	const event_dequeue_t sso_hws_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_tmo_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
 
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn10k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn10k_sso_hws_deq;
-	event_dev->dequeue_burst = cn10k_sso_hws_deq_burst;
-	if (dev->is_timeout_deq) {
-		event_dev->dequeue = cn10k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn10k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 }
 
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 5dbae275ba..c71aa37327 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -60,57 +60,3 @@ cn10k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 	return 1;
 }
-
-uint16_t __rte_hot
-cn10k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
-		return 1;
-	}
-
-	return cn10k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-			uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
-		return ret;
-	}
-
-	ret = cn10k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn10k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index c7250bf9e7..b724083caa 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -87,20 +87,37 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 		cn10k_sso_hws_fwd_group(ws, ev, grp);
 }
 
+static __rte_always_inline void
+cn10k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		  const uint32_t tag, const uint32_t flags,
+		  const void *const lookup_mem)
+{
+	const uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+				   (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
+
+	cn10k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			      (struct rte_mbuf *)mbuf, lookup_mem,
+			      mbuf_init | ((uint64_t)port_id) << 48, flags);
+}
+
 static __rte_always_inline uint16_t
-cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
+cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       const uint32_t flags, void *lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
 		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		: [wdata] "+r"(gw.get_work)
+		"sub %[mbuf], %H[wdata], #0x80				\n"
+		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -109,11 +126,34 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, flags,
+					  lookup_mem);
+			/* Extract the tstamp if PTP is enabled */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						ws->tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -128,6 +168,7 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -138,7 +179,9 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=			\n"
 		     "done%=:	dmb ld					\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80		\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
 #else
@@ -146,12 +189,25 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -170,16 +226,29 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 					       const struct rte_event ev[],
 					       uint16_t nb_events);
 
-uint16_t __rte_hot cn10k_sso_hws_deq(void *port, struct rte_event *ev,
-				     uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					   uint16_t nb_events,
-					   uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
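
A note on the `sub %[mbuf], %H[wdata], #0x80` and
`gw.u64[1] - sizeof(struct rte_mbuf)` arithmetic above: the SSO hands back
the WQE address, and the rte_mbuf header is laid out immediately before it,
so subtracting the 128-byte mbuf header recovers the mbuf pointer. The
hard-coded 0x80 relies on DPDK's guarantee that the header is exactly two
minimum cachelines, which can be pinned at compile time:

#include <assert.h>
#include <rte_common.h>
#include <rte_mbuf_core.h>

/* The 0x80 in the inline asm is valid only while this holds. */
static_assert(sizeof(struct rte_mbuf) == 2 * RTE_CACHE_LINE_MIN_SIZE,
	      "mbuf header must be exactly 128 bytes");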
diff --git a/drivers/event/cnxk/cn10k_worker_deq.c b/drivers/event/cnxk/cn10k_worker_deq.c
new file mode 100644
index 0000000000..36ec454ccc
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq.c
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn10k_sso_hws_get_work(                                 \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
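
NIX_RX_FASTPATH_MODES itself comes from the net/cnxk Rx headers and is not
shown in this patch, so a reduced illustration of the X-macro pattern it
drives may help: the list macro invokes R() once per offload combination,
stamping out one specialized function per entry (the modes below are
invented for illustration):

#include <stdio.h>

/* Toy stand-in for NIX_RX_FASTPATH_MODES: name, flags */
#define FASTPATH_MODES			\
	R(no_offload, 0x0)		\
	R(rss, 0x1)			\
	R(csum_rss, 0x3)

/* Stamp out one function per mode, as cn10k_worker_deq.c does. */
#define R(name, flags)							\
	static void deq_##name(void)					\
	{								\
		printf("deq variant %s, flags=0x%x\n", #name, flags);	\
	}
FASTPATH_MODES
#undef R

int
main(void)
{
	deq_no_offload();
	deq_rss();
	deq_csum_rss();
	return 0;
}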
diff --git a/drivers/event/cnxk/cn10k_worker_deq_burst.c b/drivers/event/cnxk/cn10k_worker_deq_burst.c
new file mode 100644
index 0000000000..29ecc551cf
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq_burst.c
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_##name(port, ev, timeout_ticks);      \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);  \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn10k_worker_deq_tmo.c b/drivers/event/cnxk/cn10k_worker_deq_tmo.c
new file mode 100644
index 0000000000..c8524a27bd
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq_tmo.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_tmo_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_tmo_seg_##name(port, ev,              \
+							timeout_ticks);        \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 072800c243..e386cb784a 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -252,17 +252,202 @@ static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	/* Single WS modes */
+	const event_dequeue_t sso_hws_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_tmo[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_tmo_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_tmo_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_deq_tmo_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	/* Dual WS modes */
+	const event_dequeue_t sso_hws_dual_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_dual_deq_tmo[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_tmo_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	const event_dequeue_t sso_hws_dual_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	const event_dequeue_t sso_hws_dual_deq_tmo_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_tmo_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
 
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn9k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn9k_sso_hws_deq;
-	event_dev->dequeue_burst = cn9k_sso_hws_deq_burst;
-	if (dev->deq_tmo_ns) {
-		event_dev->dequeue = cn9k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_deq_tmo_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_deq_tmo_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_deq_tmo
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_deq_tmo_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 
 	if (dev->dual_ws) {
@@ -272,14 +457,110 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		event_dev->enqueue_forward_burst =
 			cn9k_sso_hws_dual_enq_fwd_burst;
 
-		event_dev->dequeue = cn9k_sso_hws_dual_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_dual_deq_burst;
-		if (dev->deq_tmo_ns) {
-			event_dev->dequeue = cn9k_sso_hws_dual_tmo_deq;
-			event_dev->dequeue_burst =
-				cn9k_sso_hws_dual_tmo_deq_burst;
+		if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+			event_dev->dequeue = sso_hws_dual_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_deq_tmo_seg
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_TSTAMP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_deq_tmo_seg_burst
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_TSTAMP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_RSS_F)];
+			}
+		} else {
+			event_dev->dequeue = sso_hws_dual_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_deq_tmo
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_TSTAMP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_deq_tmo_burst
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_TSTAMP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_RSS_F)];
+			}
 		}
 	}
+
+	rte_mb();
 }
 
 static void *
diff --git a/drivers/event/cnxk/cn9k_worker.c b/drivers/event/cnxk/cn9k_worker.c
index 9ceacc98dd..538bc4b0b3 100644
--- a/drivers/event/cnxk/cn9k_worker.c
+++ b/drivers/event/cnxk/cn9k_worker.c
@@ -60,60 +60,6 @@ cn9k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 	return 1;
 }
 
-uint16_t __rte_hot
-cn9k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return 1;
-	}
-
-	return cn9k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-		       uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn9k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			   uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
-
 /* Dual ws ops. */
 
 uint16_t __rte_hot
@@ -171,66 +117,3 @@ cn9k_sso_hws_dual_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 	return 1;
 }
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t gw;
-
-	RTE_SET_USED(timeout_ticks);
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return 1;
-	}
-
-	gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					&dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	return gw;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-			  uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					 &dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {
-		ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-						 &dws->ws_state[!dws->vws], ev);
-		dws->vws = !dws->vws;
-	}
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq_burst(void *port, struct rte_event ev[],
-				uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_tmo_deq(port, ev, timeout_ticks);
-}
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index f5a4401465..c01c00e1da 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -128,17 +128,36 @@ cn9k_sso_hws_dual_forward_event(struct cn9k_sso_hws_dual *dws,
 	}
 }
 
+static __rte_always_inline void
+cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		 const uint32_t tag, const uint32_t flags,
+		 const void *const lookup_mem)
+{
+	const uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+				   (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
+
+	cn9k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			     (struct rte_mbuf *)mbuf, lookup_mem,
+			     mbuf_init | ((uint64_t)port_id) << 48, flags);
+}
+
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 			   struct cn9k_sso_hws_state *ws_pair,
-			   struct rte_event *ev)
+			   struct rte_event *ev, const uint32_t flags,
+			   const void *const lookup_mem,
+			   struct cnxk_timesync_info *const tstamp)
 {
 	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -147,7 +166,10 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	str %[gw], [%[pong]]		\n"
 		     "		dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op),
 		       [gw] "r"(set_gw), [pong] "r"(ws_pair->getwrk_op));
 #else
@@ -156,12 +178,34 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		gw.u64[0] = plt_read64(ws->tag_op);
 	gw.u64[1] = plt_read64(ws->wqp_op);
 	plt_write64(set_gw, ws_pair->getwrk_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			/* Extract the timestamp if PTP is enabled. */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -169,16 +213,22 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 }
 
 static __rte_always_inline uint16_t
-cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
+cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
+		      const uint32_t flags, const void *const lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
 	plt_write64(BIT_ULL(16) | /* wait for work. */
 			    1,	  /* Use Mask set 0. */
 		    ws->getwrk_op);
+
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
@@ -190,7 +240,10 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -198,12 +251,35 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			/* Extract the timestamp if PTP is enabled. */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						ws->tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -218,6 +294,7 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -230,7 +307,9 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -238,12 +317,25 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -274,28 +366,54 @@ uint16_t __rte_hot cn9k_sso_hws_dual_enq_fwd_burst(void *port,
 						   const struct rte_event ev[],
 						   uint16_t nb_events);
 
-uint16_t __rte_hot cn9k_sso_hws_deq(void *port, struct rte_event *ev,
-				    uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					  uint16_t nb_events,
-					  uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-					      uint16_t nb_events,
-					      uint64_t timeout_ticks);
-
-uint16_t __rte_hot cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-					     uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_burst(void *port,
-						   struct rte_event ev[],
-						   uint16_t nb_events,
-						   uint64_t timeout_ticks);
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
diff --git a/drivers/event/cnxk/cn9k_worker_deq.c b/drivers/event/cnxk/cn9k_worker_deq.c
new file mode 100644
index 0000000000..51ccaf4ec4
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq.c
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn9k_sso_hws_get_work(                                  \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_deq_burst.c b/drivers/event/cnxk/cn9k_worker_deq_burst.c
new file mode 100644
index 0000000000..4e2801459b
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq_burst.c
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_##name(port, ev, timeout_ticks);       \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_deq_tmo.c b/drivers/event/cnxk/cn9k_worker_deq_tmo.c
new file mode 100644
index 0000000000..9713d1ef00
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq_tmo.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_tmo_##name(port, ev, timeout_ticks);   \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_tmo_seg_##name(port, ev,               \
+						       timeout_ticks);         \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq.c b/drivers/event/cnxk/cn9k_worker_dual_deq.c
new file mode 100644
index 0000000000..709fa2d9ef
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq.c
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+									       \
+		gw = cn9k_sso_hws_dual_get_work(                               \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+									       \
+		gw = cn9k_sso_hws_dual_get_work(                               \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c b/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
new file mode 100644
index 0000000000..d50e1cf83f
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_seg_##name(port, ev,              \
+							timeout_ticks);        \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c b/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c
new file mode 100644
index 0000000000..a0508fdf0d
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_dual_get_work(                              \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem, dws->tstamp);                 \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_tmo_##name(port, ev,              \
+							timeout_ticks);        \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_dual_get_work(                              \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem, dws->tstamp);                 \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_tmo_seg_##name(port, ev,          \
+							    timeout_ticks);    \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index b65d725f55..9d5d2d0339 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -33,6 +33,7 @@
 #define CNXK_SSO_MZ_NAME       "cnxk_evdev_mz"
 #define CNXK_SSO_XAQ_CACHE_CNT (0x7)
 #define CNXK_SSO_XAQ_SLACK     (8)
+#define CNXK_SSO_WQE_SG_PTR    (9)
 
 #define CNXK_TT_FROM_TAG(x)	    (((x) >> 32) & SSO_TT_EMPTY)
 #define CNXK_TT_FROM_EVENT(x)	    (((x) >> 38) & SSO_TT_EMPTY)
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index eda562f5b5..c5c1c0ee8e 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -11,8 +11,17 @@ endif
 sources = files(
         'cn9k_eventdev.c',
         'cn9k_worker.c',
+        'cn9k_worker_deq.c',
+        'cn9k_worker_deq_burst.c',
+        'cn9k_worker_deq_tmo.c',
+        'cn9k_worker_dual_deq.c',
+        'cn9k_worker_dual_deq_burst.c',
+        'cn9k_worker_dual_deq_tmo.c',
         'cn10k_eventdev.c',
         'cn10k_worker.c',
+        'cn10k_worker_deq.c',
+        'cn10k_worker_deq_burst.c',
+        'cn10k_worker_deq_tmo.c',
         'cnxk_eventdev.c',
         'cnxk_eventdev_adptr.c',
         'cnxk_eventdev_selftest.c',
-- 
2.17.1
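
The R()/NIX_RX_FASTPATH_MODES pattern used throughout the patch above is an
X-macro: the same mode list is expanded once to emit one specialized dequeue
function per combination of six Rx offload flags, and expanded again to build
the [2][2][2][2][2][2] dispatch tables that cn9k_sso_fp_fns_set() indexes with
!!(rx_offloads & FLAG) along each dimension. Below is a minimal standalone
sketch of the technique with two flags instead of six; every name in it
(DEMO_MODES, demo_fn_*) is illustrative, not from the driver.

#include <stdio.h>

#define F_A (1 << 0)
#define F_B (1 << 1)

/* One entry per flag combination: R(name, bit_b, bit_a, flags). */
#define DEMO_MODES                                                     \
	R(none, 0, 0, 0)                                               \
	R(a, 0, 1, F_A)                                                \
	R(b, 1, 0, F_B)                                                \
	R(ab, 1, 1, F_A | F_B)

/* First expansion: generate one specialized function per mode. */
#define R(name, fb, fa, flags)                                         \
	static int demo_fn_##name(void) { return (flags); }
DEMO_MODES
#undef R

int
main(void)
{
	/* Second expansion: build the dispatch table. */
	static int (*const tbl[2][2])(void) = {
#define R(name, fb, fa, flags) [fb][fa] = demo_fn_##name,
		DEMO_MODES
#undef R
	};
	unsigned int rt_flags = F_A | F_B; /* e.g. negotiated offloads */

	/* !! collapses each flag to 0/1 to pick the specialization. */
	printf("%d\n", tbl[!!(rt_flags & F_B)][!!(rt_flags & F_A)]());
	return 0;
}

Generating full specializations this way trades binary size (hence the
per-variant .c files added to meson.build) for branch-free fastpaths: the
offload flags are tested once at setup rather than once per event.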


* [dpdk-dev] [PATCH v2 09/13] event/cnxk: add Tx adapter support
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (6 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 08/13] event/cnxk: add Rx adapter fastpath ops pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 10/13] event/cnxk: add Tx adapter fastpath ops pbhagavatula
                     ` (4 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
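 For context, applications reach the code added here through the generic
 Tx adapter control API. A minimal setup sketch, assuming adapter id 0 and
 the "add all queues" shortcut (both illustrative choices, not values
 mandated by this patch):

#include <rte_event_eth_tx_adapter.h>

static int
demo_txa_setup(uint8_t evdev_id, uint16_t ethdev_id,
	       struct rte_event_port_conf *port_conf)
{
	const uint8_t id = 0; /* adapter instance id, assumed */
	int rc;

	rc = rte_event_eth_tx_adapter_create(id, evdev_id, port_conf);
	if (rc)
		return rc;
	/* tx_queue_id of -1 adds every Tx queue of the ethdev; this is
	 * the tx_queue_id < 0 branch handled in this patch. */
	rc = rte_event_eth_tx_adapter_queue_add(id, ethdev_id, -1);
	if (rc)
		return rc;
	return rte_event_eth_tx_adapter_start(id);
}

 Because the PMD advertises RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT, no
 service core is created for this adapter; enqueues go straight to the
 device.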
 doc/guides/eventdevs/cnxk.rst            |   4 +-
 doc/guides/rel_notes/release_21_08.rst   |   6 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  91 ++++++++++++++++++
 drivers/event/cnxk/cn9k_eventdev.c       | 117 +++++++++++++++++++++++
 drivers/event/cnxk/cnxk_eventdev.h       |  21 +++-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 106 ++++++++++++++++++++
 6 files changed, 339 insertions(+), 6 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index b7e82c1273..6fdccc2ab4 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -42,7 +42,9 @@ Features of the OCTEON cnxk SSO PMD are:
 - HW managed packets enqueued from ethdev to eventdev exposed through event eth
   RX adapter.
 - N:1 ethernet device Rx queue to Event queue mapping.
-- Full Rx offload support defined through ethdev queue configuration.
+- Lockfree Tx from event eth Tx adapter using ``DEV_TX_OFFLOAD_MT_LOCKFREE``
+  capability while maintaining receive packet order.
+- Full Rx/Tx offload support defined through ethdev queue configuration.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 3892c8017a..80ff93269c 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -60,10 +60,10 @@ New Features
   * Added net/cnxk driver which provides the support for the integrated ethernet
     device.
 
-* **Added support for Marvell CN10K, CN9K, event Rx adapter.**
+* **Added support for Marvell CN10K, CN9K, event Rx/Tx adapter.**
 
-  * Added Rx adapter support for event/cnxk when the ethernet device requested is
-    net/cnxk.
+  * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
+    is net/cnxk.
 
 
 Removed Items
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index ba7d95fff7..8a9b04a3db 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -44,6 +44,7 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
+	ws->tx_base = ws->base;
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -233,6 +234,39 @@ cn10k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		void *ws_cookie;
+
+		ws_cookie = cnxk_sso_hws_get_cookie(ws);
+		ws_cookie = rte_realloc_socket(
+			ws_cookie,
+			sizeof(struct cnxk_sso_hws_cookie) +
+				sizeof(struct cn10k_sso_hws) +
+				(sizeof(uint64_t) * (dev->max_port_id + 1) *
+				 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+		if (ws_cookie == NULL)
+			return -ENOMEM;
+		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
+		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+		       sizeof(uint64_t) * (dev->max_port_id + 1) *
+			       RTE_MAX_QUEUES_PER_PORT);
+		event_dev->data->ports[i] = ws;
+	}
+
+	return 0;
+}
+
 static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -493,6 +527,10 @@ cn10k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn10k_sso_hws_reset,
 			    cn10k_sso_hws_flush_events);
 	if (rc < 0)
@@ -595,6 +633,55 @@ cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn10k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -614,6 +701,10 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn10k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn10k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn10k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index e386cb784a..bdc5632235 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -248,6 +248,66 @@ cn9k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(dws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws_dual) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			dws = RTE_PTR_ADD(ws_cookie,
+					  sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = dws;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(ws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			ws = RTE_PTR_ADD(ws_cookie,
+					 sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = ws;
+		}
+	}
+	rte_mb();
+
+	return 0;
+}
+
 static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -734,6 +794,10 @@ cn9k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn9k_sso_hws_reset,
 			    cn9k_sso_hws_flush_events);
 	if (rc < 0)
@@ -844,6 +908,55 @@ cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn9k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn9k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -863,6 +976,10 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn9k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn9k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn9k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 9d5d2d0339..458fdc8d92 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -8,6 +8,7 @@
 #include <rte_devargs.h>
 #include <rte_ethdev.h>
 #include <rte_event_eth_rx_adapter.h>
+#include <rte_event_eth_tx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -86,9 +87,12 @@ struct cnxk_sso_evdev {
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
 	uint64_t rx_offloads;
+	uint64_t tx_offloads;
 	uint64_t adptr_xae_cnt;
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
+	uint64_t *tx_adptr_data;
+	uint16_t max_port_id;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -115,7 +119,10 @@ struct cn10k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
+	/* Tx Fastpath data */
+	uint64_t tx_base __rte_cache_aligned;
 	uintptr_t lmt_base;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 /* CN9K HWS ops */
@@ -140,7 +147,9 @@ struct cn9k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
+	/* Tx Fastpath data */
+	uint64_t base __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cn9k_sso_hws_state {
@@ -160,7 +169,9 @@ struct cn9k_sso_hws_dual {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base[2];
+	/* Tx Fastpath data */
+	uint64_t base[2] __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cnxk_sso_hws_cookie {
@@ -267,5 +278,11 @@ int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev);
 int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 			     const struct rte_eth_dev *eth_dev);
+int cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
+int cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
 
 #endif /* __CNXK_EVENTDEV_H__ */
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 24bfd985e7..548d7b81ce 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -5,6 +5,8 @@
 #include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 
+#define CNXK_SSO_SQB_LIMIT (0x180)
+
 void
 cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		      uint32_t event_type)
@@ -241,3 +243,107 @@ cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 
 	return 0;
 }
+
+static int
+cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
+{
+	uint16_t sqb_limit;
+
+	sqb_limit = RTE_MIN(nb_sqb_bufs, sq->nb_sqb_bufs);
+	return roc_npa_aura_limit_modify(sq->aura_handle, sqb_limit);
+}
+
+static int
+cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
+			    uint16_t eth_port_id, uint16_t tx_queue_id,
+			    void *txq)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_port_id = dev->max_port_id;
+	uint64_t *txq_data = dev->tx_adptr_data;
+
+	if (txq_data == NULL || eth_port_id > max_port_id) {
+		max_port_id = RTE_MAX(max_port_id, eth_port_id);
+		txq_data = rte_realloc_socket(
+			txq_data,
+			(sizeof(uint64_t) * (max_port_id + 1) *
+			 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+		if (txq_data == NULL)
+			return -ENOMEM;
+	}
+
+	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
+		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
+	dev->max_port_id = max_port_id;
+	dev->tx_adptr_data = txq_data;
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	struct roc_nix_sq *sq;
+	int i, ret;
+	void *txq;
+
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			txq = eth_dev->data->tx_queues[i];
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i, txq);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		txq = eth_dev->data->tx_queues[tx_queue_id];
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
+		if (ret < 0)
+			return ret;
+	}
+
+	dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct roc_nix_sq *sq;
+	int i, ret;
+
+	RTE_SET_USED(event_dev);
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i,
+				NULL);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 10/13] event/cnxk: add Tx adapter fastpath ops
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (7 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 09/13] event/cnxk: add Tx adapter support pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 11/13] event/cnxk: add Rx adapter vector support pbhagavatula
                     ` (3 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter fastpath operations.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
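 For reference, a minimal usage sketch (not part of this patch) showing
 how an application reaches these fastpath ops once the Tx adapter is
 created with the INTERNAL_PORT capability; evdev_id, ev_port, ethdev_id
 and txq_id below are placeholder values:

 #include <rte_event_eth_tx_adapter.h>

 static inline void
 app_tx_one(uint8_t evdev_id, uint8_t ev_port, struct rte_mbuf *m,
 	   uint16_t ethdev_id, uint16_t txq_id)
 {
 	struct rte_event ev = {0};

 	/* The destination Tx queue is carried in the mbuf. */
 	m->port = ethdev_id;
 	rte_event_eth_tx_adapter_txq_set(m, txq_id);

 	ev.event_type = RTE_EVENT_TYPE_CPU;
 	ev.sched_type = RTE_SCHED_TYPE_ATOMIC;
 	ev.mbuf = m;

 	/* Dispatches to the txa_enqueue handler selected by
 	 * cn9k/cn10k_sso_fp_fns_set() below.
 	 */
 	rte_event_eth_tx_adapter_enqueue(evdev_id, ev_port, &ev, 1, 0);
 }
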
 drivers/event/cnxk/cn10k_eventdev.c           | 38 ++++++++
 drivers/event/cnxk/cn10k_worker.h             | 67 ++++++++++++++
 drivers/event/cnxk/cn10k_worker_tx_enq.c      | 23 +++++
 drivers/event/cnxk/cn10k_worker_tx_enq_seg.c  | 23 +++++
 drivers/event/cnxk/cn9k_eventdev.c            | 81 +++++++++++++++++
 drivers/event/cnxk/cn9k_worker.h              | 87 +++++++++++++++++++
 drivers/event/cnxk/cn9k_worker_dual_tx_enq.c  | 23 +++++
 .../event/cnxk/cn9k_worker_dual_tx_enq_seg.c  | 23 +++++
 drivers/event/cnxk/cn9k_worker_tx_enq.c       | 23 +++++
 drivers/event/cnxk/cn9k_worker_tx_enq_seg.c   | 23 +++++
 drivers/event/cnxk/meson.build                |  6 ++
 11 files changed, 417 insertions(+)
 create mode 100644 drivers/event/cnxk/cn10k_worker_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_tx_enq_seg.c

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 8a9b04a3db..e462f770c5 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -328,6 +328,23 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 		};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
@@ -407,6 +424,27 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
 		}
 	}
+
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 }
 
 static void
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index b724083caa..3c90c85009 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn10k_ethdev.h"
 #include "cn10k_rx.h"
+#include "cn10k_tx.h"
 
 /* SSO Operations */
 
@@ -251,4 +252,70 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn10k_eth_txq *
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn10k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       uint64_t *cmd,
+		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       const uint32_t flags)
+{
+	const struct cn10k_eth_txq *txq;
+	struct rte_mbuf *m = ev->mbuf;
+	uint16_t ref_cnt = m->refcnt;
+	uintptr_t lmt_addr;
+	uint16_t lmt_id;
+	uintptr_t pa;
+
+	lmt_addr = ws->lmt_base;
+	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
+	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F)
+		cn10k_nix_xmit_prepare_tso(m, flags);
+
+	cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags, txq->lso_tun_fmt);
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw =
+			cn10k_nix_prepare_mseg(m, (uint64_t *)lmt_addr, flags);
+		pa = txq->io_addr | ((segdw - 1) << 4);
+	} else {
+		pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
+	}
+	if (!ev->sched_type)
+		cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
+
+	roc_lmt_submit_steorl(lmt_id, pa);
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
+				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_seg_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
diff --git a/drivers/event/cnxk/cn10k_worker_tx_enq.c b/drivers/event/cnxk/cn10k_worker_tx_enq.c
new file mode 100644
index 0000000000..f9968ac0d0
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c b/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
new file mode 100644
index 0000000000..a24fc42e5a
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index bdc5632235..af97020f2f 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -430,6 +430,39 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 		};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
@@ -510,6 +543,25 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		}
 	}
 
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
 	if (dev->dual_ws) {
 		event_dev->enqueue = cn9k_sso_hws_dual_enq;
 		event_dev->enqueue_burst = cn9k_sso_hws_dual_enq_burst;
@@ -618,8 +670,37 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 						  NIX_RX_OFFLOAD_RSS_F)];
 			}
 		}
+
+		if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+			/* [TSMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM]
+			 * [L3_L4_CSUM] */
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq_seg
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		} else {
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		}
 	}
 
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 	rte_mb();
 }
 
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index c01c00e1da..5aa053c586 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn9k_ethdev.h"
 #include "cn9k_rx.h"
+#include "cn9k_tx.h"
 
 /* SSO Operations */
 
@@ -416,4 +417,90 @@ NIX_RX_FASTPATH_MODES
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn9k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline void
+cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
+			 uint64_t *cmd, const uint32_t flags)
+{
+	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
+}
+
+static __rte_always_inline uint16_t
+cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
+		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		      const uint32_t flags)
+{
+	struct rte_mbuf *m = ev->mbuf;
+	const struct cn9k_eth_txq *txq;
+	uint16_t ref_cnt = m->refcnt;
+
+	/* Perform header writes before barrier for TSO */
+	cn9k_nix_xmit_prepare_tso(m, flags);
+	/* Commit any changes to the packet here, since no further changes
+	 * will be made to the mbuf when fast free is set. If fast free
+	 * is not set, both cn9k_nix_prepare_mseg() and
+	 * cn9k_nix_xmit_prepare() have a barrier after the refcnt update.
+	 */
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+		rte_io_wmb();
+	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
+	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		if (!CNXK_TT_FROM_EVENT(ev->event)) {
+			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
+						       txq->io_addr, segdw);
+		} else {
+			cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr,
+					       segdw);
+		}
+	} else {
+		if (!CNXK_TT_FROM_EVENT(ev->event)) {
+			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
+						  txq->io_addr, flags);
+		} else {
+			cn9k_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr,
+					  flags);
+		}
+	}
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(base + SSOW_LF_GWS_TAG,
+				 base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
diff --git a/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c b/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
new file mode 100644
index 0000000000..92e2981f02
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c b/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
new file mode 100644
index 0000000000..dfb574cf95
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_tx_enq.c b/drivers/event/cnxk/cn9k_worker_tx_enq.c
new file mode 100644
index 0000000000..3df649c0c8
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c b/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c
new file mode 100644
index 0000000000..0efe29113e
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index c5c1c0ee8e..13e0634e86 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -17,11 +17,17 @@ sources = files(
         'cn9k_worker_dual_deq.c',
         'cn9k_worker_dual_deq_burst.c',
         'cn9k_worker_dual_deq_tmo.c',
+        'cn9k_worker_tx_enq.c',
+        'cn9k_worker_tx_enq_seg.c',
+        'cn9k_worker_dual_tx_enq.c',
+        'cn9k_worker_dual_tx_enq_seg.c',
         'cn10k_eventdev.c',
         'cn10k_worker.c',
         'cn10k_worker_deq.c',
         'cn10k_worker_deq_burst.c',
         'cn10k_worker_deq_tmo.c',
+        'cn10k_worker_tx_enq.c',
+        'cn10k_worker_tx_enq_seg.c',
         'cnxk_eventdev.c',
         'cnxk_eventdev_adptr.c',
         'cnxk_eventdev_selftest.c',
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 11/13] event/cnxk: add Rx adapter vector support
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (8 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 10/13] event/cnxk: add Tx adapter fastpath ops pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 12/13] event/cnxk: add Rx event vector fastpath pbhagavatula
                     ` (2 subsequent siblings)
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add event vector support for the cnxk event Rx adapter: add control
path APIs to get the vector limits and to configure event
vectorization on a given Rx queue.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
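 A minimal control-path sketch (not part of this patch) of the new
 limits/config ops; evdev_id, adapter_id, eth_port_id and vec_pool are
 placeholders, with vec_pool assumed to come from
 rte_event_vector_pool_create(). The Rx queue is also expected to be
 added with the RTE_EVENT_ETH_RX_ADAPTER_QUEUE_EVENT_VECTOR flag set in
 its queue_conf:

 #include <string.h>
 #include <rte_event_eth_rx_adapter.h>

 static int
 app_enable_rx_vectors(uint8_t evdev_id, uint8_t adapter_id,
 		      uint16_t eth_port_id, struct rte_mempool *vec_pool)
 {
 	struct rte_event_eth_rx_adapter_vector_limits limits;
 	struct rte_event_eth_rx_adapter_event_vector_config vec_conf;
 	int rc;

 	rc = rte_event_eth_rx_adapter_vector_limits_get(evdev_id,
 							eth_port_id,
 							&limits);
 	if (rc)
 		return rc;

 	memset(&vec_conf, 0, sizeof(vec_conf));
 	/* limits.log2_sz is set by this PMD, so use a power-of-two size. */
 	vec_conf.vector_sz = limits.max_sz;
 	vec_conf.vector_timeout_ns = limits.min_timeout_ns;
 	vec_conf.vector_mp = vec_pool;

 	/* rx_queue_id of -1 applies the config to all Rx queues. */
 	return rte_event_eth_rx_adapter_queue_event_vector_config(
 		adapter_id, eth_port_id, -1, &vec_conf);
 }
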
 doc/guides/eventdevs/cnxk.rst            |   2 +
 drivers/event/cnxk/cn10k_eventdev.c      | 106 ++++++++++++++++++++++-
 drivers/event/cnxk/cnxk_eventdev.h       |   2 +
 drivers/event/cnxk/cnxk_eventdev_adptr.c |  25 ++++++
 drivers/net/cnxk/cnxk_ethdev.h           |   2 +-
 5 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 6fdccc2ab4..0297cd3d5f 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -45,6 +45,8 @@ Features of the OCTEON cnxk SSO PMD are:
 - Lockfree Tx from event eth Tx adapter using ``DEV_TX_OFFLOAD_MT_LOCKFREE``
   capability while maintaining receive packet order.
 - Full Rx/Tx offload support defined through ethdev queue configuration.
+- HW managed event vectorization on CN10K for packets enqueued from ethdev to
+  eventdev, configurable per Rx queue in the Rx adapter.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index e462f770c5..e85fa4785d 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -610,7 +610,8 @@ cn10k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
 	else
 		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
 			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
-			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_EVENT_VECTOR;
 
 	return 0;
 }
@@ -671,6 +672,105 @@ cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn10k_sso_rx_adapter_vector_limits(
+	const struct rte_eventdev *dev, const struct rte_eth_dev *eth_dev,
+	struct rte_event_eth_rx_adapter_vector_limits *limits)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev;
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (ret)
+		return -ENOTSUP;
+
+	cnxk_eth_dev = cnxk_eth_pmd_priv(eth_dev);
+	limits->log2_sz = true;
+	limits->min_sz = 1 << ROC_NIX_VWQE_MIN_SIZE_LOG2;
+	limits->max_sz = 1 << ROC_NIX_VWQE_MAX_SIZE_LOG2;
+	limits->min_timeout_ns =
+		(roc_nix_get_vwqe_interval(&cnxk_eth_dev->nix) + 1) * 100;
+	limits->max_timeout_ns = BITMASK_ULL(8, 0) * limits->min_timeout_ns;
+
+	return 0;
+}
+
+static int
+cnxk_sso_rx_adapter_vwqe_enable(struct cnxk_eth_dev *cnxk_eth_dev,
+				uint16_t port_id, uint16_t rq_id, uint16_t sz,
+				uint64_t tmo_ns, struct rte_mempool *vmp)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+
+	if (!rq->sso_ena)
+		return -EINVAL;
+	if (rq->flow_tag_width == 0)
+		return -EINVAL;
+
+	rq->vwqe_ena = 1;
+	rq->vwqe_first_skip = 0;
+	rq->vwqe_aura_handle = roc_npa_aura_handle_to_aura(vmp->pool_id);
+	rq->vwqe_max_sz_exp = rte_log2_u32(sz);
+	rq->vwqe_wait_tmo =
+		tmo_ns /
+		((roc_nix_get_vwqe_interval(&cnxk_eth_dev->nix) + 1) * 100);
+	rq->tag_mask = (port_id & 0xF) << 20;
+	rq->tag_mask |=
+		(((port_id >> 4) & 0xF) | (RTE_EVENT_TYPE_ETHDEV_VECTOR << 4))
+		<< 24;
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+static int
+cn10k_sso_rx_adapter_vector_config(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_event_vector_config *config)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev;
+	struct cnxk_sso_evdev *dev;
+	int i, rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		return -EINVAL;
+
+	dev = cnxk_sso_pmd_priv(event_dev);
+	cnxk_eth_dev = cnxk_eth_pmd_priv(eth_dev);
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			cnxk_sso_updt_xae_cnt(dev, config->vector_mp,
+					      RTE_EVENT_TYPE_ETHDEV_VECTOR);
+			rc = cnxk_sso_xae_reconfigure(
+				(struct rte_eventdev *)(uintptr_t)event_dev);
+			rc = cnxk_sso_rx_adapter_vwqe_enable(
+				cnxk_eth_dev, eth_dev->data->port_id, i,
+				config->vector_sz, config->vector_timeout_ns,
+				config->vector_mp);
+			if (rc)
+				return -EINVAL;
+		}
+	} else {
+
+		cnxk_sso_updt_xae_cnt(dev, config->vector_mp,
+				      RTE_EVENT_TYPE_ETHDEV_VECTOR);
+		rc = cnxk_sso_xae_reconfigure(
+			(struct rte_eventdev *)(uintptr_t)event_dev);
+		rc = cnxk_sso_rx_adapter_vwqe_enable(
+			cnxk_eth_dev, eth_dev->data->port_id, rx_queue_id,
+			config->vector_sz, config->vector_timeout_ns,
+			config->vector_mp);
+		if (rc)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int
 cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
 			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
@@ -739,6 +839,10 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_rx_adapter_vector_limits_get = cn10k_sso_rx_adapter_vector_limits,
+	.eth_rx_adapter_event_vector_config =
+		cn10k_sso_rx_adapter_vector_config,
+
 	.eth_tx_adapter_caps_get = cn10k_sso_tx_adapter_caps_get,
 	.eth_tx_adapter_queue_add = cn10k_sso_tx_adapter_queue_add,
 	.eth_tx_adapter_queue_del = cn10k_sso_tx_adapter_queue_del,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 458fdc8d92..3783e0c95b 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -96,6 +96,8 @@ struct cnxk_sso_evdev {
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
+	uint16_t vec_pool_cnt;
+	uint64_t *vec_pools;
 	/* Dev args */
 	uint32_t xae_cnt;
 	uint8_t qos_queue_cnt;
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 548d7b81ce..c4c4f5a7f4 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -40,6 +40,31 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		dev->adptr_xae_cnt += rxq->qconf.mp->size;
 		break;
 	}
+	case RTE_EVENT_TYPE_ETHDEV_VECTOR: {
+		struct rte_mempool *mp = data;
+		uint64_t *old_ptr;
+
+		for (i = 0; i < dev->vec_pool_cnt; i++) {
+			if ((uint64_t)mp == dev->vec_pools[i])
+				return;
+		}
+
+		dev->vec_pool_cnt++;
+		old_ptr = dev->vec_pools;
+		dev->vec_pools =
+			rte_realloc(dev->vec_pools,
+				    sizeof(uint64_t) * dev->vec_pool_cnt, 0);
+		if (dev->vec_pools == NULL) {
+			dev->adptr_xae_cnt += mp->size;
+			dev->vec_pools = old_ptr;
+			dev->vec_pool_cnt--;
+			return;
+		}
+		dev->vec_pools[dev->vec_pool_cnt - 1] = (uint64_t)mp;
+
+		dev->adptr_xae_cnt += mp->size;
+		break;
+	}
 	case RTE_EVENT_TYPE_TIMER: {
 		struct cnxk_tim_ring *timr = data;
 		uint16_t *old_ring_ptr;
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 4eead03905..2528b3cdaa 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -238,7 +238,7 @@ struct cnxk_eth_txq_sp {
 } __plt_cache_aligned;
 
 static inline struct cnxk_eth_dev *
-cnxk_eth_pmd_priv(struct rte_eth_dev *eth_dev)
+cnxk_eth_pmd_priv(const struct rte_eth_dev *eth_dev)
 {
 	return eth_dev->data->dev_private;
 }
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 12/13] event/cnxk: add Rx event vector fastpath
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (9 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 11/13] event/cnxk: add Rx adapter vector support pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 13/13] event/cnxk: add Tx " pbhagavatula
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx event vector fastpath to convert HW-defined metadata into
rte_mbuf and rte_event_vector structures.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
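 A worker-side sketch (not part of this patch) of consuming the vector
 events this fastpath produces; evdev_id and ev_port are placeholders
 and app_process_pkt() is a hypothetical application helper:

 static void
 app_worker(uint8_t evdev_id, uint8_t ev_port)
 {
 	struct rte_event ev;

 	while (rte_event_dequeue_burst(evdev_id, ev_port, &ev, 1, 0)) {
 		if (ev.event_type == RTE_EVENT_TYPE_ETHDEV_VECTOR) {
 			struct rte_event_vector *vec = ev.vec;
 			uint16_t i;

 			/* mbufs[] was filled in by cn10k_process_vwqe(). */
 			for (i = 0; i < vec->nb_elem; i++)
 				app_process_pkt(vec->mbufs[i]);
 			/* Return the vector to its mempool once consumed. */
 			rte_mempool_put(rte_mempool_from_obj(vec), vec);
 		}
 	}
 }
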
 doc/guides/rel_notes/release_21_08.rst |   1 +
 drivers/event/cnxk/cn10k_worker.h      |  56 +++++++
 drivers/net/cnxk/cn10k_rx.h            | 200 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx_vec.c        |   2 +-
 drivers/net/cnxk/cn10k_rx_vec_mseg.c   |   5 +-
 5 files changed, 179 insertions(+), 85 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 80ff93269c..11ccc9bcb5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -64,6 +64,7 @@ New Features
 
   * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
     is net/cnxk.
+  * Added event vectorization support for the Rx adapter.
 
 
 Removed Items
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 3c90c85009..7a48a6b17d 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -5,6 +5,8 @@
 #ifndef __CN10K_WORKER_H__
 #define __CN10K_WORKER_H__
 
+#include <rte_vect.h>
+
 #include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
@@ -101,6 +103,49 @@ cn10k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
 			      mbuf_init | ((uint64_t)port_id) << 48, flags);
 }
 
+static __rte_always_inline void
+cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
+		   void *lookup_mem, void *tstamp)
+{
+	uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+			     (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
+	struct rte_event_vector *vec;
+	uint16_t nb_mbufs, non_vec;
+	uint64_t **wqe;
+
+	mbuf_init |= ((uint64_t)port_id) << 48;
+	vec = (struct rte_event_vector *)vwqe;
+	wqe = vec->u64s;
+
+	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
+	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+					      flags | NIX_RX_VWQE_F, lookup_mem,
+					      tstamp);
+	wqe += nb_mbufs;
+	non_vec = vec->nb_elem - nb_mbufs;
+
+	while (non_vec) {
+		struct nix_cqe_hdr_s *cqe = (struct nix_cqe_hdr_s *)wqe[0];
+		struct rte_mbuf *mbuf;
+		uint64_t tstamp_ptr;
+
+		mbuf = (struct rte_mbuf *)((char *)cqe -
+					   sizeof(struct rte_mbuf));
+		cn10k_nix_cqe_to_mbuf(cqe, cqe->tag, mbuf, lookup_mem,
+				      mbuf_init, flags);
+		/* Extract the tstamp if PTP is enabled */
+		tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)cqe) +
+					   CNXK_SSO_WQE_SG_PTR);
+		cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+					flags & NIX_RX_OFFLOAD_TSTAMP_F,
+					flags & NIX_RX_MULTI_SEG_F,
+					(uint64_t *)tstamp_ptr);
+		wqe[0] = (uint64_t *)mbuf;
+		non_vec--;
+		wqe++;
+	}
+}
+
 static __rte_always_inline uint16_t
 cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		       const uint32_t flags, void *lookup_mem)
@@ -152,6 +197,17 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 						flags & NIX_RX_MULTI_SEG_F,
 						(uint64_t *)tstamp_ptr);
 			gw.u64[1] = mbuf;
+		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+			   RTE_EVENT_TYPE_ETHDEV_VECTOR) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			__uint128_t vwqe_hdr = *(__uint128_t *)gw.u64[1];
+
+			vwqe_hdr = ((vwqe_hdr >> 64) & 0xFFF) | BIT_ULL(31) |
+				   ((vwqe_hdr & 0xFFFF) << 48) |
+				   ((uint64_t)port << 32);
+			*(uint64_t *)gw.u64[1] = (uint64_t)vwqe_hdr;
+			cn10k_process_vwqe(gw.u64[1], port, flags, lookup_mem,
+					   ws->tstamp);
 		}
 	}
 
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index abdd58e888..075e1124ed 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -21,6 +21,7 @@
  * Defining it from backwards to denote its been
  * not used as offload flags to pick function
  */
+#define NIX_RX_VWQE_F	   BIT(14)
 #define NIX_RX_MULTI_SEG_F BIT(15)
 
 #define CNXK_NIX_CQ_ENTRY_SZ 128
@@ -28,6 +29,11 @@
 #define CQE_CAST(x)	     ((struct nix_cqe_hdr_s *)(x))
 #define CQE_SZ(x)	     ((x) * CNXK_NIX_CQ_ENTRY_SZ)
 
+#define CQE_PTR_OFF(b, i, o, f)                                                \
+	(((f) & NIX_RX_VWQE_F) ?                                               \
+		       (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \
+		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+
 union mbuf_initializer {
 	struct {
 		uint16_t data_off;
@@ -317,61 +323,87 @@ nix_qinq_update(const uint64_t w2, uint64_t ol_flags, struct rte_mbuf *mbuf)
 }
 
 static __rte_always_inline uint16_t
-cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
-			   uint16_t pkts, const uint16_t flags)
+cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
+			   const uint16_t flags, void *lookup_mem,
+			   struct cnxk_timesync_info *tstamp)
 {
-	struct cn10k_eth_rxq *rxq = rx_queue;
-	uint16_t packets = 0;
+	struct cn10k_eth_rxq *rxq = args;
+	const uint64_t mbuf_initializer = (flags & NIX_RX_VWQE_F) ?
+							*(uint64_t *)args :
+							rxq->mbuf_initializer;
+	const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
+						  vdupq_n_u64(0x80ULL) :
+						  vdupq_n_u64(rxq->data_off);
+	const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
+	const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
+	const uintptr_t desc = flags & NIX_RX_VWQE_F ? 0 : rxq->desc;
 	uint64x2_t cq0_w8, cq1_w8, cq2_w8, cq3_w8, mbuf01, mbuf23;
-	const uint64_t mbuf_initializer = rxq->mbuf_initializer;
-	const uint64x2_t data_off = vdupq_n_u64(rxq->data_off);
 	uint64_t ol_flags0, ol_flags1, ol_flags2, ol_flags3;
 	uint64x2_t rearm0 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm1 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm2 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm3 = vdupq_n_u64(mbuf_initializer);
 	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3;
-	const uint16_t *lookup_mem = rxq->lookup_mem;
-	const uint32_t qmask = rxq->qmask;
-	const uint64_t wdata = rxq->wdata;
-	const uintptr_t desc = rxq->desc;
 	uint8x16_t f0, f1, f2, f3;
-	uint32_t head = rxq->head;
+	uint16_t packets = 0;
 	uint16_t pkts_left;
-
-	pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
-	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
-
-	/* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
-	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	uint32_t head;
+	uintptr_t cq0;
+
+	if (!(flags & NIX_RX_VWQE_F)) {
+		lookup_mem = rxq->lookup_mem;
+		head = rxq->head;
+
+		pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
+		pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
+		/* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F)
+			tstamp = rxq->tstamp;
+	} else {
+		RTE_SET_USED(head);
+	}
 
 	while (packets < pkts) {
-		/* Exit loop if head is about to wrap and become unaligned */
-		if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
-		    NIX_DESCS_PER_LOOP) {
-			pkts_left += (pkts - packets);
-			break;
-		}
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Exit loop if head is about to wrap and become
+			 * unaligned.
+			 */
+			if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
+			    NIX_DESCS_PER_LOOP) {
+				pkts_left += (pkts - packets);
+				break;
+			}
 
-		const uintptr_t cq0 = desc + CQE_SZ(head);
+			cq0 = desc + CQE_SZ(head);
+		} else {
+			cq0 = (uintptr_t)&mbufs[packets];
+		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(8)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(9)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(10)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(11)));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
-		cq0_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(0) + 64));
-		cq1_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(1) + 64));
-		cq2_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(2) + 64));
-		cq3_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(3) + 64));
-
-		/* Extract mbuf from NIX_RX_SG_S */
-		mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
-		mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
-		mbuf01 = vqsubq_u64(mbuf01, data_off);
-		mbuf23 = vqsubq_u64(mbuf23, data_off);
+		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+		cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+		cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Extract mbuf from NIX_RX_SG_S */
+			mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
+			mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
+			mbuf01 = vqsubq_u64(mbuf01, data_off);
+			mbuf23 = vqsubq_u64(mbuf23, data_off);
+		} else {
+			mbuf01 =
+				vsubq_u64(vld1q_u64((uint64_t *)cq0), data_off);
+			mbuf23 = vsubq_u64(vld1q_u64((uint64_t *)(cq0 + 16)),
+					   data_off);
+		}
 
 		/* Move mbufs to scalar registers for future use */
 		mbuf0 = (struct rte_mbuf *)vgetq_lane_u64(mbuf01, 0);
@@ -395,14 +427,14 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
 
 		/* Load CQE word0 and word 1 */
-		uint64_t cq0_w0 = ((uint64_t *)(cq0 + CQE_SZ(0)))[0];
-		uint64_t cq0_w1 = ((uint64_t *)(cq0 + CQE_SZ(0)))[1];
-		uint64_t cq1_w0 = ((uint64_t *)(cq0 + CQE_SZ(1)))[0];
-		uint64_t cq1_w1 = ((uint64_t *)(cq0 + CQE_SZ(1)))[1];
-		uint64_t cq2_w0 = ((uint64_t *)(cq0 + CQE_SZ(2)))[0];
-		uint64_t cq2_w1 = ((uint64_t *)(cq0 + CQE_SZ(2)))[1];
-		uint64_t cq3_w0 = ((uint64_t *)(cq0 + CQE_SZ(3)))[0];
-		uint64_t cq3_w1 = ((uint64_t *)(cq0 + CQE_SZ(3)))[1];
+		const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
+		const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 8, flags);
+		const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
+		const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 8, flags);
+		const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
+		const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 8, flags);
+		const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
+		const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 8, flags);
 
 		if (flags & NIX_RX_OFFLOAD_RSS_F) {
 			/* Fill rss in the rx_descriptor_fields1 */
@@ -459,17 +491,17 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		if (flags & NIX_RX_OFFLOAD_MARK_UPDATE_F) {
 			ol_flags0 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(0) + 38), ol_flags0,
-				mbuf0);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 0, 38, flags),
+				ol_flags0, mbuf0);
 			ol_flags1 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(1) + 38), ol_flags1,
-				mbuf1);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 1, 38, flags),
+				ol_flags1, mbuf1);
 			ol_flags2 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(2) + 38), ol_flags2,
-				mbuf2);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 2, 38, flags),
+				ol_flags2, mbuf2);
 			ol_flags3 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(3) + 38), ol_flags3,
-				mbuf3);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 3, 38, flags),
+				ol_flags3, mbuf3);
 		}
 
 		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
@@ -488,7 +520,7 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 						  RTE_PTYPE_L2_ETHER_TIMESYNC};
 			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
 						PKT_RX_IEEE1588_TMST |
-						rxq->tstamp->rx_tstamp_dynflag;
+						tstamp->rx_tstamp_dynflag;
 			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
 			uint64x2_t ts01, ts23, mask;
 			uint64_t ts[4];
@@ -526,14 +558,10 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			ts[3] = vgetq_lane_u64(ts23, 1);
 
 			/* Store timestamp into dynfield. */
-			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
-				ts[0];
-			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
-				ts[1];
-			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
-				ts[2];
-			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
-				ts[3];
+			*cnxk_nix_timestamp_dynfield(mbuf0, tstamp) = ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, tstamp) = ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, tstamp) = ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, tstamp) = ts[3];
 
 			/* Generate ptype mask to filter L2 ether timesync */
 			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
@@ -559,9 +587,8 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				/* Update Rxq timestamp with the latest
 				 * timestamp.
 				 */
-				rxq->tstamp->rx_ready = 1;
-				rxq->tstamp->rx_tstamp =
-					ts[31 - __builtin_clz(res)];
+				tstamp->rx_ready = 1;
+				tstamp->rx_tstamp = ts[31 - __builtin_clz(res)];
 			}
 		}
 
@@ -584,25 +611,25 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
 		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+		vst1q_u64((uint64_t *)&mbufs[packets], mbuf01);
+		vst1q_u64((uint64_t *)&mbufs[packets + 2], mbuf23);
 
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 0, 8, flags)),
+					    mbuf0, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 1, 8, flags)),
+					    mbuf1, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 2, 8, flags)),
+					    mbuf2, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 3, 8, flags)),
+					    mbuf3, mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
@@ -623,12 +650,18 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		__mempool_check_cookies(mbuf2->pool, (void **)&mbuf2, 1, 1);
 		__mempool_check_cookies(mbuf3->pool, (void **)&mbuf3, 1, 1);
 
-		/* Advance head pointer and packets */
-		head += NIX_DESCS_PER_LOOP;
-		head &= qmask;
 		packets += NIX_DESCS_PER_LOOP;
+
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Advance head pointer and packets */
+			head += NIX_DESCS_PER_LOOP;
+			head &= qmask;
+		}
 	}
 
+	if (flags & NIX_RX_VWQE_F)
+		return packets;
+
 	rxq->head = head;
 	rxq->available -= packets;
 
@@ -637,8 +670,8 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 	plt_write64((rxq->wdata | packets), rxq->cq_door);
 
 	if (unlikely(pkts_left))
-		packets += cn10k_nix_recv_pkts(rx_queue, &rx_pkts[packets],
-					       pkts_left, flags);
+		packets += cn10k_nix_recv_pkts(args, &mbufs[packets], pkts_left,
+					       flags);
 
 	return packets;
 }
@@ -647,12 +680,15 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 static inline uint16_t
 cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
-			   uint16_t pkts, const uint16_t flags)
+			   uint16_t pkts, const uint16_t flags,
+			   void *lookup_mem, void *tstamp)
 {
+	RTE_SET_USED(lookup_mem);
 	RTE_SET_USED(rx_queue);
 	RTE_SET_USED(rx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(flags);
+	RTE_SET_USED(tstamp);
 
 	return 0;
 }
diff --git a/drivers/net/cnxk/cn10k_rx_vec.c b/drivers/net/cnxk/cn10k_rx_vec.c
index 93528a44f9..166735ad59 100644
--- a/drivers/net/cnxk/cn10k_rx_vec.c
+++ b/drivers/net/cnxk/cn10k_rx_vec.c
@@ -12,7 +12,7 @@
 					       uint16_t pkts)                  \
 	{                                                                      \
 		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
-						  (flags));		       \
+						  (flags), NULL, NULL);        \
 	}
 
 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
index 04d1e46c82..1f44dddddd 100644
--- a/drivers/net/cnxk/cn10k_rx_vec_mseg.c
+++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
@@ -9,8 +9,9 @@
 	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
 	{                                                                      \
-		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
-					  (flags) | NIX_RX_MULTI_SEG_F);       \
+		return cn10k_nix_recv_pkts_vector(                             \
+			rx_queue, rx_pkts, pkts, (flags) | NIX_RX_MULTI_SEG_F, \
+			NULL, NULL);                                           \
 	}
 
 NIX_RX_FASTPATH_MODES
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v2 13/13] event/cnxk: add Tx event vector fastpath
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (10 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 12/13] event/cnxk: add Rx event vector fastpath pbhagavatula
@ 2021-06-19 11:01   ` pbhagavatula
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  12 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-19 11:01 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Tx event vector fastpath and integrate the event vector Tx
routine into the Tx burst path.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
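 A minimal sketch (not part of this patch) of handing a vector of mbufs
 to the Tx adapter; evdev_id, ev_port, out_port and out_txq are
 placeholders and ev is assumed to already hold a valid rte_event_vector:

 static void
 app_tx_vector(uint8_t evdev_id, uint8_t ev_port, struct rte_event *ev,
 	      uint16_t out_port, uint16_t out_txq)
 {
 	struct rte_event_vector *vec = ev->vec;

 	/* With attr_valid set, all mbufs share one port/queue and the
 	 * PMD can take the cn10k_nix_xmit_pkts_vector() path directly;
 	 * otherwise cn10k_sso_vwqe_split_tx() resolves them per mbuf.
 	 */
 	vec->attr_valid = 1;
 	vec->port = out_port;
 	vec->queue = out_txq;

 	ev->event_type = RTE_EVENT_TYPE_VECTOR | RTE_EVENT_TYPE_CPU;
 	ev->sched_type = RTE_SCHED_TYPE_ATOMIC;
 	rte_event_eth_tx_adapter_enqueue(evdev_id, ev_port, ev, 1, 0);
 }
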
 doc/guides/eventdevs/cnxk.rst          |   1 +
 doc/guides/rel_notes/release_21_08.rst |   2 +-
 drivers/common/cnxk/roc_sso.h          |  23 ++++++
 drivers/event/cnxk/cn10k_eventdev.c    |   3 +-
 drivers/event/cnxk/cn10k_worker.h      | 104 +++++++++++++++++++++++--
 drivers/event/cnxk/cn9k_worker.h       |   4 +-
 drivers/event/cnxk/cnxk_worker.h       |  22 ------
 drivers/net/cnxk/cn10k_tx.c            |   2 +-
 drivers/net/cnxk/cn10k_tx.h            |  52 +++++++++----
 drivers/net/cnxk/cn10k_tx_mseg.c       |   3 +-
 drivers/net/cnxk/cn10k_tx_vec.c        |   2 +-
 drivers/net/cnxk/cn10k_tx_vec_mseg.c   |   2 +-
 12 files changed, 167 insertions(+), 53 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 0297cd3d5f..53560d3830 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -47,6 +47,7 @@ Features of the OCTEON cnxk SSO PMD are:
 - Full Rx/Tx offload support defined through ethdev queue configuration.
 - HW managed event vectorization on CN10K for packets enqueued from ethdev to
   eventdev, configurable per Rx queue in the Rx adapter.
+- Event vector transmission via Tx adapter.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 11ccc9bcb5..9e49cb27d7 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -64,7 +64,7 @@ New Features
 
   * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
     is net/cnxk.
-  * Added event vectorization support for the Rx adapter.
+  * Added event vectorization support for the Rx/Tx adapter.
 
 
 Removed Items
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index a6030e7d8a..316c6ccd59 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -44,6 +44,29 @@ struct roc_sso {
 	uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
 } __plt_cache_aligned;
 
+static __rte_always_inline void
+roc_sso_hws_head_wait(uintptr_t tag_op)
+{
+#ifdef RTE_ARCH_ARM64
+	uint64_t tag;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldr %[tag], [%[tag_op]]	\n"
+		     "		tbnz %[tag], 35, done%=		\n"
+		     "		sevl				\n"
+		     "rty%=:	wfe				\n"
+		     "		ldr %[tag], [%[tag_op]]	\n"
+		     "		tbz %[tag], 35, rty%=		\n"
+		     "done%=:					\n"
+		     : [tag] "=&r"(tag)
+		     : [tag_op] "r"(tag_op));
+#else
+	/* Wait for the HEAD bit (35) to be set */
+	while (!(plt_read64(tag_op) & BIT_ULL(35)))
+		;
+#endif
+}
+
 /* SSO device initialization */
 int __roc_api roc_sso_dev_init(struct roc_sso *roc_sso);
 int __roc_api roc_sso_dev_fini(struct roc_sso *roc_sso);
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index e85fa4785d..6f37c5bd23 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -782,7 +782,8 @@ cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
 	if (ret)
 		*caps = 0;
 	else
-		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR;
 
 	return 0;
 }
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 7a48a6b17d..9cc0992063 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -308,29 +308,120 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 NIX_RX_FASTPATH_MODES
 #undef R
 
-static __rte_always_inline const struct cn10k_eth_txq *
+static __rte_always_inline struct cn10k_eth_txq *
 cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
 			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
 {
-	return (const struct cn10k_eth_txq *)
+	return (struct cn10k_eth_txq *)
 		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
 }
 
+static __rte_always_inline void
+cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
+			uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
+			uint8_t sched_type, uintptr_t base,
+			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+			const uint32_t flags)
+{
+	uint16_t port[4], queue[4];
+	struct cn10k_eth_txq *txq;
+	uint16_t i, j;
+	uintptr_t pa;
+
+	for (i = 0; i < nb_mbufs; i += 4) {
+		port[0] = mbufs[i]->port;
+		port[1] = mbufs[i + 1]->port;
+		port[2] = mbufs[i + 2]->port;
+		port[3] = mbufs[i + 3]->port;
+
+		queue[0] = rte_event_eth_tx_adapter_txq_get(mbufs[i]);
+		queue[1] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 1]);
+		queue[2] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 2]);
+		queue[3] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 3]);
+
+		if (((port[0] ^ port[1]) | (port[1] ^ port[2]) | (port[2] ^ port[3])) ||
+		    ((queue[0] ^ queue[1]) | (queue[1] ^ queue[2]) | (queue[2] ^ queue[3]))) {
+
+			for (j = 0; j < 4; j++) {
+				struct rte_mbuf *m = mbufs[i + j];
+
+				txq = (struct cn10k_eth_txq *)
+					txq_data[port[j]][queue[j]];
+				cn10k_nix_tx_skeleton(txq, cmd, flags);
+				/* Perform header writes before barrier
+				 * for TSO
+				 */
+				if (flags & NIX_TX_OFFLOAD_TSO_F)
+					cn10k_nix_xmit_prepare_tso(m, flags);
+
+				cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags,
+						       txq->lso_tun_fmt);
+				if (flags & NIX_TX_MULTI_SEG_F) {
+					const uint16_t segdw =
+						cn10k_nix_prepare_mseg(
+							m, (uint64_t *)lmt_addr,
+							flags);
+					pa = txq->io_addr | ((segdw - 1) << 4);
+				} else {
+					pa = txq->io_addr |
+					     (cn10k_nix_tx_ext_subs(flags) + 1)
+						     << 4;
+				}
+				if (!sched_type)
+					roc_sso_hws_head_wait(base +
+							      SSOW_LF_GWS_TAG);
+
+				roc_lmt_submit_steorl(lmt_id, pa);
+			}
+		} else {
+			txq = (struct cn10k_eth_txq *)
+				txq_data[port[0]][queue[0]];
+			cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd, base
+					+ SSOW_LF_GWS_TAG,
+						   flags | NIX_TX_VWQE_F);
+		}
+	}
+}
+
 static __rte_always_inline uint16_t
 cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		       uint64_t *cmd,
 		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
 		       const uint32_t flags)
 {
-	const struct cn10k_eth_txq *txq;
-	struct rte_mbuf *m = ev->mbuf;
-	uint16_t ref_cnt = m->refcnt;
+	struct cn10k_eth_txq *txq;
+	struct rte_mbuf *m;
 	uintptr_t lmt_addr;
+	uint16_t ref_cnt;
 	uint16_t lmt_id;
 	uintptr_t pa;
 
 	lmt_addr = ws->lmt_base;
 	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+	if (ev->event_type & RTE_EVENT_TYPE_VECTOR) {
+		struct rte_mbuf **mbufs = ev->vec->mbufs;
+		uint64_t meta = *(uint64_t *)ev->vec;
+
+		if (meta & BIT(31)) {
+			txq = (struct cn10k_eth_txq *)
+				txq_data[(meta >> 32) & 0xFFFF][meta >> 48];
+
+			cn10k_nix_xmit_pkts_vector(
+				txq, mbufs, meta & 0xFFFF, cmd,
+				ws->tx_base + SSOW_LF_GWS_TAG,
+				flags | NIX_TX_VWQE_F);
+		} else {
+			cn10k_sso_vwqe_split_tx(
+				mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
+				ev->sched_type, ws->tx_base, txq_data, flags);
+		}
+		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+		return (meta & 0xFFFF);
+	}
+
+	m = ev->mbuf;
+	ref_cnt = m->refcnt;
 	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
 	cn10k_nix_tx_skeleton(txq, cmd, flags);
 	/* Perform header writes before barrier for TSO */
@@ -346,7 +437,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
 	}
 	if (!ev->sched_type)
-		cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
 
 	roc_lmt_submit_steorl(lmt_id, pa);
 
@@ -357,7 +448,6 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 
 	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
 				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
-
 	return 1;
 }
 
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 5aa053c586..ef1e83741a 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -458,7 +458,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
-			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
 						       txq->io_addr, segdw);
@@ -469,7 +469,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	} else {
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
-			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
 						  txq->io_addr, flags);
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 4eb46ae162..945132b748 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -75,27 +75,5 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 #endif
 }
 
-static __rte_always_inline void
-cnxk_sso_hws_head_wait(uintptr_t tag_op)
-{
-#ifdef RTE_ARCH_ARM64
-	uint64_t swtp;
-
-	asm volatile(PLT_CPU_FEATURE_PREAMBLE
-		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbz %[swtb], 35, done%=		\n"
-		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
-		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbnz %[swtb], 35, rty%=		\n"
-		     "done%=:					\n"
-		     : [swtb] "=&r"(swtp)
-		     : [swtp_loc] "r"(tag_op));
-#else
-	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (plt_read64(tag_op) & BIT_ULL(35))
-		;
-#endif
-}
 
 #endif
diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 1f30bab59a..0e1276c60b 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -16,7 +16,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd,       \
-					   flags);			       \
+					   0, flags);			       \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index b25b20dcb2..e8a99808cc 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -18,6 +18,7 @@
  * Defining it from backwards to denote its been
  * not used as offload flags to pick function
  */
+#define NIX_TX_VWQE_F	   BIT(14)
 #define NIX_TX_MULTI_SEG_F BIT(15)
 
 #define NIX_TX_NEED_SEND_HDR_W1                                                \
@@ -519,7 +520,7 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
-		    uint64_t *cmd, const uint16_t flags)
+		    uint64_t *cmd, uintptr_t base, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	const rte_iova_t io_addr = txq->io_addr;
@@ -528,14 +529,15 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 	uint64_t lso_tun_fmt;
 	uint64_t data;
 
-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	}
 
 	/* Get cmd skeleton */
 	cn10k_nix_tx_skeleton(txq, cmd, flags);
 
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
-
 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
 
@@ -558,6 +560,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
 	}
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	/* Trigger LMTST */
 	if (burst > 16) {
 		data = cn10k_nix_tx_steor_data(flags);
@@ -604,7 +609,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
-			 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			 uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			 const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
@@ -652,6 +658,9 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 		shft += 3;
 	}
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	data0 = (uint64_t)data128;
 	data1 = (uint64_t)(data128 >> 64);
 	/* Make data0 similar to data1 */
@@ -984,7 +993,8 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			   const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -1013,13 +1023,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint64_t data[2];
 	} wd;
 
-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
-
-	scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
-	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	} else {
+		scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	}
 
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
 		for (i = 0; i < pkts; i++)
@@ -1972,6 +1986,9 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (flags & NIX_TX_MULTI_SEG_F)
 		wd.data[0] >>= 16;
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	/* Trigger LMTST */
 	if (lnum > 16) {
 		if (!(flags & NIX_TX_MULTI_SEG_F))
@@ -2028,10 +2045,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (unlikely(scalar)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
 			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
-							 scalar, cmd, flags);
+							 scalar, cmd, base,
+							 flags);
 		else
 			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
-						    cmd, flags);
+						    cmd, base, flags);
 	}
 
 	return pkts;
@@ -2040,13 +2058,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 #else
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			   const uint16_t flags)
 {
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
+	RTE_SET_USED(base);
 	return 0;
 }
 #endif
diff --git a/drivers/net/cnxk/cn10k_tx_mseg.c b/drivers/net/cnxk/cn10k_tx_mseg.c
index 33f6754722..4ea4c8a4e5 100644
--- a/drivers/net/cnxk/cn10k_tx_mseg.c
+++ b/drivers/net/cnxk/cn10k_tx_mseg.c
@@ -18,7 +18,8 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
-						(flags) | NIX_TX_MULTI_SEG_F); \
+						0, (flags)		       \
+							| NIX_TX_MULTI_SEG_F); \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 34e3737501..a0350496ab 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -18,7 +18,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
-						  (flags));                    \
+						  0, (flags));                 \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
index 1fad81dbad..7f98f79b97 100644
--- a/drivers/net/cnxk/cn10k_tx_vec_mseg.c
+++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
@@ -16,7 +16,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(                             \
-			tx_queue, tx_pkts, pkts, cmd,                          \
+			tx_queue, tx_pkts, pkts, cmd, 0,                       \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}
 
-- 
2.17.1
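
As a reading aid for the vector branch in cn10k_sso_hws_event_tx above:
the first 64-bit word of the event vector doubles as Tx metadata. Below
is a minimal decode sketch, assuming the field layout the code uses;
the helper name is hypothetical and not part of the driver:

	#include <stdint.h>

	/* bits 0..15: mbuf count, bit 31: all mbufs share one
	 * port/queue pair, bits 32..47: port, bits 48..63: queue.
	 */
	static inline void
	vec_meta_decode_example(uint64_t meta, uint16_t *nb,
				uint16_t *port, uint16_t *queue,
				int *uniform)
	{
		*nb = meta & 0xFFFF;
		*uniform = !!(meta & (UINT64_C(1) << 31));
		*port = (meta >> 32) & 0xFFFF;
		*queue = (uint16_t)(meta >> 48);
	}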


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine
  2021-06-19 11:01 ` [dpdk-dev] [PATCH v2 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                     ` (11 preceding siblings ...)
  2021-06-19 11:01   ` [dpdk-dev] [PATCH v2 13/13] event/cnxk: add Tx " pbhagavatula
@ 2021-06-20 20:28   ` pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
                       ` (14 more replies)
  12 siblings, 15 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a multi-segment Rx vector routine: form the primary mbufs on the
vector path and switch to the scalar path when extracting segments.
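
As an illustration of that split, a minimal sketch (rx0..rx3, mbuf0..
mbuf3 and nb_pkts are stand-ins; nix_cqe_xtract_mseg is the driver's
scalar helper added below):

	for (i = 0; i + 4 <= nb_pkts; i += 4) {
		/* NEON path fills rearm_data/ol_flags of mbuf0..mbuf3 */

		/* Segment chains are irregular, so walk them per mbuf
		 * in scalar mode.
		 */
		nix_cqe_xtract_mseg(rx0, mbuf0, mbuf_initializer);
		nix_cqe_xtract_mseg(rx1, mbuf1, mbuf_initializer);
		nix_cqe_xtract_mseg(rx2, mbuf2, mbuf_initializer);
		nix_cqe_xtract_mseg(rx3, mbuf3, mbuf_initializer);
	}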

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 Depends-on: http://patches.dpdk.org/project/dpdk/list/?series=17394

 v3 Changes:
 - Spell check.

 drivers/net/cnxk/cn10k_rx.c          | 31 +++++++++++------
 drivers/net/cnxk/cn10k_rx.h          | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn10k_rx_vec_mseg.c | 17 ++++++++++
 drivers/net/cnxk/cn9k_rx.c           | 31 +++++++++++------
 drivers/net/cnxk/cn9k_rx.h           | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn9k_rx_vec_mseg.c  | 18 ++++++++++
 drivers/net/cnxk/meson.build         |  2 ++
 7 files changed, 157 insertions(+), 44 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_rx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_rx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 5c956c06b4..3a9fd71309 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 1cc37cbaa0..5926ff7f46 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -119,8 +119,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) + ((rx->desc_sizem1 + 1) << 1));
@@ -195,15 +202,14 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags = nix_update_match_id(rx->match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -481,16 +487,34 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled, build mseg list for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -645,6 +669,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_##name(      \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
new file mode 100644
index 0000000000..04d1e46c82
--- /dev/null
+++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
+					  (flags) | NIX_RX_MULTI_SEG_F);       \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index 0acedd0a1f..d293d4eac3 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 10ef5c6905..5ae9e8195c 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -120,8 +120,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) +
@@ -198,15 +205,14 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -484,16 +490,34 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled, build mseg list for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -647,6 +671,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn9k_rx_vec_mseg.c b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
new file mode 100644
index 0000000000..e46d8a4749
--- /dev/null
+++ b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
+						 (flags) |                     \
+							 NIX_RX_MULTI_SEG_F);  \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
index 2071d0dcb2..aa8c7253fb 100644
--- a/drivers/net/cnxk/meson.build
+++ b/drivers/net/cnxk/meson.build
@@ -23,6 +23,7 @@ sources += files('cn9k_ethdev.c',
 		 'cn9k_rx.c',
 		 'cn9k_rx_mseg.c',
 		 'cn9k_rx_vec.c',
+		 'cn9k_rx_vec_mseg.c',
 		 'cn9k_tx.c',
 		 'cn9k_tx_mseg.c',
 		 'cn9k_tx_vec.c')
@@ -32,6 +33,7 @@ sources += files('cn10k_ethdev.c',
 		 'cn10k_rx.c',
 		 'cn10k_rx_mseg.c',
 		 'cn10k_rx_vec.c',
+		 'cn10k_rx_vec_mseg.c',
 		 'cn10k_tx.c',
 		 'cn10k_tx_mseg.c',
 		 'cn10k_tx_vec.c')
--
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 02/13] net/cnxk: enable ptp processing in vector Rx
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
@ 2021-06-20 20:28     ` pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 03/13] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
                       ` (13 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Rx burst function: use the vector
path for processing mbufs and switch to the scalar path only when
extracting the timestamp.
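
A scalar equivalent of what the vector lanes compute per mbuf, as a
sketch only: it assumes every packet carries the 8-byte big-endian
CGX timestamp at the start of packet data, whereas the real path
gates the ol_flags update on the L2 timesync ptype
(example_rx_tstamp is a hypothetical helper):

	static inline void
	example_rx_tstamp(struct rte_mbuf *m,
			  struct cnxk_timesync_info *ti)
	{
		/* Timestamp is prepended to the packet data. */
		uint64_t ts = rte_be_to_cpu_64(
			*rte_pktmbuf_mtod(m, uint64_t *));

		/* Hide the timestamp bytes from the application. */
		m->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
		m->data_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
		*cnxk_nix_timestamp_dynfield(m, ti) = ts;
		m->ol_flags |= PKT_RX_IEEE1588_PTP | PKT_RX_IEEE1588_TMST;
	}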

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_ethdev.c |   1 -
 drivers/net/cnxk/cn10k_rx.c     |   5 +-
 drivers/net/cnxk/cn10k_rx.h     | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_rx_vec.c |   3 -
 drivers/net/cnxk/cn9k_ethdev.c  |   1 -
 drivers/net/cnxk/cn9k_rx.c      |   5 +-
 drivers/net/cnxk/cn9k_rx.h      | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_rx_vec.c  |   3 -
 drivers/net/cnxk/cnxk_ethdev.h  |  19 ++---
 9 files changed, 232 insertions(+), 53 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index b079edbd35..7caec6cf14 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -301,7 +301,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 3a9fd71309..69e767ac3d 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -75,10 +75,7 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 5926ff7f46..d9572b19e7 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -109,7 +109,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -125,8 +125,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -207,7 +209,7 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -272,8 +274,9 @@ cn10k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				      flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -469,6 +472,99 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15*/
+				0,			     /* pktlen 16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from be to cpu byteorder. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert the vector mask to a scalar mask */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -496,17 +592,17 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn10k_rx_vec.c b/drivers/net/cnxk/cn10k_rx_vec.c
index 65ffa97841..93528a44f9 100644
--- a/drivers/net/cnxk/cn10k_rx_vec.c
+++ b/drivers/net/cnxk/cn10k_rx_vec.c
@@ -11,9 +11,6 @@
 					       struct rte_mbuf **rx_pkts,      \
 					       uint16_t pkts)                  \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
 						  (flags));		       \
 	}
diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index 107a540915..cb302b75d8 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -309,7 +309,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index d293d4eac3..7d9f1bd61f 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -75,10 +75,7 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 5ae9e8195c..beb52f39d5 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -110,7 +110,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -126,8 +126,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -210,7 +212,7 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -275,8 +277,9 @@ cn9k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				     flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -472,6 +475,99 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15*/
+				0,			     /* pktlen 16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from be to cpu byteorder. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert the vector mask to a scalar mask */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -499,17 +595,17 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn9k_rx_vec.c b/drivers/net/cnxk/cn9k_rx_vec.c
index e61c2225c6..ef5f771ef7 100644
--- a/drivers/net/cnxk/cn9k_rx_vec.c
+++ b/drivers/net/cnxk/cn9k_rx_vec.c
@@ -9,9 +9,6 @@
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
 						 (flags));                     \
 	}
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 67b1f42531..4eead03905 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -136,13 +136,12 @@ struct cnxk_eth_qconf {
 };
 
 struct cnxk_timesync_info {
+	uint8_t rx_ready;
+	uint64_t rx_tstamp;
 	uint64_t rx_tstamp_dynflag;
+	int tstamp_dynfield_offset;
 	rte_iova_t tx_tstamp_iova;
 	uint64_t *tx_tstamp;
-	uint64_t rx_tstamp;
-	int tstamp_dynfield_offset;
-	uint8_t tx_ready;
-	uint8_t rx_ready;
 } __plt_cache_aligned;
 
 struct cnxk_eth_dev {
@@ -465,13 +464,15 @@ cnxk_nix_timestamp_dynfield(struct rte_mbuf *mbuf,
 
 static __rte_always_inline void
 cnxk_nix_mbuf_to_tstamp(struct rte_mbuf *mbuf,
-			struct cnxk_timesync_info *tstamp, bool ts_enable,
+			struct cnxk_timesync_info *tstamp,
+			const uint8_t ts_enable, const uint8_t mseg_enable,
 			uint64_t *tstamp_ptr)
 {
-	if (ts_enable &&
-	    (mbuf->data_off ==
-	     RTE_PKTMBUF_HEADROOM + CNXK_NIX_TIMESYNC_RX_OFFSET)) {
-		mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+	if (ts_enable) {
+		if (!mseg_enable) {
+			mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+			mbuf->data_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+		}
 
 		/* Reading the rx timestamp inserted by CGX, viz at
 		 * starting of the packet data.
-- 
2.17.1
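
A scalar note on the mask trick used in the hunks above: res carries
one bit per packet (0x1, 0x2, 0x4, 0x8) that matched
RTE_PTYPE_L2_ETHER_TIMESYNC, and 31 - __builtin_clz(res) indexes the
highest set bit, i.e. the most recently received matching packet,
whose timestamp then updates the Rx queue. For example:

	uint8_t res = 0x5;			/* packets 0 and 2 matched */
	int latest = 31 - __builtin_clz(res);	/* latest == 2 */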


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 03/13] net/cnxk: enable VLAN processing in vector Tx
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
@ 2021-06-20 20:28     ` pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 04/13] net/cnxk: enable ptp " pbhagavatula
                       ` (12 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable VLAN offload in the vector Tx burst function.
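
The vector code packs both TCIs into the send extension header word;
below is a scalar sketch of that layout (example_sendext_w1 is a
hypothetical helper; the bit positions follow the patch; the vector
path ORs both TCIs in unconditionally and gates only the enable bits,
while gating both here keeps the sketch simple):

	static inline uint64_t
	example_sendext_w1(const struct rte_mbuf *m)
	{
		uint64_t w1 = 0;

		/* VLAN0 (outer): TCI at bits 8..23, enable bit 48. */
		if (m->ol_flags & PKT_TX_QINQ)
			w1 |= BIT_ULL(48) |
			      ((uint64_t)m->vlan_tci_outer << 8);
		/* VLAN1 (inner): TCI at bits 32..47, enable bit 49. */
		if (m->ol_flags & PKT_TX_VLAN)
			w1 |= BIT_ULL(49) |
			      ((uint64_t)m->vlan_tci << 32);
		return w1;
	}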

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.c     |   3 +-
 drivers/net/cnxk/cn10k_tx.h     | 125 +++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_tx_vec.c |   3 +-
 drivers/net/cnxk/cn9k_tx.c      |   3 +-
 drivers/net/cnxk/cn9k_tx.h      | 128 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   3 +-
 6 files changed, 227 insertions(+), 38 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 18694dc704..05bc163a40 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -69,8 +69,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8b1446f25c..1e16978584 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -62,9 +62,14 @@ cn10k_nix_tx_ext_subs(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 {
-	RTE_SET_USED(flags);
-	/* We can pack up to 4 packets per LMTLINE if there are no offloads. */
-	return 4 << ROC_LMT_LINES_PER_CORE_LOG2;
+	return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
+	       << ROC_LMT_LINES_PER_CORE_LOG2;
+}
+
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line(const uint16_t flags)
+{
+	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
 }
 
 static __rte_always_inline uint64_t
@@ -98,10 +103,9 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
-	const uint64_t dw_m1 = 0x7;
+	const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
 	uint64_t data;
 
-	RTE_SET_USED(flags);
 	/* This will be moved to addr area */
 	data = dw_m1;
 	/* 15 vector sizes for single seg */
@@ -690,11 +694,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -720,6 +727,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
 	sgdesc23_w0 = sgdesc01_w0;
 
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+	}
+
 	/* Get LMT base address and LMT ID as lcore id */
 	ROC_LMT_BASE_ID_GET(laddr, lmt_id);
 	left = pkts;
@@ -738,6 +753,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1303,6 +1325,52 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for vlan. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1381,16 +1449,41 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		/* Store the prepared send desc to LMT lines */
-		vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
-		lnum += 1;
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+			lnum += 1;
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			lnum += 1;
+		} else {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
+			lnum += 1;
+		}
 
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 7453f3bc98..beb5c649bb 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index b802606075..4b43cdaff9 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -68,8 +68,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 1899d6670f..d5715bb52d 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -552,10 +552,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -585,8 +588,19 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	senddesc23_w0 = senddesc01_w0;
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-	sgdesc23_w0 = sgdesc01_w0;
+
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
+		sgdesc23_w0 = sgdesc01_w0;
+	} else {
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sgdesc23_w0 = sgdesc01_w0;
+	}
 
 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
@@ -597,6 +611,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1162,6 +1183,52 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for vlan. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1247,17 +1314,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		do {
-			vst1q_u64(lmt_addr, cmd0[0]);
-			vst1q_u64(lmt_addr + 2, cmd1[0]);
-			vst1q_u64(lmt_addr + 4, cmd0[1]);
-			vst1q_u64(lmt_addr + 6, cmd1[1]);
-			vst1q_u64(lmt_addr + 8, cmd0[2]);
-			vst1q_u64(lmt_addr + 10, cmd1[2]);
-			vst1q_u64(lmt_addr + 12, cmd0[3]);
-			vst1q_u64(lmt_addr + 14, cmd1[3]);
-			lmt_status = roc_lmt_submit_ldeor(io_addr);
-		} while (lmt_status == 0);
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* With ext header in the command we can no longer send
+			 * all 4 packets together since LMTLINE is 128 bytes.
+			 * Split and Tx twice.
+			 */
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd2[0]);
+				vst1q_u64(lmt_addr + 4, cmd1[0]);
+				vst1q_u64(lmt_addr + 6, cmd0[1]);
+				vst1q_u64(lmt_addr + 8, cmd2[1]);
+				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+
+			do {
+				vst1q_u64(lmt_addr, cmd0[2]);
+				vst1q_u64(lmt_addr + 2, cmd2[2]);
+				vst1q_u64(lmt_addr + 4, cmd1[2]);
+				vst1q_u64(lmt_addr + 6, cmd0[3]);
+				vst1q_u64(lmt_addr + 8, cmd2[3]);
+				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		} else {
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd1[0]);
+				vst1q_u64(lmt_addr + 4, cmd0[1]);
+				vst1q_u64(lmt_addr + 6, cmd1[1]);
+				vst1q_u64(lmt_addr + 8, cmd0[2]);
+				vst1q_u64(lmt_addr + 10, cmd1[2]);
+				vst1q_u64(lmt_addr + 12, cmd0[3]);
+				vst1q_u64(lmt_addr + 14, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		}
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index a6e7c9e542..5842facb58 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
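
For reference, the NEON mask generation in the hunk above is the vectorized
form of the following per-packet logic (a minimal scalar sketch;
vlan_enable_bits() is illustrative and not a driver function):

	static inline uint64_t
	vlan_enable_bits(uint64_t ol_flags)
	{
		uint64_t w1 = 0;

		/* PKT_TX_VLAN -> bit 49 (VLAN1 enable; VLAN1 holds vlan_tci) */
		if (ol_flags & PKT_TX_VLAN)
			w1 |= 1ULL << 49;
		/* PKT_TX_QINQ -> bit 48 (VLAN0 enable; VLAN0 holds vlan_tci_outer) */
		if (ol_flags & PKT_TX_QINQ)
			w1 |= 1ULL << 48;
		return w1;
	}

vtstq_u64() returns an all-ones lane wherever the tested flag is set;
ANDing with mlv/mlq narrows that lane to exactly these enable bits, which
are then ORed into sendext01_w1/sendext23_w1.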
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 04/13] net/cnxk: enable ptp processing in vector Tx
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 02/13] net/cnxk: enable ptp processing in vector Rx pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 03/13] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
@ 2021-06-20 20:28     ` pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 05/13] net/cnxk: enable TSO " pbhagavatula
                       ` (11 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Tx burst function. Since we can
no longer use a single LMT line for a burst of 4, split the LMT
line into two and transmit twice.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
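For reference, the arithmetic behind the split (inferred from the LMT
stores in the diff below, not new code):

	/* With PTP enabled, each packet needs four 16 B sub-descriptors:
	 * SEND_HDR, SEND_EXT, SG and SEND_MEM (cmd0/cmd2/cmd1/cmd3), i.e.
	 * 64 B per packet. An LMTLINE is 128 B and thus fits only 2 of the
	 * 4 packets per loop, so the burst takes two LMT lines (cn10k) or
	 * two LMTST submissions (cn9k) instead of one.
	 */
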
 drivers/net/cnxk/cn10k_tx.c     |   4 +-
 drivers/net/cnxk/cn10k_tx.h     | 109 +++++++++++++++++++++++++++-----
 drivers/net/cnxk/cn10k_tx_vec.c |   5 +-
 drivers/net/cnxk/cn9k_tx.c      |   4 +-
 drivers/net/cnxk/cn9k_tx.h      | 105 ++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   5 +-
 6 files changed, 192 insertions(+), 40 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 05bc163a40..c4c3e65704 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,9 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 1e16978584..8af6799ff6 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -69,7 +69,9 @@ cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
 {
-	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
+	return (flags & NIX_TX_NEED_EXT_HDR) ?
+			     ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
+			     8;
 }
 
 static __rte_always_inline uint64_t
@@ -695,13 +697,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -733,6 +737,12 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w0 = sendext01_w0;
 		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
 		sendext23_w1 = sendext01_w1;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	}
 
 	/* Get LMT base address and LMT ID as lcore id */
@@ -760,6 +770,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1371,6 +1392,44 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if timestamp is requested and generate inverted
+			 * mask as we need not make any changes to the default
+			 * cmd value.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1458,19 +1517,39 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 		if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* Store the prepared send desc to LMT lines */
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
-			lnum += 1;
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
+			} else {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			}
 			lnum += 1;
 		} else {
 			/* Store the prepared send desc to LMT lines */
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index beb5c649bb..0b4a4c7bae 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index 4b43cdaff9..c32681ed44 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,9 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index d5715bb52d..cb574a1c1d 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -553,12 +553,14 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -597,6 +599,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w1 = sendext01_w1;
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
 		sgdesc23_w0 = sgdesc01_w0;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	} else {
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
 		sgdesc23_w0 = sgdesc01_w0;
@@ -618,6 +626,17 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1229,6 +1248,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if timestamp is requested and generate inverted
+			 * mask as we need not make any changes to the default
+			 * cmd value.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1327,22 +1384,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * Split and Tx twice.
 			 */
 			do {
-				vst1q_u64(lmt_addr, cmd0[0]);
-				vst1q_u64(lmt_addr + 2, cmd2[0]);
-				vst1q_u64(lmt_addr + 4, cmd1[0]);
-				vst1q_u64(lmt_addr + 6, cmd0[1]);
-				vst1q_u64(lmt_addr + 8, cmd2[1]);
-				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd3[0]);
+					vst1q_u64(lmt_addr + 8, cmd0[1]);
+					vst1q_u64(lmt_addr + 10, cmd2[1]);
+					vst1q_u64(lmt_addr + 12, cmd1[1]);
+					vst1q_u64(lmt_addr + 14, cmd3[1]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd0[1]);
+					vst1q_u64(lmt_addr + 8, cmd2[1]);
+					vst1q_u64(lmt_addr + 10, cmd1[1]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 
 			do {
-				vst1q_u64(lmt_addr, cmd0[2]);
-				vst1q_u64(lmt_addr + 2, cmd2[2]);
-				vst1q_u64(lmt_addr + 4, cmd1[2]);
-				vst1q_u64(lmt_addr + 6, cmd0[3]);
-				vst1q_u64(lmt_addr + 8, cmd2[3]);
-				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd3[2]);
+					vst1q_u64(lmt_addr + 8, cmd0[3]);
+					vst1q_u64(lmt_addr + 10, cmd2[3]);
+					vst1q_u64(lmt_addr + 12, cmd1[3]);
+					vst1q_u64(lmt_addr + 14, cmd3[3]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd0[3]);
+					vst1q_u64(lmt_addr + 8, cmd2[3]);
+					vst1q_u64(lmt_addr + 10, cmd1[3]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 		} else {
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 5842facb58..9ade66db2b 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 05/13] net/cnxk: enable TSO processing in vector Tx
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (2 preceding siblings ...)
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 04/13] net/cnxk: enable ptp " pbhagavatula
@ 2021-06-20 20:28     ` pbhagavatula
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 06/13] net/cnxk: add multi seg Tx vector routine pbhagavatula
                       ` (10 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable TSO offload in the vector Tx burst function.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
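One idiom in cn9k/cn10k_nix_prepare_tso() is worth spelling out: the LSO
start byte is computed with a branchless select (a sketch using names from
the patch):

	/* mask = -(!w1->il3type):
	 *   il3type == 0 (no inner L3, plain TSO) -> mask = all-ones
	 *   il3type != 0 (tunneled TSO)           -> mask = all-zeros
	 *
	 * lso_sb = (mask & ol4ptr) + (~mask & il4ptr) + l4_len
	 *        = (il3type ? il4ptr : ol4ptr) + l4_len
	 *
	 * i.e. segmentation starts right after the innermost L4 header.
	 */
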
 drivers/net/cnxk/cn10k_tx.c     |  2 +-
 drivers/net/cnxk/cn10k_tx.h     | 97 +++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn10k_tx_vec.c |  5 +-
 drivers/net/cnxk/cn9k_tx.c      |  2 +-
 drivers/net/cnxk/cn9k_tx.h      | 94 ++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn9k_tx_vec.c  |  5 +-
 6 files changed, 199 insertions(+), 6 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index c4c3e65704..d06879163f 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,7 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8af6799ff6..26797581e7 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -689,6 +689,46 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		      union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		      const uint64_t flags, const uint64_t lso_tun_fmt)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel tso */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+		uint8_t shift = is_udp_tun ? 32 : 0;
+
+		shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
+		shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+
+		/* Update format for UDP tunneled packet */
+		w0->lso_format = (lso_tun_fmt >> shift);
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -723,6 +763,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
 
 	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
@@ -781,6 +826,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1430,6 +1482,51 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			const uint64_t lso_fmt = txq->lso_tun_fmt;
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn10k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 0b4a4c7bae..34e3737501 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO, inner checksum is a must */                        \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&			       \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index c32681ed44..735e21cc60 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,7 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index cb574a1c1d..dca732a9fa 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -545,6 +545,43 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		     union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		     uint64_t flags)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel tso */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+		w0->lso_format += is_udp_tun ? 2 : 6;
+
+		w0->lso_format += !!(ol_flags & PKT_TX_OUTER_IPV6) << 1;
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -580,6 +617,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn9k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
+
 	/* Lets commit any changes in the packet here as no further changes
 	 * to the packet will be done unless no fast free is enabled.
 	 */
@@ -637,6 +680,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1286,6 +1336,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn9k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 9ade66db2b..56a3e2514a 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO, inner checksum is a must */                        \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 06/13] net/cnxk: add multi seg Tx vector routine
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (3 preceding siblings ...)
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 05/13] net/cnxk: enable TSO " pbhagavatula
@ 2021-06-20 20:28     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 07/13] event/cnxk: add Rx adapter support pbhagavatula
                       ` (9 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:28 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a multi-segment Tx vector routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
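A note on NIX_SEGDW_MAGIC introduced below: it is a nibble-packed lookup
table where nibble n holds the number of 16 B LMT units taken by the SG
chain of an n-segment mbuf (the units below are decoded from the constant;
the constant itself is from the patch):

	/* NIX_SEGDW_MAGIC = 0x76654432210ULL
	 * NIX_NB_SEGS_TO_SEGDW(n) = (MAGIC >> (n << 2)) & 0xF
	 *
	 *   nb_segs : 1  2  3  4  5  6  7  8  9  10
	 *   units   : 1  2  2  3  4  4  5  6  6   7
	 *
	 * which matches ceil((n + ceil(n / 3)) / 2): n pointer words plus
	 * one SEND_SG header word per 3 segments, two words per 16 B unit.
	 */

The (segdw[j] + segdw[j + 1]) <= 8 checks in the new helpers use the same
unit: a 128 B LMTLINE holds 8 such units.
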
 drivers/net/cnxk/cn10k_tx.c          |  20 +-
 drivers/net/cnxk/cn10k_tx.h          | 388 +++++++++++++++++++++++++--
 drivers/net/cnxk/cn10k_tx_vec_mseg.c |  24 ++
 drivers/net/cnxk/cn9k_tx.c           |  20 +-
 drivers/net/cnxk/cn9k_tx.h           | 272 ++++++++++++++++++-
 drivers/net/cnxk/cn9k_tx_vec_mseg.c  |  24 ++
 drivers/net/cnxk/meson.build         |   6 +-
 7 files changed, 709 insertions(+), 45 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_tx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_tx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index d06879163f..1f30bab59a 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,13 +67,23 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena)
+	const eth_tx_burst_t nix_eth_tx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_xmit_pkts_vec_mseg_##name,
+
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	if (dev->scalar_ena) {
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
-	else
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+	} else {
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
-
-	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_vec_burst_mseg);
+	}
 
 	rte_mb();
 }
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 26797581e7..532b53b319 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -42,6 +42,13 @@
 		}                                                              \
 	} while (0)
 
+/* Lookup macro mapping number of segments to number of dwords; the dword
+ * count for each value of nb_segs is encoded as a 4-bit nibble.
+ */
+#define NIX_SEGDW_MAGIC 0x76654432210ULL
+
+#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
+
 #define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
 	(void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
 
@@ -102,6 +109,14 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 	return data;
 }
 
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
+{
+	return ((flags & NIX_TX_NEED_EXT_HDR) ?
+			      (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
+			      4);
+}
+
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
@@ -729,7 +744,244 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline void
+cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
+				union nix_send_hdr_w0_u *sh,
+				union nix_send_sg_s *sg, const uint32_t flags)
+{
+	struct rte_mbuf *m_next;
+	uint64_t *slist, sg_u;
+	uint16_t nb_segs;
+	int i = 1;
+
+	sh->total = m->pkt_len;
+	/* Clear sg->u header before use */
+	sg->u &= 0xFC00000000000000;
+	sg_u = sg->u;
+	slist = &cmd[0];
+
+	sg_u = sg_u | ((uint64_t)m->data_len);
+
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+
+	m = m_next;
+	/* Fill mbuf segments */
+	do {
+		m_next = m->next;
+		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
+		*slist = rte_mbuf_data_iova(m);
+		/* Set invert df if buffer is not to be freed by H/W */
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			/* Mark mempool object as "put" since it is freed by NIX
+			 */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (!(sg_u & (1ULL << (i + 55))))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		slist++;
+		i++;
+		nb_segs--;
+		if (i > 2 && nb_segs) {
+			i = 0;
+			/* Next SG subdesc */
+			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
+			sg->u = sg_u;
+			sg->segs = 3;
+			sg = (union nix_send_sg_s *)slist;
+			sg_u = sg->u;
+			slist++;
+		}
+		m = m_next;
+	} while (nb_segs);
+
+	sg->u = sg_u;
+	sg->segs = i;
+}
+
+static __rte_always_inline void
+cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
+			   uint64x2_t *cmd1, const uint8_t segdw,
+			   const uint32_t flags)
+{
+	union nix_send_hdr_w0_u sh;
+	union nix_send_sg_s sg;
+
+	if (m->nb_segs == 1) {
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+			sg.u = vgetq_lane_u64(cmd1[0], 0);
+			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
+			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+		}
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		sg.u = vgetq_lane_u64(cmd1[0], 0);
+		if (!(sg.u & (1ULL << 55)))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		return;
+	}
+
+	sh.u = vgetq_lane_u64(cmd0[0], 0);
+	sg.u = vgetq_lane_u64(cmd1[0], 0);
+
+	cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
+
+	sh.sizem1 = segdw - 1;
+	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
+	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+}
+
 #define NIX_DESCS_PER_LOOP 4
+
+static __rte_always_inline uint8_t
+cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
+			       uint64x2_t *cmd1, uint64x2_t *cmd2,
+			       uint64x2_t *cmd3, uint8_t *segdw,
+			       uint64_t *lmt_addr, __uint128_t *data128,
+			       uint8_t *shift, const uint16_t flags)
+{
+	uint8_t j, off, lmt_used;
+
+	if (!(flags & NIX_TX_NEED_EXT_HDR) &&
+	    !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+		/* No segments in 4 consecutive packets. */
+		if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
+				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+			vst1q_u64(lmt_addr, cmd0[0]);
+			vst1q_u64(lmt_addr + 2, cmd1[0]);
+			vst1q_u64(lmt_addr + 4, cmd0[1]);
+			vst1q_u64(lmt_addr + 6, cmd1[1]);
+			vst1q_u64(lmt_addr + 8, cmd0[2]);
+			vst1q_u64(lmt_addr + 10, cmd1[2]);
+			vst1q_u64(lmt_addr + 12, cmd0[3]);
+			vst1q_u64(lmt_addr + 14, cmd1[3]);
+
+			*data128 |= ((__uint128_t)7) << *shift;
+			*shift += 3;
+
+			return 1;
+		}
+	}
+
+	lmt_used = 0;
+	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
+		/* Fit consecutive packets in same LMTLINE. */
+		if ((segdw[j] + segdw[j + 1]) <= 8) {
+			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				/* TSTAMP takes 4 each, no segs. */
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				vst1q_u64(lmt_addr + 6, cmd3[j]);
+
+				vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
+				vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				/* EXT headers take 3 each, space for 2 segs. */
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				off = segdw[j] - 3;
+				off <<= 1;
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
+							   lmt_addr + 12 + off,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
+			} else {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 4,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				off = segdw[j] - 2;
+				off <<= 1;
+				cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
+							   lmt_addr + 8 + off,
+							   &cmd0[j + 1],
+							   &cmd1[j + 1],
+							   segdw[j + 1], flags);
+				vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
+			}
+			*data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
+				    << *shift;
+			*shift += 3;
+			j += 2;
+		} else {
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				off = segdw[j] - 4;
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 6,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+			} else {
+				cn10k_nix_prepare_mseg_vec(mbufs[j],
+							   lmt_addr + 4,
+							   &cmd0[j], &cmd1[j],
+							   segdw[j], flags);
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+			}
+			*data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
+			*shift += 3;
+			j++;
+		}
+		lmt_used++;
+		lmt_addr += 16;
+	}
+
+	return lmt_used;
+}
+
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
@@ -738,7 +990,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
 		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
-	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
+	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
@@ -746,6 +998,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sendmem01_w0, sendmem23_w0;
 	uint64x2_t sendmem01_w1, sendmem23_w1;
+	uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -754,7 +1007,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t ltypes01, ltypes23;
 	uint64x2_t xtmp128, ytmp128;
 	uint64x2_t xmask01, xmask23;
-	uint8_t lnum;
+	uint8_t lnum, shift;
+	union wdata {
+		__uint128_t data128;
+		uint64_t data[2];
+	} wd;
 
 	NIX_XMIT_FC_OR_RETURN(txq, pkts);
 
@@ -798,8 +1055,43 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
 			      cn10k_nix_pkts_per_vec_brst(flags) :
 			      left;
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		wd.data128 = 0;
+		shift = 16;
+	}
 	lnum = 0;
+
 	for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint8_t j;
+
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
+				struct rte_mbuf *m = tx_pkts[j];
+				/* Get dwords based on nb_segs. */
+				segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
+				/* Add dwords based on offloads. */
+				segdw[j] += 1 + /* SEND HDR */
+					    !!(flags & NIX_TX_NEED_EXT_HDR) +
+					    !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+			}
+
+			/* Check if there are enough LMTLINES for this loop */
+			if (lnum + 4 > 32) {
+				uint8_t ldwords_con = 0, lneeded = 0;
+				for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
+					ldwords_con += segdw[j];
+					if (ldwords_con > 8) {
+						lneeded += 1;
+						ldwords_con = segdw[j];
+					}
+				}
+				lneeded += 1;
+				if (lnum + lneeded > 32) {
+					burst = i;
+					break;
+				}
+			}
+		}
 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
 		senddesc01_w0 =
 			vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
@@ -1527,7 +1819,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w0 = vld1q_u64(sx_w0 + 2);
 		}
 
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
+		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
 			xmask23 = xmask01;
@@ -1567,7 +1860,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 					(void **)&mbuf3, 1, 0);
 			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
-		} else {
+		} else if (!(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
 			mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1612,7 +1905,19 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
 		}
 
-		if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint8_t j;
+
+			segdw[4] = 8;
+			j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
+							  cmd2, cmd3, segdw,
+							  (uint64_t *)
+							  LMT_OFF(laddr, lnum,
+								  0),
+							  &wd.data128, &shift,
+							  flags);
+			lnum += j;
+		} else if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* Store the prepared send desc to LMT lines */
 			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
 				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
@@ -1664,34 +1969,55 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
+	if (flags & NIX_TX_MULTI_SEG_F)
+		wd.data[0] >>= 16;
+
 	/* Trigger LMTST */
 	if (lnum > 16) {
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= (15ULL << 12);
-		data |= (uint64_t)lmt_id;
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[0] & 0x7) << 4;
+		wd.data[0] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[0] <<= 16;
+
+		wd.data[0] |= (15ULL << 12);
+		wd.data[0] |= (uint64_t)lmt_id;
 
 		/* STEOR0 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[0], pa);
 
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= ((uint64_t)(lnum - 17)) << 12;
-		data |= (uint64_t)(lmt_id + 16);
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[1] & 0x7) << 4;
+		wd.data[1] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[1] <<= 16;
+
+		wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
+		wd.data[1] |= (uint64_t)(lmt_id + 16);
 
 		/* STEOR1 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[1], pa);
 	} else if (lnum) {
-		data = cn10k_nix_tx_steor_vec_data(flags);
-		pa = io_addr | (data & 0x7) << 4;
-		data &= ~0x7ULL;
-		data |= ((uint64_t)(lnum - 1)) << 12;
-		data |= lmt_id;
+		if (!(flags & NIX_TX_MULTI_SEG_F))
+			wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
+
+		pa = io_addr | (wd.data[0] & 0x7) << 4;
+		wd.data[0] &= ~0x7ULL;
+
+		if (flags & NIX_TX_MULTI_SEG_F)
+			wd.data[0] <<= 16;
+
+		wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
+		wd.data[0] |= lmt_id;
 
 		/* STEOR0 */
-		roc_lmt_submit_steorl(data, pa);
+		roc_lmt_submit_steorl(wd.data[0], pa);
 	}
 
 	left -= burst;
@@ -1699,9 +2025,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (left)
 		goto again;
 
-	if (unlikely(scalar))
-		pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar, cmd,
-					    flags);
+	if (unlikely(scalar)) {
+		if (flags & NIX_TX_MULTI_SEG_F)
+			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
+							 scalar, cmd, flags);
+		else
+			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
+						    cmd, flags);
+	}
 
 	return pkts;
 }
@@ -1866,7 +2197,10 @@ T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum,	1, 1, 1, 1, 1, 1,	8,	\
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
-		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 
 NIX_TX_FASTPATH_MODES
 #undef T
diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
new file mode 100644
index 0000000000..1fad81dbad
--- /dev/null
+++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_ethdev.h"
+#include "cn10k_tx.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		uint64_t cmd[sz];                                              \
+									       \
+		/* For TSO, inner checksum is a must */                        \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
+			return 0;                                              \
+		return cn10k_nix_xmit_pkts_vector(                             \
+			tx_queue, tx_pkts, pkts, cmd,                          \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index 735e21cc60..763f9a14fd 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,13 +66,23 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena)
+	const eth_tx_burst_t nix_eth_tx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)			       \
+	[f5][f4][f3][f2][f1][f0] = cn9k_nix_xmit_pkts_vec_mseg_##name,
+
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	if (dev->scalar_ena) {
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
-	else
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+	} else {
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
-
-	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_vec_burst_mseg);
+	}
 
 	rte_mb();
 }
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index dca732a9fa..ed65cd351f 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -582,7 +582,238 @@ cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline uint8_t
+cn9k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
+			       union nix_send_hdr_w0_u *sh,
+			       union nix_send_sg_s *sg, const uint32_t flags)
+{
+	struct rte_mbuf *m_next;
+	uint64_t *slist, sg_u;
+	uint16_t nb_segs;
+	uint64_t segdw;
+	int i = 1;
+
+	sh->total = m->pkt_len;
+	/* Clear sg->u header before use */
+	sg->u &= 0xFC00000000000000;
+	sg_u = sg->u;
+	slist = &cmd[0];
+
+	sg_u = sg_u | ((uint64_t)m->data_len);
+
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+
+	m = m_next;
+	/* Fill mbuf segments */
+	do {
+		m_next = m->next;
+		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
+		*slist = rte_mbuf_data_iova(m);
+		/* Set invert df if buffer is not to be freed by H/W */
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			/* Mark mempool object as "put" since it is freed by NIX
+			 */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (!(sg_u & (1ULL << (i + 55))))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		slist++;
+		i++;
+		nb_segs--;
+		if (i > 2 && nb_segs) {
+			i = 0;
+			/* Next SG subdesc */
+			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
+			sg->u = sg_u;
+			sg->segs = 3;
+			sg = (union nix_send_sg_s *)slist;
+			sg_u = sg->u;
+			slist++;
+		}
+		m = m_next;
+	} while (nb_segs);
+
+	sg->u = sg_u;
+	sg->segs = i;
+	segdw = (uint64_t *)slist - (uint64_t *)&cmd[0];
+
+	segdw += 2;
+	/* Roundup extra dwords to multiple of 2 */
+	segdw = (segdw >> 1) + (segdw & 0x1);
+	/* Default dwords */
+	segdw += 1 + !!(flags & NIX_TX_NEED_EXT_HDR) +
+		 !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+	sh->sizem1 = segdw - 1;
+
+	return segdw;
+}
+
+static __rte_always_inline uint8_t
+cn9k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
+			  uint64x2_t *cmd1, const uint32_t flags)
+{
+	union nix_send_hdr_w0_u sh;
+	union nix_send_sg_s sg;
+	uint8_t ret;
+
+	if (m->nb_segs == 1) {
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+			sg.u = vgetq_lane_u64(cmd1[0], 0);
+			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
+			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+		}
+
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		sg.u = vgetq_lane_u64(cmd1[0], 0);
+		if (!(sg.u & (1ULL << 55)))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();
+#endif
+		return 2 + !!(flags & NIX_TX_NEED_EXT_HDR) +
+		       !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
+	}
+
+	sh.u = vgetq_lane_u64(cmd0[0], 0);
+	sg.u = vgetq_lane_u64(cmd1[0], 0);
+
+	ret = cn9k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
+
+	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
+	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
+	return ret;
+}
+
 #define NIX_DESCS_PER_LOOP 4
+
+static __rte_always_inline void
+cn9k_nix_xmit_pkts_mseg_vector(uint64x2_t *cmd0, uint64x2_t *cmd1,
+			       uint64x2_t *cmd2, uint64x2_t *cmd3,
+			       uint8_t *segdw,
+			       uint64_t slist[][CNXK_NIX_TX_MSEG_SG_DWORDS - 2],
+			       uint64_t *lmt_addr, rte_iova_t io_addr,
+			       const uint32_t flags)
+{
+	uint64_t lmt_status;
+	uint8_t j, off;
+
+	if (!(flags & NIX_TX_NEED_EXT_HDR) &&
+	    !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+		/* No segments in 4 consecutive packets. */
+		if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd1[0]);
+				vst1q_u64(lmt_addr + 4, cmd0[1]);
+				vst1q_u64(lmt_addr + 6, cmd1[1]);
+				vst1q_u64(lmt_addr + 8, cmd0[2]);
+				vst1q_u64(lmt_addr + 10, cmd1[2]);
+				vst1q_u64(lmt_addr + 12, cmd0[3]);
+				vst1q_u64(lmt_addr + 14, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+
+			return;
+		}
+	}
+
+	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
+		/* Fit consecutive packets in same LMTLINE. */
+		if ((segdw[j] + segdw[j + 1]) <= 8) {
+again0:
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 4;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+
+				vst1q_u64(lmt_addr + 8 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 12 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 14 + off,
+						slist[j + 1], segdw[j + 1] - 4);
+				off += ((segdw[j + 1] - 4) << 1);
+				vst1q_u64(lmt_addr + 14 + off, cmd3[j + 1]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 3;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
+				vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 12 + off,
+						slist[j + 1], segdw[j + 1] - 3);
+			} else {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 2;
+				roc_lmt_mov_seg(lmt_addr + 4, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
+				vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
+				roc_lmt_mov_seg(lmt_addr + 8 + off,
+						slist[j + 1], segdw[j + 1] - 2);
+			}
+			lmt_status = roc_lmt_submit_ldeor(io_addr);
+			if (lmt_status == 0)
+				goto again0;
+			j += 2;
+		} else {
+again1:
+			if ((flags & NIX_TX_NEED_EXT_HDR) &&
+			    (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 4;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+				off <<= 1;
+				vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
+			} else if (flags & NIX_TX_NEED_EXT_HDR) {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd2[j]);
+				vst1q_u64(lmt_addr + 4, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 3;
+				roc_lmt_mov_seg(lmt_addr + 6, slist[j], off);
+			} else {
+				vst1q_u64(lmt_addr, cmd0[j]);
+				vst1q_u64(lmt_addr + 2, cmd1[j]);
+				/* Copy segs */
+				off = segdw[j] - 2;
+				roc_lmt_mov_seg(lmt_addr + 4, slist[j], off);
+			}
+			lmt_status = roc_lmt_submit_ldeor(io_addr);
+			if (lmt_status == 0)
+				goto again1;
+			j += 1;
+		}
+	}
+}
+
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			  uint16_t pkts, uint64_t *cmd, const uint16_t flags)
@@ -1380,7 +1611,8 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w0 = vld1q_u64(sx_w0 + 2);
 		}
 
-		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
+		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
 			xmask23 = xmask01;
@@ -1424,7 +1656,7 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * cnxk_nix_prefree_seg are written before LMTST.
 			 */
 			rte_io_wmb();
-		} else {
+		} else if (!(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Move mbufs to iova */
 			mbuf0 = (uint64_t *)tx_pkts[0];
 			mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1472,7 +1704,27 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
 		}
 
-		if (flags & NIX_TX_NEED_EXT_HDR) {
+		if (flags & NIX_TX_MULTI_SEG_F) {
+			uint64_t seg_list[NIX_DESCS_PER_LOOP]
+					 [CNXK_NIX_TX_MSEG_SG_DWORDS - 2];
+			uint8_t j, segdw[NIX_DESCS_PER_LOOP + 1];
+
+			/* Build mseg list for each packet individually. */
+			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
+				segdw[j] = cn9k_nix_prepare_mseg_vec(tx_pkts[j],
+							seg_list[j], &cmd0[j],
+							&cmd1[j], flags);
+			segdw[4] = 8;
+
+			/* Commit all changes to mbuf before LMTST. */
+			if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+				rte_io_wmb();
+
+			cn9k_nix_xmit_pkts_mseg_vector(cmd0, cmd1, cmd2, cmd3,
+						       segdw, seg_list,
+						       lmt_addr, io_addr,
+						       flags);
+		} else if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* With ext header in the command we can no longer send
 			 * all 4 packets together since LMTLINE is 128bytes.
 			 * Split and Tx twice.
@@ -1534,9 +1786,14 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
-	if (unlikely(pkts_left))
-		pkts += cn9k_nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd,
-					   flags);
+	if (unlikely(pkts_left)) {
+		if (flags & NIX_TX_MULTI_SEG_F)
+			pkts += cn9k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
+							pkts_left, cmd, flags);
+		else
+			pkts += cn9k_nix_xmit_pkts(tx_queue, tx_pkts, pkts_left,
+						   cmd, flags);
+	}
 
 	return pkts;
 }
@@ -1701,6 +1958,9 @@ T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum,	1, 1, 1, 1, 1, 1,	8,	       \
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_##name(       \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_mseg_##name(  \
 		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn9k_tx_vec_mseg.c b/drivers/net/cnxk/cn9k_tx_vec_mseg.c
new file mode 100644
index 0000000000..0256efd45a
--- /dev/null
+++ b/drivers/net/cnxk/cn9k_tx_vec_mseg.c
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_ethdev.h"
+#include "cn9k_tx.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_xmit_pkts_vec_mseg_##name(  \
+		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		uint64_t cmd[sz];                                              \
+									       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
+			return 0;                                              \
+		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
+						 (flags) |                     \
+							 NIX_TX_MULTI_SEG_F);  \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
index aa8c7253fb..361f7ce849 100644
--- a/drivers/net/cnxk/meson.build
+++ b/drivers/net/cnxk/meson.build
@@ -26,7 +26,8 @@ sources += files('cn9k_ethdev.c',
 		 'cn9k_rx_vec_mseg.c',
 		 'cn9k_tx.c',
 		 'cn9k_tx_mseg.c',
-		 'cn9k_tx_vec.c')
+		 'cn9k_tx_vec.c',
+		 'cn9k_tx_vec_mseg.c')
 # CN10K
 sources += files('cn10k_ethdev.c',
 		 'cn10k_rte_flow.c',
@@ -36,7 +37,8 @@ sources += files('cn10k_ethdev.c',
 		 'cn10k_rx_vec_mseg.c',
 		 'cn10k_tx.c',
 		 'cn10k_tx_mseg.c',
-		 'cn10k_tx_vec.c')
+		 'cn10k_tx_vec.c',
+		 'cn10k_tx_vec_mseg.c')
 
 deps += ['bus_pci', 'cryptodev', 'eventdev', 'security']
 deps += ['common_cnxk', 'mempool_cnxk']
-- 
2.17.1
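
Note on the LMTLINE packing in cn9k_nix_xmit_pkts_mseg_vector() above: an
LMTLINE is 128B, i.e. eight 16B descriptor dwords. A single-segment packet
with neither the extended header nor a timestamp needs 2 dwords (SEND_HDR +
SG), so four such packets (8 dwords) fit in one LMT submit, which is the
(segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8 fast path; the same
arithmetic lets two larger packets share a line whenever
(segdw[j] + segdw[j + 1]) <= 8.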


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 07/13] event/cnxk: add Rx adapter support
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (4 preceding siblings ...)
  2021-06-20 20:28     ` [dpdk-dev] [PATCH v3 06/13] net/cnxk: add multi seg Tx vector routine pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 08/13] event/cnxk: add Rx adapter fastpath ops pbhagavatula
                       ` (8 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao, Ray Kinsella,
	Neil Horman
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter.
Resize the cn10k work slot fastpath structure to fit within a 64B cache line.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
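 A minimal sketch of the public API path these new ops service
 (illustrative only: the adapter/event device ids and the event port
 configuration are assumptions, not part of this patch):

	uint32_t caps;
	struct rte_event_port_conf ev_port_conf;
	struct rte_event_eth_rx_adapter_queue_conf qconf;

	memset(&qconf, 0, sizeof(qconf));
	qconf.ev.queue_id = 0; /* target event queue */
	qconf.ev.sched_type = RTE_SCHED_TYPE_ATOMIC;

	/* Reports CAP_INTERNAL_PORT for net_cn9k/net_cn10k ethdevs. */
	rte_event_eth_rx_adapter_caps_get(evdev_id, eth_port_id, &caps);
	rte_event_port_default_conf_get(evdev_id, 0, &ev_port_conf);
	rte_event_eth_rx_adapter_create(adptr_id, evdev_id, &ev_port_conf);
	/* rx_queue_id of -1 connects all Rx queues of the ethdev. */
	rte_event_eth_rx_adapter_queue_add(adptr_id, eth_port_id, -1, &qconf);
	rte_event_eth_rx_adapter_start(adptr_id);
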
 doc/guides/eventdevs/cnxk.rst            |  28 ++++
 doc/guides/rel_notes/release_21_08.rst   |   5 +
 drivers/common/cnxk/roc_nix.h            |   3 +
 drivers/common/cnxk/roc_nix_fc.c         |  78 ++++++++++
 drivers/common/cnxk/roc_nix_priv.h       |   3 +-
 drivers/common/cnxk/version.map          |   1 +
 drivers/event/cnxk/cn10k_eventdev.c      | 107 +++++++++++---
 drivers/event/cnxk/cn10k_worker.c        |   7 +-
 drivers/event/cnxk/cn10k_worker.h        |  32 +++--
 drivers/event/cnxk/cn9k_eventdev.c       |  89 ++++++++++++
 drivers/event/cnxk/cn9k_worker.h         |   4 +
 drivers/event/cnxk/cnxk_eventdev.c       |   2 +
 drivers/event/cnxk/cnxk_eventdev.h       |  43 ++++--
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 176 +++++++++++++++++++++++
 drivers/event/cnxk/meson.build           |   9 +-
 15 files changed, 540 insertions(+), 47 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 36da3800cc..b7e82c1273 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -39,6 +39,10 @@ Features of the OCTEON cnxk SSO PMD are:
   time granularity of 2.5us on CN9K and 1us on CN10K.
 - Up to 256 TIM rings a.k.a event timer adapters.
 - Up to 8 rings traversed in parallel.
+- HW managed packets enqueued from ethdev to eventdev exposed through event eth
+  RX adapter.
+- N:1 ethernet device Rx queue to Event queue mapping.
+- Full Rx offload support defined through ethdev queue configuration.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
@@ -93,6 +97,15 @@ Runtime Config Options
 
     -a 0002:0e:00.0,qos=[1-50-50-50]
 
+- ``Force Rx Back pressure``
+
+   Force Rx back pressure when the same mempool is used across multiple
+   ethernet devices connected to the event device.
+
+   For example::
+
+      -a 0002:0e:00.0,force_rx_bp=1
+
 - ``TIM disable NPA``
 
   By default chunks are allocated from NPA then TIM can automatically free
@@ -160,3 +173,18 @@ Debugging Options
    +---+------------+-------------------------------------------------------+
    | 2 | TIM        | --log-level='pmd\.event\.cnxk\.timer,8'               |
    +---+------------+-------------------------------------------------------+
+
+Limitations
+-----------
+
+Rx adapter support
+~~~~~~~~~~~~~~~~~~
+
+Using the same mempool for all the ethernet device ports connected to the
+event device causes back pressure to be asserted only on the first
+ethernet device.
+For this reason, back pressure is automatically disabled whenever the same
+mempool is shared across all the ethernet devices connected to the event
+device; applications can override this behaviour with the ``force_rx_bp=1``
+device argument.
+Using a unique mempool per ethernet device is recommended when they are
+connected to the event device.
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 31e49e1a56..3892c8017a 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -60,6 +60,11 @@ New Features
   * Added net/cnxk driver which provides the support for the integrated ethernet
     device.
 
+* **Added support for Marvell CN10K and CN9K event Rx adapters.**
+
+  * Added Rx adapter support for event/cnxk when the ethernet device requested is
+    net/cnxk.
+
 
 Removed Items
 -------------
diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
index bb69027956..76613fe84e 100644
--- a/drivers/common/cnxk/roc_nix.h
+++ b/drivers/common/cnxk/roc_nix.h
@@ -514,6 +514,9 @@ int __roc_api roc_nix_fc_mode_set(struct roc_nix *roc_nix,
 
 enum roc_nix_fc_mode __roc_api roc_nix_fc_mode_get(struct roc_nix *roc_nix);
 
+void __roc_api roc_nix_fc_npa_bp_cfg(struct roc_nix *roc_nix, uint64_t pool_id,
+				     uint8_t ena, uint8_t force);
+
 /* NPC */
 int __roc_api roc_nix_npc_promisc_ena_dis(struct roc_nix *roc_nix, int enable);
 
diff --git a/drivers/common/cnxk/roc_nix_fc.c b/drivers/common/cnxk/roc_nix_fc.c
index 47be8aa3f8..f17eba4169 100644
--- a/drivers/common/cnxk/roc_nix_fc.c
+++ b/drivers/common/cnxk/roc_nix_fc.c
@@ -249,3 +249,81 @@ roc_nix_fc_mode_set(struct roc_nix *roc_nix, enum roc_nix_fc_mode mode)
 exit:
 	return rc;
 }
+
+void
+roc_nix_fc_npa_bp_cfg(struct roc_nix *roc_nix, uint64_t pool_id, uint8_t ena,
+		      uint8_t force)
+{
+	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
+	struct npa_lf *lf = idev_npa_obj_get();
+	struct npa_aq_enq_req *req;
+	struct npa_aq_enq_rsp *rsp;
+	struct mbox *mbox;
+	uint32_t limit;
+	int rc;
+
+	if (roc_nix_is_sdp(roc_nix))
+		return;
+
+	if (!lf)
+		return;
+	mbox = lf->mbox;
+
+	req = mbox_alloc_msg_npa_aq_enq(mbox);
+	if (req == NULL)
+		return;
+
+	req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+	req->ctype = NPA_AQ_CTYPE_AURA;
+	req->op = NPA_AQ_INSTOP_READ;
+
+	rc = mbox_process_msg(mbox, (void *)&rsp);
+	if (rc)
+		return;
+
+	limit = rsp->aura.limit;
+	/* BP is already enabled. */
+	if (rsp->aura.bp_ena) {
+		/* If BP ids don't match disable BP. */
+		if ((rsp->aura.nix0_bpid != nix->bpid[0]) && !force) {
+			req = mbox_alloc_msg_npa_aq_enq(mbox);
+			if (req == NULL)
+				return;
+
+			req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+			req->ctype = NPA_AQ_CTYPE_AURA;
+			req->op = NPA_AQ_INSTOP_WRITE;
+
+			req->aura.bp_ena = 0;
+			req->aura_mask.bp_ena = ~(req->aura_mask.bp_ena);
+
+			mbox_process(mbox);
+		}
+		return;
+	}
+
+	/* BP was previously enabled but is now disabled; skip. */
+	if (rsp->aura.bp)
+		return;
+
+	req = mbox_alloc_msg_npa_aq_enq(mbox);
+	if (req == NULL)
+		return;
+
+	req->aura_id = roc_npa_aura_handle_to_aura(pool_id);
+	req->ctype = NPA_AQ_CTYPE_AURA;
+	req->op = NPA_AQ_INSTOP_WRITE;
+
+	if (ena) {
+		req->aura.nix0_bpid = nix->bpid[0];
+		req->aura_mask.nix0_bpid = ~(req->aura_mask.nix0_bpid);
+		req->aura.bp = NIX_RQ_AURA_THRESH(
+			limit > 128 ? 256 : limit); /* 95% of size */
+		req->aura_mask.bp = ~(req->aura_mask.bp);
+	}
+
+	req->aura.bp_ena = !!ena;
+	req->aura_mask.bp_ena = ~(req->aura_mask.bp_ena);
+
+	mbox_process(mbox);
+}
diff --git a/drivers/common/cnxk/roc_nix_priv.h b/drivers/common/cnxk/roc_nix_priv.h
index d9c32df442..9dc0c88a6f 100644
--- a/drivers/common/cnxk/roc_nix_priv.h
+++ b/drivers/common/cnxk/roc_nix_priv.h
@@ -16,7 +16,8 @@
 #define NIX_SQB_LOWER_THRESH ((uint16_t)70)
 
 /* Apply BP/DROP when CQ is 95% full */
-#define NIX_CQ_THRESH_LEVEL (5 * 256 / 100)
+#define NIX_CQ_THRESH_LEVEL	(5 * 256 / 100)
+#define NIX_RQ_AURA_THRESH(x)	(((x) * 95) / 100)
 
 /* IRQ triggered when NIX_LF_CINTX_CNT[QCOUNT] crosses this value */
 #define CQ_CQE_THRESH_DEFAULT	0x1ULL
diff --git a/drivers/common/cnxk/version.map b/drivers/common/cnxk/version.map
index 8a5c839e57..cb1ce4b6fc 100644
--- a/drivers/common/cnxk/version.map
+++ b/drivers/common/cnxk/version.map
@@ -29,6 +29,7 @@ INTERNAL {
 	roc_nix_fc_config_set;
 	roc_nix_fc_mode_set;
 	roc_nix_fc_mode_get;
+	roc_nix_fc_npa_bp_cfg;
 	roc_nix_get_base_chan;
 	roc_nix_get_pf;
 	roc_nix_get_pf_func;
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index bf4052c76c..2060c8fe84 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -6,18 +6,6 @@
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
-static void
-cn10k_init_hws_ops(struct cn10k_sso_hws *ws, uintptr_t base)
-{
-	ws->tag_wqe_op = base + SSOW_LF_GWS_WQE0;
-	ws->getwrk_op = base + SSOW_LF_GWS_OP_GET_WORK0;
-	ws->updt_wqe_op = base + SSOW_LF_GWS_OP_UPD_WQP_GRP1;
-	ws->swtag_norm_op = base + SSOW_LF_GWS_OP_SWTAG_NORM;
-	ws->swtag_untag_op = base + SSOW_LF_GWS_OP_SWTAG_UNTAG;
-	ws->swtag_flush_op = base + SSOW_LF_GWS_OP_SWTAG_FLUSH;
-	ws->swtag_desched_op = base + SSOW_LF_GWS_OP_SWTAG_DESCHED;
-}
-
 static uint32_t
 cn10k_sso_gw_mode_wdata(struct cnxk_sso_evdev *dev)
 {
@@ -56,7 +44,6 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
-	cn10k_init_hws_ops(ws, ws->base);
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -135,13 +122,14 @@ cn10k_sso_hws_flush_events(void *hws, uint8_t queue_id, uintptr_t base,
 	cq_ds_cnt &= 0x3FFF3FFF0000;
 
 	while (aq_cnt || cq_ds_cnt || ds_cnt) {
-		plt_write64(req, ws->getwrk_op);
+		plt_write64(req, ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 		cn10k_sso_hws_get_work_empty(ws, &ev);
 		if (fn != NULL && ev.u64 != 0)
 			fn(arg, ev);
 		if (ev.sched_type != SSO_TT_EMPTY)
-			cnxk_sso_hws_swtag_flush(ws->tag_wqe_op,
-						 ws->swtag_flush_op);
+			cnxk_sso_hws_swtag_flush(
+				ws->base + SSOW_LF_GWS_WQE0,
+				ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 		do {
 			val = plt_read64(ws->base + SSOW_LF_GWS_PENDSTATE);
 		} while (val & BIT_ULL(56));
@@ -205,9 +193,11 @@ cn10k_sso_hws_reset(void *arg, void *hws)
 
 	if (CNXK_TT_FROM_TAG(plt_read64(base + SSOW_LF_GWS_PRF_WQE0)) !=
 	    SSO_TT_EMPTY) {
-		plt_write64(BIT_ULL(16) | 1, ws->getwrk_op);
+		plt_write64(BIT_ULL(16) | 1,
+			    ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 		do {
-			roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+			roc_load_pair(gw.u64[0], gw.u64[1],
+				      ws->base + SSOW_LF_GWS_WQE0);
 		} while (gw.u64[0] & BIT_ULL(63));
 		pend_tt = CNXK_TT_FROM_TAG(plt_read64(base + SSOW_LF_GWS_WQE0));
 		if (pend_tt != SSO_TT_EMPTY) { /* Work was pending */
@@ -407,6 +397,80 @@ cn10k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn10k));
 }
 
+static int
+cn10k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn10k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem,
+		       void *tstmp_info)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		ws->lookup_mem = lookup_mem;
+		ws->tstamp = tstmp_info;
+	}
+}
+
+static int
+cn10k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cn10k_eth_rxq *rxq;
+	void *lookup_mem;
+	void *tstmp_info;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 8);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+	rxq = eth_dev->data->rx_queues[0];
+	lookup_mem = rxq->lookup_mem;
+	tstmp_info = rxq->tstamp;
+	cn10k_sso_set_priv_mem(event_dev, lookup_mem, tstmp_info);
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 8);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -420,6 +484,12 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.port_unlink = cn10k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,
 
+	.eth_rx_adapter_caps_get = cn10k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn10k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn10k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
@@ -502,6 +572,7 @@ RTE_PMD_REGISTER_PCI_TABLE(event_cn10k, cn10k_pci_sso_map);
 RTE_PMD_REGISTER_KMOD_DEP(event_cn10k, "vfio-pci");
 RTE_PMD_REGISTER_PARAM_STRING(event_cn10k, CNXK_SSO_XAE_CNT "=<int>"
 			      CNXK_SSO_GGRP_QOS "=<string>"
+			      CNXK_SSO_FORCE_BP "=1"
 			      CN10K_SSO_GW_MODE "=<int>"
 			      CNXK_TIM_DISABLE_NPA "=1"
 			      CNXK_TIM_CHNK_SLOTS "=<int>"
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index e2aa534c64..5dbae275ba 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -18,7 +18,8 @@ cn10k_sso_hws_enq(void *port, const struct rte_event *ev)
 		cn10k_sso_hws_forward_event(ws, ev);
 		break;
 	case RTE_EVENT_OP_RELEASE:
-		cnxk_sso_hws_swtag_flush(ws->tag_wqe_op, ws->swtag_flush_op);
+		cnxk_sso_hws_swtag_flush(ws->base + SSOW_LF_GWS_WQE0,
+					 ws->base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
 		break;
 	default:
 		return 0;
@@ -69,7 +70,7 @@ cn10k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
 
 	if (ws->swtag_req) {
 		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
+		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
 		return 1;
 	}
 
@@ -94,7 +95,7 @@ cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
 
 	if (ws->swtag_req) {
 		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_wqe_op);
+		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
 		return ret;
 	}
 
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 2f093a8dd5..c7250bf9e7 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN10K_WORKER_H__
 #define __CN10K_WORKER_H__
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
 /* SSO Operations */
 
 static __rte_always_inline uint8_t
@@ -31,7 +35,8 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 {
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
-	const uint8_t cur_tt = CNXK_TT_FROM_TAG(plt_read64(ws->tag_wqe_op));
+	const uint8_t cur_tt =
+		CNXK_TT_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0));
 
 	/* CNXK model
 	 * cur_tt/new_tt     SSO_TT_ORDERED SSO_TT_ATOMIC SSO_TT_UNTAGGED
@@ -43,9 +48,11 @@ cn10k_sso_hws_fwd_swtag(struct cn10k_sso_hws *ws, const struct rte_event *ev)
 
 	if (new_tt == SSO_TT_UNTAGGED) {
 		if (cur_tt != SSO_TT_UNTAGGED)
-			cnxk_sso_hws_swtag_untag(ws->swtag_untag_op);
+			cnxk_sso_hws_swtag_untag(ws->base +
+						 SSOW_LF_GWS_OP_SWTAG_UNTAG);
 	} else {
-		cnxk_sso_hws_swtag_norm(tag, new_tt, ws->swtag_norm_op);
+		cnxk_sso_hws_swtag_norm(tag, new_tt,
+					ws->base + SSOW_LF_GWS_OP_SWTAG_NORM);
 	}
 	ws->swtag_req = 1;
 }
@@ -57,8 +64,9 @@ cn10k_sso_hws_fwd_group(struct cn10k_sso_hws *ws, const struct rte_event *ev,
 	const uint32_t tag = (uint32_t)ev->event;
 	const uint8_t new_tt = ev->sched_type;
 
-	plt_write64(ev->u64, ws->updt_wqe_op);
-	cnxk_sso_hws_swtag_desched(tag, new_tt, grp, ws->swtag_desched_op);
+	plt_write64(ev->u64, ws->base + SSOW_LF_GWS_OP_UPD_WQP_GRP1);
+	cnxk_sso_hws_swtag_desched(tag, new_tt, grp,
+				   ws->base + SSOW_LF_GWS_OP_SWTAG_DESCHED);
 }
 
 static __rte_always_inline void
@@ -68,7 +76,7 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 	const uint8_t grp = ev->queue_id;
 
 	/* Group hasn't changed, Use SWTAG to forward the event */
-	if (CNXK_GRP_FROM_TAG(plt_read64(ws->tag_wqe_op)) == grp)
+	if (CNXK_GRP_FROM_TAG(plt_read64(ws->base + SSOW_LF_GWS_WQE0)) == grp)
 		cn10k_sso_hws_fwd_swtag(ws, ev);
 	else
 		/*
@@ -93,12 +101,13 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		PLT_CPU_FEATURE_PREAMBLE
 		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
 		: [wdata] "+r"(gw.get_work)
-		: [gw_loc] "r"(ws->getwrk_op)
+		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
-	plt_write64(gw.u64[0], ws->getwrk_op);
+	plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
 	do {
-		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+		roc_load_pair(gw.u64[0], gw.u64[1],
+			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
 #endif
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
@@ -130,11 +139,12 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		     "		tbnz %[tag], 63, rty%=			\n"
 		     "done%=:	dmb ld					\n"
 		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
-		     : [tag_loc] "r"(ws->tag_wqe_op)
+		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
 #else
 	do {
-		roc_load_pair(gw.u64[0], gw.u64[1], ws->tag_wqe_op);
+		roc_load_pair(gw.u64[0], gw.u64[1],
+			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
 #endif
 
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 0684417eab..072800c243 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -481,6 +481,88 @@ cn9k_sso_selftest(void)
 	return cnxk_sso_selftest(RTE_STR(event_cn9k));
 }
 
+static int
+cn9k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int rc;
+
+	RTE_SET_USED(event_dev);
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 9);
+	if (rc)
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_SW_CAP;
+	else
+		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+
+	return 0;
+}
+
+static void
+cn9k_sso_set_priv_mem(const struct rte_eventdev *event_dev, void *lookup_mem,
+		      void *tstmp_info)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			dws->lookup_mem = lookup_mem;
+			dws->tstamp = tstmp_info;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			ws->lookup_mem = lookup_mem;
+			ws->tstamp = tstmp_info;
+		}
+	}
+}
+
+static int
+cn9k_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cn9k_eth_rxq *rxq;
+	void *lookup_mem;
+	void *tstmp_info;
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	rc = cnxk_sso_rx_adapter_queue_add(event_dev, eth_dev, rx_queue_id,
+					   queue_conf);
+	if (rc)
+		return -EINVAL;
+
+	rxq = eth_dev->data->rx_queues[0];
+	lookup_mem = rxq->lookup_mem;
+	tstmp_info = rxq->tstamp;
+	cn9k_sso_set_priv_mem(event_dev, lookup_mem, tstmp_info);
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	int rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (rc)
+		return -EINVAL;
+
+	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -494,6 +576,12 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.port_unlink = cn9k_sso_port_unlink,
 	.timeout_ticks = cnxk_sso_timeout_ticks,
 
+	.eth_rx_adapter_caps_get = cn9k_sso_rx_adapter_caps_get,
+	.eth_rx_adapter_queue_add = cn9k_sso_rx_adapter_queue_add,
+	.eth_rx_adapter_queue_del = cn9k_sso_rx_adapter_queue_del,
+	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
+	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
@@ -571,6 +659,7 @@ RTE_PMD_REGISTER_PCI_TABLE(event_cn9k, cn9k_pci_sso_map);
 RTE_PMD_REGISTER_KMOD_DEP(event_cn9k, "vfio-pci");
 RTE_PMD_REGISTER_PARAM_STRING(event_cn9k, CNXK_SSO_XAE_CNT "=<int>"
 			      CNXK_SSO_GGRP_QOS "=<string>"
+			      CNXK_SSO_FORCE_BP "=1"
 			      CN9K_SSO_SINGLE_WS "=1"
 			      CNXK_TIM_DISABLE_NPA "=1"
 			      CNXK_TIM_CHNK_SLOTS "=<int>"
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 38fca08fb6..f5a4401465 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -5,9 +5,13 @@
 #ifndef __CN9K_WORKER_H__
 #define __CN9K_WORKER_H__
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
 
+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
 /* SSO Operations */
 
 static __rte_always_inline uint8_t
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index 7189ee3a79..cfd7fb971c 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -571,6 +571,8 @@ cnxk_sso_parse_devargs(struct cnxk_sso_evdev *dev, struct rte_devargs *devargs)
 			   &dev->xae_cnt);
 	rte_kvargs_process(kvlist, CNXK_SSO_GGRP_QOS, &parse_sso_kvargs_dict,
 			   dev);
+	rte_kvargs_process(kvlist, CNXK_SSO_FORCE_BP, &parse_kvargs_value,
+			   &dev->force_ena_bp);
 	rte_kvargs_process(kvlist, CN9K_SSO_SINGLE_WS, &parse_kvargs_value,
 			   &single_ws);
 	rte_kvargs_process(kvlist, CN10K_SSO_GW_MODE, &parse_kvargs_value,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 668e51d62a..b65d725f55 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -6,6 +6,8 @@
 #define __CNXK_EVENTDEV_H__
 
 #include <rte_devargs.h>
+#include <rte_ethdev.h>
+#include <rte_event_eth_rx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -18,6 +20,7 @@
 
 #define CNXK_SSO_XAE_CNT   "xae_cnt"
 #define CNXK_SSO_GGRP_QOS  "qos"
+#define CNXK_SSO_FORCE_BP  "force_rx_bp"
 #define CN9K_SSO_SINGLE_WS "single_ws"
 #define CN10K_SSO_GW_MODE  "gw_mode"
 
@@ -81,7 +84,10 @@ struct cnxk_sso_evdev {
 	uint64_t nb_xaq_cfg;
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
+	uint64_t rx_offloads;
 	uint64_t adptr_xae_cnt;
+	uint16_t rx_adptr_pool_cnt;
+	uint64_t *rx_adptr_pools;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -89,25 +95,18 @@ struct cnxk_sso_evdev {
 	uint32_t xae_cnt;
 	uint8_t qos_queue_cnt;
 	struct cnxk_sso_qos *qos_parse_data;
+	uint8_t force_ena_bp;
 	/* CN9K */
 	uint8_t dual_ws;
 	/* CN10K */
 	uint8_t gw_mode;
 } __rte_cache_aligned;
 
-/* CN10K HWS ops */
-#define CN10K_SSO_HWS_OPS                                                      \
-	uintptr_t swtag_desched_op;                                            \
-	uintptr_t swtag_flush_op;                                              \
-	uintptr_t swtag_untag_op;                                              \
-	uintptr_t swtag_norm_op;                                               \
-	uintptr_t updt_wqe_op;                                                 \
-	uintptr_t tag_wqe_op;                                                  \
-	uintptr_t getwrk_op
-
 struct cn10k_sso_hws {
-	/* Get Work Fastpath data */
-	CN10K_SSO_HWS_OPS;
+	uint64_t base;
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint32_t gw_wdata;
 	uint8_t swtag_req;
 	uint8_t hws_id;
@@ -115,7 +114,6 @@ struct cn10k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
 	uintptr_t lmt_base;
 } __rte_cache_aligned;
 
@@ -132,6 +130,9 @@ struct cn10k_sso_hws {
 struct cn9k_sso_hws {
 	/* Get Work Fastpath data */
 	CN9K_SSO_HWS_OPS;
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t hws_id;
 	/* Add Work Fastpath data */
@@ -148,6 +149,9 @@ struct cn9k_sso_hws_state {
 struct cn9k_sso_hws_dual {
 	/* Get Work Fastpath data */
 	struct cn9k_sso_hws_state ws_state[2]; /* Ping and Pong */
+	/* PTP timestamp */
+	struct cnxk_timesync_info *tstamp;
+	void *lookup_mem;
 	uint8_t swtag_req;
 	uint8_t vws; /* Ping pong bit */
 	uint8_t hws_id;
@@ -250,4 +254,17 @@ int cnxk_sso_xstats_reset(struct rte_eventdev *event_dev,
 /* CN9K */
 void cn9k_sso_set_rsrc(void *arg);
 
+/* Common adapter ops */
+int cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf);
+int cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t rx_queue_id);
+int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev);
+int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			     const struct rte_eth_dev *eth_dev);
+
 #endif /* __CNXK_EVENTDEV_H__ */
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 89a1d82c14..24bfd985e7 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -2,6 +2,7 @@
  * Copyright(C) 2021 Marvell.
  */
 
+#include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 
 void
@@ -11,6 +12,32 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 	int i;
 
 	switch (event_type) {
+	case RTE_EVENT_TYPE_ETHDEV: {
+		struct cnxk_eth_rxq_sp *rxq = data;
+		uint64_t *old_ptr;
+
+		for (i = 0; i < dev->rx_adptr_pool_cnt; i++) {
+			if ((uint64_t)rxq->qconf.mp == dev->rx_adptr_pools[i])
+				return;
+		}
+
+		dev->rx_adptr_pool_cnt++;
+		old_ptr = dev->rx_adptr_pools;
+		dev->rx_adptr_pools = rte_realloc(
+			dev->rx_adptr_pools,
+			sizeof(uint64_t) * dev->rx_adptr_pool_cnt, 0);
+		if (dev->rx_adptr_pools == NULL) {
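+			/* Realloc failed: still account for the pool's
+			 * XAE count, but drop it from the dedup list.
+			 */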
+			dev->adptr_xae_cnt += rxq->qconf.mp->size;
+			dev->rx_adptr_pools = old_ptr;
+			dev->rx_adptr_pool_cnt--;
+			return;
+		}
+		dev->rx_adptr_pools[dev->rx_adptr_pool_cnt - 1] =
+			(uint64_t)rxq->qconf.mp;
+
+		dev->adptr_xae_cnt += rxq->qconf.mp->size;
+		break;
+	}
 	case RTE_EVENT_TYPE_TIMER: {
 		struct cnxk_tim_ring *timr = data;
 		uint16_t *old_ring_ptr;
@@ -65,3 +92,152 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		break;
 	}
 }
+
+static int
+cnxk_sso_rxq_enable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id,
+		    uint16_t port_id, const struct rte_event *ev,
+		    uint8_t custom_flowid)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 1;
+	rq->tt = ev->sched_type;
+	rq->hwgrp = ev->queue_id;
+	rq->flow_tag_width = 20;
+	rq->wqe_skip = 1;
+	rq->tag_mask = (port_id & 0xF) << 20;
+	rq->tag_mask |= (((port_id >> 4) & 0xF) | (RTE_EVENT_TYPE_ETHDEV << 4))
+			<< 24;
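+	/* Resulting tag layout: [31:28] RTE_EVENT_TYPE_ETHDEV,
+	 * [27:20] ethdev port id, [19:0] HW flow tag.
+	 */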
+
+	if (custom_flowid) {
+		rq->flow_tag_width = 0;
+		rq->tag_mask |= ev->flow_id;
+	}
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+static int
+cnxk_sso_rxq_disable(struct cnxk_eth_dev *cnxk_eth_dev, uint16_t rq_id)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+	rq->sso_ena = 0;
+	rq->flow_tag_width = 32;
+	rq->tag_mask = 0;
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+int
+cnxk_sso_rx_adapter_queue_add(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_queue_conf *queue_conf)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t port = eth_dev->data->port_id;
+	struct cnxk_eth_rxq_sp *rxq_sp;
+	int i, rc = 0;
+
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			rxq_sp = eth_dev->data->rx_queues[i];
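+			/* cnxk_eth_rxq_sp sits just before the fastpath
+			 * rxq, hence the step back below.
+			 */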
+			rxq_sp = rxq_sp - 1;
+			cnxk_sso_updt_xae_cnt(dev, rxq_sp,
+					      RTE_EVENT_TYPE_ETHDEV);
+			rc = cnxk_sso_xae_reconfigure(
+				(struct rte_eventdev *)(uintptr_t)event_dev);
+			rc |= cnxk_sso_rxq_enable(
+				cnxk_eth_dev, i, port, &queue_conf->ev,
+				!!(queue_conf->rx_queue_flags &
+				RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+			roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+					      rxq_sp->qconf.mp->pool_id, true,
+					      dev->force_ena_bp);
+		}
+	} else {
+		rxq_sp = eth_dev->data->rx_queues[rx_queue_id];
+		rxq_sp = rxq_sp - 1;
+		cnxk_sso_updt_xae_cnt(dev, rxq_sp, RTE_EVENT_TYPE_ETHDEV);
+		rc = cnxk_sso_xae_reconfigure(
+			(struct rte_eventdev *)(uintptr_t)event_dev);
+		rc |= cnxk_sso_rxq_enable(
+			cnxk_eth_dev, (uint16_t)rx_queue_id, port,
+			&queue_conf->ev,
+			!!(queue_conf->rx_queue_flags &
+			   RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID));
+		roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+				      rxq_sp->qconf.mp->pool_id, true,
+				      dev->force_ena_bp);
+	}
+
+	if (rc < 0) {
+		plt_err("Failed to configure Rx adapter port=%d, q=%d", port,
+			queue_conf->ev.queue_id);
+		return rc;
+	}
+
+	dev->rx_offloads |= cnxk_eth_dev->rx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t rx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	struct cnxk_eth_rxq_sp *rxq_sp;
+	int i, rc = 0;
+
+	RTE_SET_USED(event_dev);
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			rxq_sp = eth_dev->data->rx_queues[i];
+			rxq_sp = rxq_sp - 1;
+			rc = cnxk_sso_rxq_disable(cnxk_eth_dev, i);
+			roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+					      rxq_sp->qconf.mp->pool_id, false,
+					      dev->force_ena_bp);
+		}
+	} else {
+		rxq_sp = eth_dev->data->rx_queues[rx_queue_id];
+		rxq_sp = rxq_sp - 1;
+		rc = cnxk_sso_rxq_disable(cnxk_eth_dev, (uint16_t)rx_queue_id);
+		roc_nix_fc_npa_bp_cfg(&cnxk_eth_dev->nix,
+				      rxq_sp->qconf.mp->pool_id, false,
+				      dev->force_ena_bp);
+	}
+
+	if (rc < 0)
+		plt_err("Failed to clear Rx adapter config port=%d, q=%d",
+			eth_dev->data->port_id, rx_queue_id);
+
+	return rc;
+}
+
+int
+cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
+			  const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
+
+int
+cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
+			 const struct rte_eth_dev *eth_dev)
+{
+	RTE_SET_USED(event_dev);
+	RTE_SET_USED(eth_dev);
+
+	return 0;
+}
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index 87bb9f76a9..eda562f5b5 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -21,4 +21,11 @@ sources = files(
         'cnxk_tim_worker.c',
 )
 
-deps += ['bus_pci', 'common_cnxk']
+extra_flags = ['-flax-vector-conversions', '-Wno-strict-aliasing']
+foreach flag: extra_flags
+    if cc.has_argument(flag)
+        cflags += flag
+    endif
+endforeach
+
+deps += ['bus_pci', 'common_cnxk', 'net_cnxk']
-- 
2.17.1
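
Following the Limitations section above, a sketch of the recommended
one-mempool-per-ethdev setup (pool size, cache size and error handling are
illustrative assumptions):

	char name[RTE_MEMPOOL_NAMESIZE];
	struct rte_mempool *mp;
	uint16_t port;

	RTE_ETH_FOREACH_DEV(port) {
		snprintf(name, sizeof(name), "rx_pool_p%u", port);
		/* A unique pool per port lets NIX backpressure map 1:1 to
		 * an NPA aura instead of landing on the first port's aura.
		 */
		mp = rte_pktmbuf_pool_create(name, 8192, 256, 0,
					     RTE_MBUF_DEFAULT_BUF_SIZE,
					     rte_socket_id());
		if (mp == NULL)
			rte_exit(EXIT_FAILURE, "mempool create failed\n");
		/* Pass mp to rte_eth_rx_queue_setup() for this port. */
	}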


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 08/13] event/cnxk: add Rx adapter fastpath ops
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (5 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 07/13] event/cnxk: add Rx adapter support pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 09/13] event/cnxk: add Tx adapter support pbhagavatula
                       ` (7 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Rx adapter fastpath operations.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
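 The dequeue specialization below mirrors the net/cnxk Rx burst scheme:
 each NIX_RX_OFFLOAD_* flag adds one [0/1] dimension to a table of
 pre-instantiated functions, so the selected fastpath sees compile-time
 constant flags and avoids per-packet branching. Conceptually, reduced to
 two flags (function names here are hypothetical, not the driver's):

	typedef uint16_t (*deq_fn_t)(void *port, struct rte_event *ev,
				     uint64_t timeout_ticks);

	static const deq_fn_t deq_fns[2][2] = {
		[0][0] = deq_plain, [0][1] = deq_rss,
		[1][0] = deq_csum,  [1][1] = deq_csum_rss,
	};

	event_dev->dequeue =
		deq_fns[!!(rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
		       [!!(rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
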
 drivers/event/cnxk/cn10k_eventdev.c           | 136 +++++++-
 drivers/event/cnxk/cn10k_worker.c             |  54 ----
 drivers/event/cnxk/cn10k_worker.h             |  97 +++++-
 drivers/event/cnxk/cn10k_worker_deq.c         |  44 +++
 drivers/event/cnxk/cn10k_worker_deq_burst.c   |  29 ++
 drivers/event/cnxk/cn10k_worker_deq_tmo.c     |  72 +++++
 drivers/event/cnxk/cn9k_eventdev.c            | 305 +++++++++++++++++-
 drivers/event/cnxk/cn9k_worker.c              | 117 -------
 drivers/event/cnxk/cn9k_worker.h              | 174 ++++++++--
 drivers/event/cnxk/cn9k_worker_deq.c          |  44 +++
 drivers/event/cnxk/cn9k_worker_deq_burst.c    |  29 ++
 drivers/event/cnxk/cn9k_worker_deq_tmo.c      |  72 +++++
 drivers/event/cnxk/cn9k_worker_dual_deq.c     |  53 +++
 .../event/cnxk/cn9k_worker_dual_deq_burst.c   |  30 ++
 drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c |  89 +++++
 drivers/event/cnxk/cnxk_eventdev.h            |   1 +
 drivers/event/cnxk/meson.build                |   9 +
 17 files changed, 1124 insertions(+), 231 deletions(-)
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_deq_tmo.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_deq_tmo.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 2060c8fe84..ba7d95fff7 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -237,17 +237,141 @@ static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	const event_dequeue_t sso_hws_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_tmo_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_tmo_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_tmo_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
 
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn10k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn10k_sso_hws_deq;
-	event_dev->dequeue_burst = cn10k_sso_hws_deq_burst;
-	if (dev->is_timeout_deq) {
-		event_dev->dequeue = cn10k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn10k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_tmo_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_tmo_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 }
 
diff --git a/drivers/event/cnxk/cn10k_worker.c b/drivers/event/cnxk/cn10k_worker.c
index 5dbae275ba..c71aa37327 100644
--- a/drivers/event/cnxk/cn10k_worker.c
+++ b/drivers/event/cnxk/cn10k_worker.c
@@ -60,57 +60,3 @@ cn10k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 	return 1;
 }
-
-uint16_t __rte_hot
-cn10k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
-		return 1;
-	}
-
-	return cn10k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-			uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn10k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
-		return ret;
-	}
-
-	ret = cn10k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn10k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn10k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn10k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index c7250bf9e7..b724083caa 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -87,20 +87,37 @@ cn10k_sso_hws_forward_event(struct cn10k_sso_hws *ws,
 		cn10k_sso_hws_fwd_group(ws, ev, grp);
 }
 
+static __rte_always_inline void
+cn10k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		  const uint32_t tag, const uint32_t flags,
+		  const void *const lookup_mem)
+{
+	const uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+				   (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
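+	/* mbuf rearm_data: [15:0] data_off (headroom, +8 when a timestamp
+	 * is prepended), [31:16] refcnt = 1, [47:32] nb_segs = 1; port_id
+	 * is ORed into [63:48] below.
+	 */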
+
+	cn10k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			      (struct rte_mbuf *)mbuf, lookup_mem,
+			      mbuf_init | ((uint64_t)port_id) << 48, flags);
+}
+
 static __rte_always_inline uint16_t
-cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
+cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       const uint32_t flags, void *lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
 	gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64) && !defined(__clang__)
 	asm volatile(
 		PLT_CPU_FEATURE_PREAMBLE
 		"caspl %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-		: [wdata] "+r"(gw.get_work)
+		"sub %[mbuf], %H[wdata], #0x80				\n"
+		: [wdata] "+r"(gw.get_work), [mbuf] "=&r"(mbuf)
 		: [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
 		: "memory");
 #else
@@ -109,11 +126,34 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
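+	/* For ethdev events the WQE sits right after the mbuf header, so
+	 * the mbuf is at wqe - sizeof(struct rte_mbuf) (128B; the asm path
+	 * above computes the same with "sub ..., #0x80").
+	 */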
 #endif
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, flags,
+					  lookup_mem);
+			/* Extract the tstamp if PTP is enabled */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						ws->tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -128,6 +168,7 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -138,7 +179,9 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		     "		ldp %[tag], %[wqp], [%[tag_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=			\n"
 		     "done%=:	dmb ld					\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80		\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
 		     : "memory");
 #else
@@ -146,12 +189,25 @@ cn10k_sso_hws_get_work_empty(struct cn10k_sso_hws *ws, struct rte_event *ev)
 		roc_load_pair(gw.u64[0], gw.u64[1],
 			      ws->base + SSOW_LF_GWS_WQE0);
 	} while (gw.u64[0] & BIT_ULL(63));
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn10k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					  gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -170,16 +226,29 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 					       const struct rte_event ev[],
 					       uint16_t nb_events);
 
-uint16_t __rte_hot cn10k_sso_hws_deq(void *port, struct rte_event *ev,
-				     uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					   uint16_t nb_events,
-					   uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn10k_sso_hws_tmo_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
diff --git a/drivers/event/cnxk/cn10k_worker_deq.c b/drivers/event/cnxk/cn10k_worker_deq.c
new file mode 100644
index 0000000000..36ec454ccc
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq.c
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_##name(                           \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn10k_sso_hws_get_work(                                 \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn10k_worker_deq_burst.c b/drivers/event/cnxk/cn10k_worker_deq_burst.c
new file mode 100644
index 0000000000..29ecc551cf
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq_burst.c
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_burst_##name(                     \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_##name(port, ev, timeout_ticks);      \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_seg_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);  \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
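
The burst wrappers intentionally ignore nb_events: the SSO workslot returns
one event per GET_WORK operation, so a burst dequeue of any size degenerates
to a single-event dequeue. This is invisible to applications using the
standard eventdev API; a minimal polling loop (dev_id, port_id and process()
are placeholders) looks like:

struct rte_event ev;
uint16_t n;

/* Resolves to cn10k_sso_hws_deq_burst_<mode>() on this PMD. */
n = rte_event_dequeue_burst(dev_id, port_id, &ev, 1, timeout_ticks);
if (n)
	process(&ev);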
diff --git a/drivers/event/cnxk/cn10k_worker_deq_tmo.c b/drivers/event/cnxk/cn10k_worker_deq_tmo.c
new file mode 100644
index 0000000000..c8524a27bd
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_deq_tmo.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_tmo_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);  \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn10k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn10k_sso_hws_get_work(ws, ev, flags,            \
+						     ws->lookup_mem);          \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn10k_sso_hws_deq_tmo_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn10k_sso_hws_deq_tmo_seg_##name(port, ev,              \
+							timeout_ticks);        \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
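
Note the timeout semantics: timeout_ticks is consumed as a retry budget
rather than wall-clock time, i.e. GET_WORK is simply reissued until work
arrives or the budget is spent. Stripped of the offload machinery, the
retry pattern is:

/* Minimal sketch of the deq_tmo loop; get_work() stands in for
 * cn10k_sso_hws_get_work() with the mode's flags baked in.
 */
uint16_t ret;
uint64_t iter;

ret = get_work(ws, ev);				/* first attempt */
for (iter = 1; iter < timeout_ticks && ret == 0; iter++)
	ret = get_work(ws, ev);			/* poll until budget spent */
return ret;					/* 0 on timeout, 1 on success */

The nanosecond-to-tick conversion happens separately, through the eventdev
timeout_ticks conversion op.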
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index 072800c243..e386cb784a 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -252,17 +252,202 @@ static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	/* Single WS modes */
+	const event_dequeue_t sso_hws_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_tmo[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_tmo_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_seg_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_deq_tmo_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_deq_tmo_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	/* Dual WS modes */
+	const event_dequeue_t sso_hws_dual_deq[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t sso_hws_dual_deq_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_burst_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_t sso_hws_dual_deq_tmo[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_tmo_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	const event_dequeue_t sso_hws_dual_deq_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
+
+	const event_dequeue_t sso_hws_dual_deq_tmo_seg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_seg_##name,
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};
+
+	const event_dequeue_burst_t
+		sso_hws_dual_deq_tmo_seg_burst[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_deq_tmo_seg_burst_##name,
+			NIX_RX_FASTPATH_MODES
+#undef R
+		};
 
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
 	event_dev->enqueue_forward_burst = cn9k_sso_hws_enq_fwd_burst;
-
-	event_dev->dequeue = cn9k_sso_hws_deq;
-	event_dev->dequeue_burst = cn9k_sso_hws_deq_burst;
-	if (dev->deq_tmo_ns) {
-		event_dev->dequeue = cn9k_sso_hws_tmo_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_tmo_deq_burst;
+	if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+		event_dev->dequeue = sso_hws_deq_seg
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_seg_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_deq_tmo_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_deq_tmo_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
+	} else {
+		event_dev->dequeue = sso_hws_deq
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		event_dev->dequeue_burst = sso_hws_deq_burst
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+			[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		if (dev->is_timeout_deq) {
+			event_dev->dequeue = sso_hws_deq_tmo
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_deq_tmo_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+		}
 	}
 
 	if (dev->dual_ws) {
@@ -272,14 +457,110 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		event_dev->enqueue_forward_burst =
 			cn9k_sso_hws_dual_enq_fwd_burst;
 
-		event_dev->dequeue = cn9k_sso_hws_dual_deq;
-		event_dev->dequeue_burst = cn9k_sso_hws_dual_deq_burst;
-		if (dev->deq_tmo_ns) {
-			event_dev->dequeue = cn9k_sso_hws_dual_tmo_deq;
-			event_dev->dequeue_burst =
-				cn9k_sso_hws_dual_tmo_deq_burst;
+		if (dev->rx_offloads & NIX_RX_MULTI_SEG_F) {
+			event_dev->dequeue = sso_hws_dual_deq_seg
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_seg_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_deq_tmo_seg
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_TSTAMP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_deq_tmo_seg_burst
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_TSTAMP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						    NIX_RX_OFFLOAD_RSS_F)];
+			}
+		} else {
+			event_dev->dequeue = sso_hws_dual_deq
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			event_dev->dequeue_burst = sso_hws_dual_deq_burst
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+				[!!(dev->rx_offloads &
+				    NIX_RX_OFFLOAD_CHECKSUM_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
+				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
+			if (dev->is_timeout_deq) {
+				event_dev->dequeue = sso_hws_dual_deq_tmo
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_TSTAMP_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_CHECKSUM_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_PTYPE_F)]
+					[!!(dev->rx_offloads &
+					    NIX_RX_OFFLOAD_RSS_F)];
+				event_dev->dequeue_burst =
+					sso_hws_dual_deq_tmo_burst
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_VLAN_STRIP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_TSTAMP_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_MARK_UPDATE_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_CHECKSUM_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_PTYPE_F)]
+						[!!(dev->rx_offloads &
+						  NIX_RX_OFFLOAD_RSS_F)];
+			}
 		}
 	}
+
+	rte_mb();
 }
 
 static void *
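
The initializers above build one 2^6-entry function-pointer table per
dequeue flavor, one dimension per Rx offload bit, so handler selection is a
pure table lookup. The indexing idiom, written out once as a sketch:

/* Each !! collapses a flag test to 0 or 1; six of them together
 * address exactly one cell of the [2][2][2][2][2][2] table.
 */
event_dequeue_t f =
	sso_hws_deq[!!(rx_offloads & NIX_RX_OFFLOAD_VLAN_STRIP_F)]
		   [!!(rx_offloads & NIX_RX_OFFLOAD_TSTAMP_F)]
		   [!!(rx_offloads & NIX_RX_OFFLOAD_MARK_UPDATE_F)]
		   [!!(rx_offloads & NIX_RX_OFFLOAD_CHECKSUM_F)]
		   [!!(rx_offloads & NIX_RX_OFFLOAD_PTYPE_F)]
		   [!!(rx_offloads & NIX_RX_OFFLOAD_RSS_F)];

The rte_mb() added at the end of the function publishes the freshly written
pointers to worker cores before the device is started.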
diff --git a/drivers/event/cnxk/cn9k_worker.c b/drivers/event/cnxk/cn9k_worker.c
index 9ceacc98dd..538bc4b0b3 100644
--- a/drivers/event/cnxk/cn9k_worker.c
+++ b/drivers/event/cnxk/cn9k_worker.c
@@ -60,60 +60,6 @@ cn9k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
 	return 1;
 }
 
-uint16_t __rte_hot
-cn9k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-
-	RTE_SET_USED(timeout_ticks);
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return 1;
-	}
-
-	return cn9k_sso_hws_get_work(ws, ev);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
-		       uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws *ws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (ws->swtag_req) {
-		ws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(ws->tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_get_work(ws, ev);
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)
-		ret = cn9k_sso_hws_get_work(ws, ev);
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-			   uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_tmo_deq(port, ev, timeout_ticks);
-}
-
 /* Dual ws ops. */
 
 uint16_t __rte_hot
@@ -171,66 +117,3 @@ cn9k_sso_hws_dual_enq_fwd_burst(void *port, const struct rte_event ev[],
 
 	return 1;
 }
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t gw;
-
-	RTE_SET_USED(timeout_ticks);
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return 1;
-	}
-
-	gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					&dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	return gw;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_deq_burst(void *port, struct rte_event ev[],
-			    uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_deq(port, ev, timeout_ticks);
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-			  uint64_t timeout_ticks)
-{
-	struct cn9k_sso_hws_dual *dws = port;
-	uint16_t ret = 1;
-	uint64_t iter;
-
-	if (dws->swtag_req) {
-		dws->swtag_req = 0;
-		cnxk_sso_hws_swtag_wait(dws->ws_state[!dws->vws].tag_op);
-		return ret;
-	}
-
-	ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-					 &dws->ws_state[!dws->vws], ev);
-	dws->vws = !dws->vws;
-	for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {
-		ret = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
-						 &dws->ws_state[!dws->vws], ev);
-		dws->vws = !dws->vws;
-	}
-
-	return ret;
-}
-
-uint16_t __rte_hot
-cn9k_sso_hws_dual_tmo_deq_burst(void *port, struct rte_event ev[],
-				uint16_t nb_events, uint64_t timeout_ticks)
-{
-	RTE_SET_USED(nb_events);
-
-	return cn9k_sso_hws_dual_tmo_deq(port, ev, timeout_ticks);
-}
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index f5a4401465..c01c00e1da 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -128,17 +128,36 @@ cn9k_sso_hws_dual_forward_event(struct cn9k_sso_hws_dual *dws,
 	}
 }
 
+static __rte_always_inline void
+cn9k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
+		 const uint32_t tag, const uint32_t flags,
+		 const void *const lookup_mem)
+{
+	const uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+				   (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
+
+	cn9k_nix_cqe_to_mbuf((struct nix_cqe_hdr_s *)wqe, tag,
+			     (struct rte_mbuf *)mbuf, lookup_mem,
+			     mbuf_init | ((uint64_t)port_id) << 48, flags);
+}
+
 static __rte_always_inline uint16_t
 cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 			   struct cn9k_sso_hws_state *ws_pair,
-			   struct rte_event *ev)
+			   struct rte_event *ev, const uint32_t flags,
+			   const void *const lookup_mem,
+			   struct cnxk_timesync_info *const tstamp)
 {
 	const uint64_t set_gw = BIT_ULL(16) | 1;
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "rty%=:					\n"
@@ -147,7 +166,10 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	str %[gw], [%[pong]]		\n"
 		     "		dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op),
 		       [gw] "r"(set_gw), [pong] "r"(ws_pair->getwrk_op));
 #else
@@ -156,12 +178,34 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 		gw.u64[0] = plt_read64(ws->tag_op);
 	gw.u64[1] = plt_read64(ws->wqp_op);
 	plt_write64(set_gw, ws_pair->getwrk_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			/* Extract the timestamp, if PTP is enabled */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -169,16 +213,22 @@ cn9k_sso_hws_dual_get_work(struct cn9k_sso_hws_state *ws,
 }
 
 static __rte_always_inline uint16_t
-cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
+cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev,
+		      const uint32_t flags, const void *const lookup_mem)
 {
 	union {
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t tstamp_ptr;
+	uint64_t mbuf;
 
 	plt_write64(BIT_ULL(16) | /* wait for work. */
 			    1,	  /* Use Mask set 0. */
 		    ws->getwrk_op);
+
+	if (flags & NIX_RX_OFFLOAD_PTYPE_F)
+		rte_prefetch_non_temporal(lookup_mem);
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
 		     "		ldr %[tag], [%[tag_loc]]	\n"
@@ -190,7 +240,10 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     "		prfm pldl1keep, [%[mbuf]]	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -198,12 +251,35 @@ cn9k_sso_hws_get_work(struct cn9k_sso_hws *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, flags,
+					 lookup_mem);
+			/* Extract the timestamp, if PTP is enabled */
+			tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)
+							    gw.u64[1]) +
+						   CNXK_SSO_WQE_SG_PTR);
+			cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf,
+						ws->tstamp,
+						flags & NIX_RX_OFFLOAD_TSTAMP_F,
+						flags & NIX_RX_MULTI_SEG_F,
+						(uint64_t *)tstamp_ptr);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -218,6 +294,7 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		__uint128_t get_work;
 		uint64_t u64[2];
 	} gw;
+	uint64_t mbuf;
 
 #ifdef RTE_ARCH_ARM64
 	asm volatile(PLT_CPU_FEATURE_PREAMBLE
@@ -230,7 +307,9 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		     "		ldr %[wqp], [%[wqp_loc]]	\n"
 		     "		tbnz %[tag], 63, rty%=		\n"
 		     "done%=:	dmb ld				\n"
-		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+		     "		sub %[mbuf], %[wqp], #0x80	\n"
+		     : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1]),
+		       [mbuf] "=&r"(mbuf)
 		     : [tag_loc] "r"(ws->tag_op), [wqp_loc] "r"(ws->wqp_op));
 #else
 	gw.u64[0] = plt_read64(ws->tag_op);
@@ -238,12 +317,25 @@ cn9k_sso_hws_get_work_empty(struct cn9k_sso_hws_state *ws, struct rte_event *ev)
 		gw.u64[0] = plt_read64(ws->tag_op);
 
 	gw.u64[1] = plt_read64(ws->wqp_op);
+	mbuf = (uint64_t)((char *)gw.u64[1] - sizeof(struct rte_mbuf));
 #endif
 
 	gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
 		    (gw.u64[0] & (0x3FFull << 36)) << 4 |
 		    (gw.u64[0] & 0xffffffff);
 
+	if (CNXK_TT_FROM_EVENT(gw.u64[0]) != SSO_TT_EMPTY) {
+		if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+		    RTE_EVENT_TYPE_ETHDEV) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+
+			gw.u64[0] = CNXK_CLR_SUB_EVENT(gw.u64[0]);
+			cn9k_wqe_to_mbuf(gw.u64[1], mbuf, port,
+					 gw.u64[0] & 0xFFFFF, 0, NULL);
+			gw.u64[1] = mbuf;
+		}
+	}
+
 	ev->event = gw.u64[0];
 	ev->u64 = gw.u64[1];
 
@@ -274,28 +366,54 @@ uint16_t __rte_hot cn9k_sso_hws_dual_enq_fwd_burst(void *port,
 						   const struct rte_event ev[],
 						   uint16_t nb_events);
 
-uint16_t __rte_hot cn9k_sso_hws_deq(void *port, struct rte_event *ev,
-				    uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_deq_burst(void *port, struct rte_event ev[],
-					  uint16_t nb_events,
-					  uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq(void *port, struct rte_event *ev,
-					uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
-					      uint16_t nb_events,
-					      uint64_t timeout_ticks);
-
-uint16_t __rte_hot cn9k_sso_hws_dual_deq(void *port, struct rte_event *ev,
-					 uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst(void *port,
-					       struct rte_event ev[],
-					       uint16_t nb_events,
-					       uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq(void *port, struct rte_event *ev,
-					     uint64_t timeout_ticks);
-uint16_t __rte_hot cn9k_sso_hws_dual_tmo_deq_burst(void *port,
-						   struct rte_event ev[],
-						   uint16_t nb_events,
-						   uint64_t timeout_ticks);
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);                                       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks);     \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks);
+
+NIX_RX_FASTPATH_MODES
+#undef R
 
 #endif
diff --git a/drivers/event/cnxk/cn9k_worker_deq.c b/drivers/event/cnxk/cn9k_worker_deq.c
new file mode 100644
index 0000000000..51ccaf4ec4
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq.c
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_##name(                            \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);   \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return 1;                                              \
+		}                                                              \
+									       \
+		return cn9k_sso_hws_get_work(                                  \
+			ws, ev, flags | NIX_RX_MULTI_SEG_F, ws->lookup_mem);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_deq_burst.c b/drivers/event/cnxk/cn9k_worker_deq_burst.c
new file mode 100644
index 0000000000..4e2801459b
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq_burst.c
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_burst_##name(                      \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_##name(port, ev, timeout_ticks);       \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_seg_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_seg_##name(port, ev, timeout_ticks);   \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_deq_tmo.c b/drivers/event/cnxk/cn9k_worker_deq_tmo.c
new file mode 100644
index 0000000000..9713d1ef00
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_deq_tmo.c
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_##name(                        \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_burst_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_tmo_##name(port, ev, timeout_ticks);   \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_##name(                    \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (ws->swtag_req) {                                           \
+			ws->swtag_req = 0;                                     \
+			cnxk_sso_hws_swtag_wait(ws->tag_op);                   \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_get_work(ws, ev, flags, ws->lookup_mem);    \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++)     \
+			ret = cn9k_sso_hws_get_work(ws, ev, flags,             \
+						    ws->lookup_mem);           \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_deq_tmo_seg_burst_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_deq_tmo_seg_##name(port, ev,               \
+						       timeout_ticks);         \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq.c b/drivers/event/cnxk/cn9k_worker_dual_deq.c
new file mode 100644
index 0000000000..709fa2d9ef
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq.c
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_##name(                       \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+									       \
+		gw = cn9k_sso_hws_dual_get_work(                               \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t gw;                                                   \
+									       \
+		RTE_SET_USED(timeout_ticks);                                   \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return 1;                                              \
+		}                                                              \
+									       \
+		gw = cn9k_sso_hws_dual_get_work(                               \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		return gw;                                                     \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
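
The dual-workslot dequeue hides GET_WORK latency by ping-ponging between the
two hardware workslots: while the result from one slot is consumed, the pair
slot has already been re-armed (the "str %[gw], [%[pong]]" in
cn9k_sso_hws_dual_get_work() above). The toggle itself is just:

/* Sketch of the ping-pong: vws selects the slot read on this call,
 * the other slot is re-armed inside dual_get_work(), then roles flip.
 */
gw = cn9k_sso_hws_dual_get_work(&dws->ws_state[dws->vws],
				&dws->ws_state[!dws->vws], ev, flags,
				dws->lookup_mem, dws->tstamp);
dws->vws = !dws->vws;

The cost is a second workslot per event port, which is why this path is
gated on dev->dual_ws.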
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c b/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
new file mode 100644
index 0000000000..d50e1cf83f
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq_burst.c
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_burst_##name(                 \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_##name(port, ev, timeout_ticks);  \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_seg_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_seg_##name(port, ev,              \
+							timeout_ticks);        \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c b/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c
new file mode 100644
index 0000000000..a0508fdf0d
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_deq_tmo.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+#include "cnxk_eventdev.h"
+#include "cnxk_worker.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_##name(                   \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_dual_get_work(                              \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem, dws->tstamp);                 \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_burst_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_tmo_##name(port, ev,              \
+							timeout_ticks);        \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_##name(               \
+		void *port, struct rte_event *ev, uint64_t timeout_ticks)      \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *dws = port;                          \
+		uint16_t ret = 1;                                              \
+		uint64_t iter;                                                 \
+									       \
+		if (dws->swtag_req) {                                          \
+			dws->swtag_req = 0;                                    \
+			cnxk_sso_hws_swtag_wait(                               \
+				dws->ws_state[!dws->vws].tag_op);              \
+			return ret;                                            \
+		}                                                              \
+									       \
+		ret = cn9k_sso_hws_dual_get_work(                              \
+			&dws->ws_state[dws->vws], &dws->ws_state[!dws->vws],   \
+			ev, flags, dws->lookup_mem, dws->tstamp);              \
+		dws->vws = !dws->vws;                                          \
+		for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) {   \
+			ret = cn9k_sso_hws_dual_get_work(                      \
+				&dws->ws_state[dws->vws],                      \
+				&dws->ws_state[!dws->vws], ev, flags,          \
+				dws->lookup_mem, dws->tstamp);                 \
+			dws->vws = !dws->vws;                                  \
+		}                                                              \
+									       \
+		return ret;                                                    \
+	}                                                                      \
+									       \
+	uint16_t __rte_hot cn9k_sso_hws_dual_deq_tmo_seg_burst_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events,         \
+		uint64_t timeout_ticks)                                        \
+	{                                                                      \
+		RTE_SET_USED(nb_events);                                       \
+									       \
+		return cn9k_sso_hws_dual_deq_tmo_seg_##name(port, ev,          \
+							    timeout_ticks);    \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index b65d725f55..9d5d2d0339 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -33,6 +33,7 @@
 #define CNXK_SSO_MZ_NAME       "cnxk_evdev_mz"
 #define CNXK_SSO_XAQ_CACHE_CNT (0x7)
 #define CNXK_SSO_XAQ_SLACK     (8)
+#define CNXK_SSO_WQE_SG_PTR    (9)
 
 #define CNXK_TT_FROM_TAG(x)	    (((x) >> 32) & SSO_TT_EMPTY)
 #define CNXK_TT_FROM_EVENT(x)	    (((x) >> 38) & SSO_TT_EMPTY)
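
CNXK_SSO_WQE_SG_PTR is the word offset within the WQE at which the
scatter-gather area (and hence the Rx timestamp) lives. The extraction in
cn9k_worker.h relies on struct pointer arithmetic; assuming struct
nix_wqe_hdr_s is a single 64-bit word (as in the roc HW headers), the two
forms below are equivalent:

/* As written in the patch: advance 9 header-sized elements. */
tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)wqe) +
			   CNXK_SSO_WQE_SG_PTR);

/* Equivalent byte arithmetic, assuming sizeof(struct nix_wqe_hdr_s) == 8. */
tstamp_ptr = *(uint64_t *)((uintptr_t)wqe + CNXK_SSO_WQE_SG_PTR * 8);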
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index eda562f5b5..c5c1c0ee8e 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -11,8 +11,17 @@ endif
 sources = files(
         'cn9k_eventdev.c',
         'cn9k_worker.c',
+        'cn9k_worker_deq.c',
+        'cn9k_worker_deq_burst.c',
+        'cn9k_worker_deq_tmo.c',
+        'cn9k_worker_dual_deq.c',
+        'cn9k_worker_dual_deq_burst.c',
+        'cn9k_worker_dual_deq_tmo.c',
         'cn10k_eventdev.c',
         'cn10k_worker.c',
+        'cn10k_worker_deq.c',
+        'cn10k_worker_deq_burst.c',
+        'cn10k_worker_deq_tmo.c',
         'cnxk_eventdev.c',
         'cnxk_eventdev_adptr.c',
         'cnxk_eventdev_selftest.c',
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v3 09/13] event/cnxk: add Tx adapter support
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (6 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 08/13] event/cnxk: add Rx adapter fastpath ops pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 10/13] event/cnxk: add Tx adapter fastpath ops pbhagavatula
                       ` (6 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/eventdevs/cnxk.rst            |   4 +-
 doc/guides/rel_notes/release_21_08.rst   |   6 +-
 drivers/event/cnxk/cn10k_eventdev.c      |  91 ++++++++++++++++++
 drivers/event/cnxk/cn9k_eventdev.c       | 117 +++++++++++++++++++++++
 drivers/event/cnxk/cnxk_eventdev.h       |  21 +++-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 106 ++++++++++++++++++++
 6 files changed, 339 insertions(+), 6 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index b7e82c1273..6fdccc2ab4 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -42,7 +42,9 @@ Features of the OCTEON cnxk SSO PMD are:
 - HW managed packets enqueued from ethdev to eventdev exposed through event eth
   RX adapter.
 - N:1 ethernet device Rx queue to Event queue mapping.
-- Full Rx offload support defined through ethdev queue configuration.
+- Lockfree Tx from event eth Tx adapter using ``DEV_TX_OFFLOAD_MT_LOCKFREE``
+  capability while maintaining receive packet order.
+- Full Rx/Tx offload support defined through ethdev queue configuration.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
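
For applications, the internal-port capability means no service core is
needed: events posted to the adapter are transmitted directly by the PMD. A
minimal setup sketch against the generic Tx adapter API (evdev_id,
eth_port_id and port_conf are placeholders configured elsewhere):

uint32_t caps;
uint8_t id = 0;

rte_event_eth_tx_adapter_caps_get(evdev_id, eth_port_id, &caps);
/* On net/cnxk devices this reports
 * RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT.
 */

rte_event_eth_tx_adapter_create(id, evdev_id, &port_conf);
rte_event_eth_tx_adapter_queue_add(id, eth_port_id, -1); /* -1: all Tx queues */
rte_event_eth_tx_adapter_start(id);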
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 3892c8017a..80ff93269c 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -60,10 +60,10 @@ New Features
   * Added net/cnxk driver which provides the support for the integrated ethernet
     device.
 
-* **Added support for Marvell CN10K, CN9K, event Rx adapter.**
+* **Added support for Marvell CN10K, CN9K, event Rx/Tx adapter.**
 
-  * Added Rx adapter support for event/cnxk when the ethernet device requested is
-    net/cnxk.
+  * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
+    is net/cnxk.
 
 
 Removed Items
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index ba7d95fff7..8a9b04a3db 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -44,6 +44,7 @@ cn10k_sso_init_hws_mem(void *arg, uint8_t port_id)
 	/* First cache line is reserved for cookie */
 	ws = (struct cn10k_sso_hws *)((uint8_t *)ws + RTE_CACHE_LINE_SIZE);
 	ws->base = roc_sso_hws_base_get(&dev->sso, port_id);
+	ws->tx_base = ws->base;
 	ws->hws_id = port_id;
 	ws->swtag_req = 0;
 	ws->gw_wdata = cn10k_sso_gw_mode_wdata(dev);
@@ -233,6 +234,39 @@ cn10k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn10k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		struct cn10k_sso_hws *ws = event_dev->data->ports[i];
+		void *ws_cookie;
+
+		ws_cookie = cnxk_sso_hws_get_cookie(ws);
+		ws_cookie = rte_realloc_socket(
+			ws_cookie,
+			sizeof(struct cnxk_sso_hws_cookie) +
+				sizeof(struct cn10k_sso_hws) +
+				(sizeof(uint64_t) * (dev->max_port_id + 1) *
+				 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+		if (ws_cookie == NULL)
+			return -ENOMEM;
+		ws = RTE_PTR_ADD(ws_cookie, sizeof(struct cnxk_sso_hws_cookie));
+		memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+		       sizeof(uint64_t) * (dev->max_port_id + 1) *
+			       RTE_MAX_QUEUES_PER_PORT);
+		event_dev->data->ports[i] = ws;
+	}
+
+	return 0;
+}
+
 static void
 cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -493,6 +527,10 @@ cn10k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn10k_sso_hws_reset,
 			    cn10k_sso_hws_flush_events);
 	if (rc < 0)
@@ -595,6 +633,55 @@ cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn10k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn10k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn10k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			       const struct rte_eth_dev *eth_dev,
+			       int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn10k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.dev_infos_get = cn10k_sso_info_get,
 	.dev_configure = cn10k_sso_dev_configure,
@@ -614,6 +701,10 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn10k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn10k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn10k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index e386cb784a..bdc5632235 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -248,6 +248,66 @@ cn9k_sso_rsrc_init(void *arg, uint8_t hws, uint8_t hwgrp)
 	return roc_sso_rsrc_init(&dev->sso, hws, hwgrp);
 }
 
+static int
+cn9k_sso_updt_tx_adptr_data(const struct rte_eventdev *event_dev)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	int i;
+
+	if (dev->tx_adptr_data == NULL)
+		return 0;
+
+	for (i = 0; i < dev->nb_event_ports; i++) {
+		if (dev->dual_ws) {
+			struct cn9k_sso_hws_dual *dws =
+				event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(dws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws_dual) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			dws = RTE_PTR_ADD(ws_cookie,
+					  sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&dws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = dws;
+		} else {
+			struct cn9k_sso_hws *ws = event_dev->data->ports[i];
+			void *ws_cookie;
+
+			ws_cookie = cnxk_sso_hws_get_cookie(ws);
+			ws_cookie = rte_realloc_socket(
+				ws_cookie,
+				sizeof(struct cnxk_sso_hws_cookie) +
+					sizeof(struct cn9k_sso_hws) +
+					(sizeof(uint64_t) *
+					 (dev->max_port_id + 1) *
+					 RTE_MAX_QUEUES_PER_PORT),
+				RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
+			if (ws_cookie == NULL)
+				return -ENOMEM;
+			ws = RTE_PTR_ADD(ws_cookie,
+					 sizeof(struct cnxk_sso_hws_cookie));
+			memcpy(&ws->tx_adptr_data, dev->tx_adptr_data,
+			       sizeof(uint64_t) * (dev->max_port_id + 1) *
+				       RTE_MAX_QUEUES_PER_PORT);
+			event_dev->data->ports[i] = ws;
+		}
+	}
+	rte_mb();
+
+	return 0;
+}
+
 static void
 cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
@@ -734,6 +794,10 @@ cn9k_sso_start(struct rte_eventdev *event_dev)
 {
 	int rc;
 
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+
 	rc = cnxk_sso_start(event_dev, cn9k_sso_hws_reset,
 			    cn9k_sso_hws_flush_events);
 	if (rc < 0)
@@ -844,6 +908,55 @@ cn9k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn9k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
+			     const struct rte_eth_dev *eth_dev, uint32_t *caps)
+{
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn9k", 8);
+	if (ret)
+		*caps = 0;
+	else
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_add(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_add(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	rc = cn9k_sso_updt_tx_adptr_data(event_dev);
+	if (rc < 0)
+		return rc;
+	cn9k_sso_fp_fns_set((struct rte_eventdev *)(uintptr_t)event_dev);
+
+	return 0;
+}
+
+static int
+cn9k_sso_tx_adapter_queue_del(uint8_t id, const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	int rc;
+
+	RTE_SET_USED(id);
+	rc = cnxk_sso_tx_adapter_queue_del(event_dev, eth_dev, tx_queue_id);
+	if (rc < 0)
+		return rc;
+	return cn9k_sso_updt_tx_adptr_data(event_dev);
+}
+
 static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.dev_infos_get = cn9k_sso_info_get,
 	.dev_configure = cn9k_sso_dev_configure,
@@ -863,6 +976,10 @@ static struct rte_eventdev_ops cn9k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_tx_adapter_caps_get = cn9k_sso_tx_adapter_caps_get,
+	.eth_tx_adapter_queue_add = cn9k_sso_tx_adapter_queue_add,
+	.eth_tx_adapter_queue_del = cn9k_sso_tx_adapter_queue_del,
+
 	.timer_adapter_caps_get = cnxk_tim_caps_get,
 
 	.dump = cnxk_sso_dump,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 9d5d2d0339..458fdc8d92 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -8,6 +8,7 @@
 #include <rte_devargs.h>
 #include <rte_ethdev.h>
 #include <rte_event_eth_rx_adapter.h>
+#include <rte_event_eth_tx_adapter.h>
 #include <rte_kvargs.h>
 #include <rte_mbuf_pool_ops.h>
 #include <rte_pci.h>
@@ -86,9 +87,12 @@ struct cnxk_sso_evdev {
 	rte_iova_t fc_iova;
 	struct rte_mempool *xaq_pool;
 	uint64_t rx_offloads;
+	uint64_t tx_offloads;
 	uint64_t adptr_xae_cnt;
 	uint16_t rx_adptr_pool_cnt;
 	uint64_t *rx_adptr_pools;
+	uint64_t *tx_adptr_data;
+	uint16_t max_port_id;
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
@@ -115,7 +119,10 @@ struct cn10k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
+	/* Tx Fastpath data */
+	uint64_t tx_base __rte_cache_aligned;
 	uintptr_t lmt_base;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 /* CN9K HWS ops */
@@ -140,7 +147,9 @@ struct cn9k_sso_hws {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base;
+	/* Tx Fastpath data */
+	uint64_t base __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cn9k_sso_hws_state {
@@ -160,7 +169,9 @@ struct cn9k_sso_hws_dual {
 	uint64_t xaq_lmt __rte_cache_aligned;
 	uint64_t *fc_mem;
 	uintptr_t grps_base[CNXK_SSO_MAX_HWGRP];
-	uint64_t base[2];
+	/* Tx Fastpath data */
+	uint64_t base[2] __rte_cache_aligned;
+	uint8_t tx_adptr_data[];
 } __rte_cache_aligned;
 
 struct cnxk_sso_hws_cookie {
@@ -267,5 +278,11 @@ int cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
 			      const struct rte_eth_dev *eth_dev);
 int cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 			     const struct rte_eth_dev *eth_dev);
+int cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
+int cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+				  const struct rte_eth_dev *eth_dev,
+				  int32_t tx_queue_id);
 
 #endif /* __CNXK_EVENTDEV_H__ */
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 24bfd985e7..548d7b81ce 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -5,6 +5,8 @@
 #include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 
+#define CNXK_SSO_SQB_LIMIT (0x180)
+
 void
 cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		      uint32_t event_type)
@@ -241,3 +243,107 @@ cnxk_sso_rx_adapter_stop(const struct rte_eventdev *event_dev,
 
 	return 0;
 }
+
+static int
+cnxk_sso_sqb_aura_limit_edit(struct roc_nix_sq *sq, uint16_t nb_sqb_bufs)
+{
+	uint16_t sqb_limit;
+
+	sqb_limit = RTE_MIN(nb_sqb_bufs, sq->nb_sqb_bufs);
+	return roc_npa_aura_limit_modify(sq->aura_handle, sqb_limit);
+}
+
+static int
+cnxk_sso_updt_tx_queue_data(const struct rte_eventdev *event_dev,
+			    uint16_t eth_port_id, uint16_t tx_queue_id,
+			    void *txq)
+{
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	uint16_t max_port_id = dev->max_port_id;
+	uint64_t *txq_data = dev->tx_adptr_data;
+
+	if (txq_data == NULL || eth_port_id > max_port_id) {
+		max_port_id = RTE_MAX(max_port_id, eth_port_id);
+		txq_data = rte_realloc_socket(
+			txq_data,
+			(sizeof(uint64_t) * (max_port_id + 1) *
+			 RTE_MAX_QUEUES_PER_PORT),
+			RTE_CACHE_LINE_SIZE, event_dev->data->socket_id);
+		if (txq_data == NULL)
+			return -ENOMEM;
+	}
+
+	((uint64_t(*)[RTE_MAX_QUEUES_PER_PORT])
+		 txq_data)[eth_port_id][tx_queue_id] = (uint64_t)txq;
+	dev->max_port_id = max_port_id;
+	dev->tx_adptr_data = txq_data;
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_add(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
+	struct roc_nix_sq *sq;
+	int i, ret;
+	void *txq;
+
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			txq = eth_dev->data->tx_queues[i];
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i, txq);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		txq = eth_dev->data->tx_queues[tx_queue_id];
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, CNXK_SSO_SQB_LIMIT);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, txq);
+		if (ret < 0)
+			return ret;
+	}
+
+	dev->tx_offloads |= cnxk_eth_dev->tx_offload_flags;
+
+	return 0;
+}
+
+int
+cnxk_sso_tx_adapter_queue_del(const struct rte_eventdev *event_dev,
+			      const struct rte_eth_dev *eth_dev,
+			      int32_t tx_queue_id)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
+	struct roc_nix_sq *sq;
+	int i, ret;
+
+	RTE_SET_USED(event_dev);
+	if (tx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
+			sq = &cnxk_eth_dev->sqs[i];
+			cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+			ret = cnxk_sso_updt_tx_queue_data(
+				event_dev, eth_dev->data->port_id, i,
+				NULL);
+			if (ret < 0)
+				return ret;
+		}
+	} else {
+		sq = &cnxk_eth_dev->sqs[tx_queue_id];
+		cnxk_sso_sqb_aura_limit_edit(sq, sq->nb_sqb_bufs);
+		ret = cnxk_sso_updt_tx_queue_data(
+			event_dev, eth_dev->data->port_id, tx_queue_id, NULL);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
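
For context, the adapter's Tx state is one flattened allocation: a 64-bit
txq pointer per (port, queue) pair, grown with realloc whenever a higher
port id shows up and then copied past the end of each workslot struct so
the fastpath never touches shared device state. A minimal sketch of the
indexing, with illustrative names only (MAX_QUEUES stands in for
RTE_MAX_QUEUES_PER_PORT; this is not driver code):

#include <stdint.h>
#include <stdlib.h>

#define MAX_QUEUES 1024 /* stand-in for RTE_MAX_QUEUES_PER_PORT */

/* Grow the table to cover ports 0..max_port and record one txq. */
static uint64_t *
txq_table_set(uint64_t *tbl, uint16_t max_port, uint16_t port,
	      uint16_t queue, void *txq)
{
	tbl = realloc(tbl, sizeof(uint64_t) * (max_port + 1) * MAX_QUEUES);
	if (tbl == NULL)
		return NULL;
	/* Two-dimensional view over the flat allocation. */
	((uint64_t(*)[MAX_QUEUES])tbl)[port][queue] = (uint64_t)(uintptr_t)txq;
	return tbl;
}

The fastpath later recovers the pointer with txq_data[m->port][queue],
which is exactly the cast visible in cnxk_sso_updt_tx_queue_data() above.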
-- 
2.17.1




* [dpdk-dev] [PATCH v3 10/13] event/cnxk: add Tx adapter fastpath ops
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (7 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 09/13] event/cnxk: add Tx adapter support pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 11/13] event/cnxk: add Rx adapter vector support pbhagavatula
                       ` (5 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton; +Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add support for event eth Tx adapter fastpath operations.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_eventdev.c           | 38 ++++++++
 drivers/event/cnxk/cn10k_worker.h             | 67 ++++++++++++++
 drivers/event/cnxk/cn10k_worker_tx_enq.c      | 23 +++++
 drivers/event/cnxk/cn10k_worker_tx_enq_seg.c  | 23 +++++
 drivers/event/cnxk/cn9k_eventdev.c            | 81 +++++++++++++++++
 drivers/event/cnxk/cn9k_worker.h              | 87 +++++++++++++++++++
 drivers/event/cnxk/cn9k_worker_dual_tx_enq.c  | 23 +++++
 .../event/cnxk/cn9k_worker_dual_tx_enq_seg.c  | 23 +++++
 drivers/event/cnxk/cn9k_worker_tx_enq.c       | 23 +++++
 drivers/event/cnxk/cn9k_worker_tx_enq_seg.c   | 23 +++++
 drivers/event/cnxk/meson.build                |  6 ++
 11 files changed, 417 insertions(+)
 create mode 100644 drivers/event/cnxk/cn10k_worker_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_tx_enq.c
 create mode 100644 drivers/event/cnxk/cn9k_worker_tx_enq_seg.c

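Before the diffs, a quick sketch of how an application drives these ops
through the public Tx adapter API; the ids and queue number below are made
up and error handling is elided:

#include <rte_event_eth_tx_adapter.h>
#include <rte_pause.h>

/* Worker-side transmit via the internal-port Tx adapter. */
static inline void
worker_tx(uint8_t evdev, uint8_t ev_port, struct rte_event *ev)
{
	/* Pick the ethdev Tx queue the PMD should use for this mbuf. */
	rte_event_eth_tx_adapter_txq_set(ev->mbuf, 0);
	while (rte_event_eth_tx_adapter_enqueue(evdev, ev_port, ev, 1, 0) != 1)
		rte_pause();
}

With the INTERNAL_PORT capability advertised by this series, no service
core is involved: the enqueue above lands directly in the PMD routines
added below.
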
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 8a9b04a3db..e462f770c5 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -328,6 +328,23 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 		};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn10k_sso_hws_enq;
 	event_dev->enqueue_burst = cn10k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn10k_sso_hws_enq_new_burst;
@@ -407,6 +424,27 @@ cn10k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 				[!!(dev->rx_offloads & NIX_RX_OFFLOAD_RSS_F)];
 		}
 	}
+
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSTAMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 }
 
 static void
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index b724083caa..3c90c85009 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn10k_ethdev.h"
 #include "cn10k_rx.h"
+#include "cn10k_tx.h"
 
 /* SSO Operations */
 
@@ -251,4 +252,70 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn10k_eth_txq *
+cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn10k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline uint16_t
+cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
+		       uint64_t *cmd,
+		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		       const uint32_t flags)
+{
+	const struct cn10k_eth_txq *txq;
+	struct rte_mbuf *m = ev->mbuf;
+	uint16_t ref_cnt = m->refcnt;
+	uintptr_t lmt_addr;
+	uint16_t lmt_id;
+	uintptr_t pa;
+
+	lmt_addr = ws->lmt_base;
+	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
+	cn10k_nix_tx_skeleton(txq, cmd, flags);
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F)
+		cn10k_nix_xmit_prepare_tso(m, flags);
+
+	cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags, txq->lso_tun_fmt);
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw =
+			cn10k_nix_prepare_mseg(m, (uint64_t *)lmt_addr, flags);
+		pa = txq->io_addr | ((segdw - 1) << 4);
+	} else {
+		pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
+	}
+	if (!ev->sched_type)
+		cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
+
+	roc_lmt_submit_steorl(lmt_id, pa);
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
+				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_##name(             \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn10k_sso_hws_dual_tx_adptr_enq_seg_##name(         \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
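
A note on the ordering logic in cn10k_sso_hws_event_tx() above, as I read
it: RTE_SCHED_TYPE_ORDERED is 0, so the !ev->sched_type branch makes only
ordered events wait until they reach the head of their ordered context
before the LMTST is kicked, preserving ingress order on the wire; the
swtag flush afterwards releases the scheduling context, and is skipped
when fast free is off and the mbuf still has other references. A
compressed pseudo-flow, with hypothetical helper names:

/* Pseudo-flow only; the helpers below are hypothetical. */
prepare_descriptor(txq, m, cmd);
if (ev->sched_type == RTE_SCHED_TYPE_ORDERED)
	wait_until_head_of_order(ws);	/* keep wire order */
submit_lmtst(lmt_id, pa);		/* kick the NIC */
if (!((flags & NOFF_F) && refcnt > 1))
	release_tag(ws);		/* the swtag flush */
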
diff --git a/drivers/event/cnxk/cn10k_worker_tx_enq.c b/drivers/event/cnxk/cn10k_worker_tx_enq.c
new file mode 100644
index 0000000000..f9968ac0d0
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_##name(                  \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn10k_sso_hws *ws = port;                               \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c b/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
new file mode 100644
index 0000000000..a24fc42e5a
--- /dev/null
+++ b/drivers/event/cnxk/cn10k_worker_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn10k_sso_hws_tx_adptr_enq_seg_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn10k_sso_hws *ws = port;                               \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn10k_sso_hws_event_tx(                                 \
+			ws, &ev[0], cmd,                                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_eventdev.c b/drivers/event/cnxk/cn9k_eventdev.c
index bdc5632235..af97020f2f 100644
--- a/drivers/event/cnxk/cn9k_eventdev.c
+++ b/drivers/event/cnxk/cn9k_eventdev.c
@@ -430,6 +430,39 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 #undef R
 		};
 
+	/* Tx modes */
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
+	const event_tx_adapter_enqueue
+		sso_hws_dual_tx_adptr_enq_seg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn9k_sso_hws_dual_tx_adptr_enq_seg_##name,
+			NIX_TX_FASTPATH_MODES
+#undef T
+		};
+
 	event_dev->enqueue = cn9k_sso_hws_enq;
 	event_dev->enqueue_burst = cn9k_sso_hws_enq_burst;
 	event_dev->enqueue_new_burst = cn9k_sso_hws_enq_new_burst;
@@ -510,6 +543,25 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 		}
 	}
 
+	if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+		/* [TSTAMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM] */
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq_seg
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	} else {
+		event_dev->txa_enqueue = sso_hws_tx_adptr_enq
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+			[!!(dev->tx_offloads & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+	}
+
 	if (dev->dual_ws) {
 		event_dev->enqueue = cn9k_sso_hws_dual_enq;
 		event_dev->enqueue_burst = cn9k_sso_hws_dual_enq_burst;
@@ -618,8 +670,37 @@ cn9k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 						  NIX_RX_OFFLOAD_RSS_F)];
 			}
 		}
+
+		if (dev->tx_offloads & NIX_TX_MULTI_SEG_F) {
+			/* [TSTAMP] [TSO] [MBUF_NOFF] [VLAN] [OL3_OL4_CSUM] [L3_L4_CSUM]
+			 */
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq_seg
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		} else {
+			event_dev->txa_enqueue = sso_hws_dual_tx_adptr_enq
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSTAMP_F)]
+				[!!(dev->tx_offloads & NIX_TX_OFFLOAD_TSO_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_MBUF_NOFF_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_VLAN_QINQ_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
+				[!!(dev->tx_offloads &
+				    NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
+		}
 	}
 
+	event_dev->txa_enqueue_same_dest = event_dev->txa_enqueue;
 	rte_mb();
 }
 
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index c01c00e1da..5aa053c586 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -11,6 +11,7 @@
 
 #include "cn9k_ethdev.h"
 #include "cn9k_rx.h"
+#include "cn9k_tx.h"
 
 /* SSO Operations */
 
@@ -416,4 +417,90 @@ NIX_RX_FASTPATH_MODES
 NIX_RX_FASTPATH_MODES
 #undef R
 
+static __rte_always_inline const struct cn9k_eth_txq *
+cn9k_sso_hws_xtract_meta(struct rte_mbuf *m,
+			 const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
+{
+	return (const struct cn9k_eth_txq *)
+		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
+}
+
+static __rte_always_inline void
+cn9k_sso_hws_prepare_pkt(const struct cn9k_eth_txq *txq, struct rte_mbuf *m,
+			 uint64_t *cmd, const uint32_t flags)
+{
+	roc_lmt_mov(cmd, txq->cmd, cn9k_nix_tx_ext_subs(flags));
+	cn9k_nix_xmit_prepare(m, cmd, flags, txq->lso_tun_fmt);
+}
+
+static __rte_always_inline uint16_t
+cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
+		      const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+		      const uint32_t flags)
+{
+	struct rte_mbuf *m = ev->mbuf;
+	const struct cn9k_eth_txq *txq;
+	uint16_t ref_cnt = m->refcnt;
+
+	/* Perform header writes before barrier for TSO */
+	cn9k_nix_xmit_prepare_tso(m, flags);
+	/* Commit any changes to the packet here, as when fast free is set
+	 * no further changes will be made to the mbuf. When fast free is
+	 * not set, both cn9k_nix_prepare_mseg() and cn9k_nix_xmit_prepare()
+	 * have a barrier after the refcnt update.
+	 */
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+		rte_io_wmb();
+	txq = cn9k_sso_hws_xtract_meta(m, txq_data);
+	cn9k_sso_hws_prepare_pkt(txq, m, cmd, flags);
+
+	if (flags & NIX_TX_MULTI_SEG_F) {
+		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
+		if (!CNXK_TT_FROM_EVENT(ev->event)) {
+			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
+						       txq->io_addr, segdw);
+		} else {
+			cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr,
+					       segdw);
+		}
+	} else {
+		if (!CNXK_TT_FROM_EVENT(ev->event)) {
+			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
+			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
+						  txq->io_addr, flags);
+		} else {
+			cn9k_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr,
+					  flags);
+		}
+	}
+
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
+		if (ref_cnt > 1)
+			return 1;
+	}
+
+	cnxk_sso_hws_swtag_flush(base + SSOW_LF_GWS_TAG,
+				 base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
+
+	return 1;
+}
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events);        \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events);
+
+NIX_TX_FASTPATH_MODES
+#undef T
+
 #endif
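
One cn9k-specific detail worth calling out: cn9k_nix_xmit_submit_lmt() can
legitimately return 0 when the LMTST is aborted (the exclusive monitor
backing the LDEOR submit can be cleared between prepare and submit), and
the code above then resubmits through the plain path, as I read the diff.
In sketch form, with the real helpers abbreviated:

prep_lmt(cmd, txq->lmt_addr);
head_wait();				/* ordered events only */
if (submit_lmt(txq->io_addr) == 0)	/* LMTST aborted, retry */
	xmit_one(cmd, txq->lmt_addr, txq->io_addr);
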
diff --git a/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c b/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
new file mode 100644
index 0000000000..92e2981f02
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_##name(              \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c b/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
new file mode 100644
index 0000000000..dfb574cf95
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_dual_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_dual_tx_adptr_enq_seg_##name(          \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws_dual *ws = port;                           \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base[!ws->vws], &ev[0], cmd,                       \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_tx_enq.c b/drivers/event/cnxk/cn9k_worker_tx_enq.c
new file mode 100644
index 0000000000..3df649c0c8
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_tx_enq.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_##name(                   \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		struct cn9k_sso_hws *ws = port;                                \
+		uint64_t cmd[sz];                                              \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			flags);                                                \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c b/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c
new file mode 100644
index 0000000000..0efe29113e
--- /dev/null
+++ b/drivers/event/cnxk/cn9k_worker_tx_enq_seg.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_worker.h"
+
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	uint16_t __rte_hot cn9k_sso_hws_tx_adptr_enq_seg_##name(               \
+		void *port, struct rte_event ev[], uint16_t nb_events)         \
+	{                                                                      \
+		uint64_t cmd[(sz) + CNXK_NIX_TX_MSEG_SG_DWORDS - 2];           \
+		struct cn9k_sso_hws *ws = port;                                \
+									       \
+		RTE_SET_USED(nb_events);                                       \
+		return cn9k_sso_hws_event_tx(                                  \
+			ws->base, &ev[0], cmd,                                 \
+			(const uint64_t(*)[RTE_MAX_QUEUES_PER_PORT]) &         \
+				ws->tx_adptr_data,                             \
+			(flags) | NIX_TX_MULTI_SEG_F);                         \
+	}
+
+NIX_TX_FASTPATH_MODES
+#undef T
diff --git a/drivers/event/cnxk/meson.build b/drivers/event/cnxk/meson.build
index c5c1c0ee8e..13e0634e86 100644
--- a/drivers/event/cnxk/meson.build
+++ b/drivers/event/cnxk/meson.build
@@ -17,11 +17,17 @@ sources = files(
         'cn9k_worker_dual_deq.c',
         'cn9k_worker_dual_deq_burst.c',
         'cn9k_worker_dual_deq_tmo.c',
+        'cn9k_worker_tx_enq.c',
+        'cn9k_worker_tx_enq_seg.c',
+        'cn9k_worker_dual_tx_enq.c',
+        'cn9k_worker_dual_tx_enq_seg.c',
         'cn10k_eventdev.c',
         'cn10k_worker.c',
         'cn10k_worker_deq.c',
         'cn10k_worker_deq_burst.c',
         'cn10k_worker_deq_tmo.c',
+        'cn10k_worker_tx_enq.c',
+        'cn10k_worker_tx_enq_seg.c',
         'cnxk_eventdev.c',
         'cnxk_eventdev_adptr.c',
         'cnxk_eventdev_selftest.c',
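
The T()/NIX_TX_FASTPATH_MODES pattern used throughout this patch is the
usual X-macro template trick: one specialized function is stamped out per
offload-flag combination, and cn{9,10}k_sso_fp_fns_set() indexes a table
by the individual flag bits to pick the right one at configure time. A
self-contained toy version, two flags instead of six, illustrative names:

#include <stdint.h>

#define MODES M(none, 0, 0) M(csum, 0, 1) M(vlan, 1, 0) M(vlan_csum, 1, 1)

/* Stamp out one specialized function per flag combination. */
#define M(name, f1, f0)                                                 \
	static uint16_t enq_##name(void *p) { (void)p; return (f1 << 1) | f0; }
MODES
#undef M

/* Flag-indexed dispatch table, filled by re-expanding the same list. */
static uint16_t (*const enq_tbl[2][2])(void *) = {
#define M(name, f1, f0) [f1][f0] = enq_##name,
	MODES
#undef M
};

Because the flags are compile-time constants inside each instance, the
compiler strips every untaken branch from the shared inline body, which
is the whole point of the exercise.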
-- 
2.17.1



* [dpdk-dev] [PATCH v3 11/13] event/cnxk: add Rx adapter vector support
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (8 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 10/13] event/cnxk: add Tx adapter fastpath ops pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 12/13] event/cnxk: add Rx event vector fastpath pbhagavatula
                       ` (4 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add event vector support to the cnxk event Rx adapter: control-path
APIs to query vector limits and to configure event vectorization on a
given Rx queue.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/eventdevs/cnxk.rst            |   2 +
 drivers/event/cnxk/cn10k_eventdev.c      | 106 ++++++++++++++++++++++-
 drivers/event/cnxk/cnxk_eventdev.h       |   2 +
 drivers/event/cnxk/cnxk_eventdev_adptr.c |  25 ++++++
 drivers/net/cnxk/cnxk_ethdev.h           |   2 +-
 5 files changed, 135 insertions(+), 2 deletions(-)

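A sketch of the matching application control path against the 21.08
experimental API; the ids, pool sizing and the whole-port -1 queue id are
made up, and the queue is assumed to have been added with the
RTE_EVENT_ETH_RX_ADAPTER_QUEUE_EVENT_VECTOR flag:

#include <errno.h>
#include <rte_event_eth_rx_adapter.h>
#include <rte_eventdev.h>

static int
enable_rx_vectors(uint8_t adptr_id, uint8_t evdev, uint16_t ethdev)
{
	struct rte_event_eth_rx_adapter_event_vector_config cfg = {0};
	struct rte_event_eth_rx_adapter_vector_limits lim;
	int rc;

	rc = rte_event_eth_rx_adapter_vector_limits_get(evdev, ethdev, &lim);
	if (rc)
		return rc;
	cfg.vector_sz = lim.max_sz;
	cfg.vector_timeout_ns = lim.min_timeout_ns;
	cfg.vector_mp = rte_event_vector_pool_create("vec_pool", 16384, 0,
						     cfg.vector_sz, 0);
	if (cfg.vector_mp == NULL)
		return -ENOMEM;
	/* -1 applies the config to every Rx queue of the port. */
	return rte_event_eth_rx_adapter_queue_event_vector_config(
		adptr_id, ethdev, -1, &cfg);
}
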
diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 6fdccc2ab4..0297cd3d5f 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -45,6 +45,8 @@ Features of the OCTEON cnxk SSO PMD are:
 - Lockfree Tx from event eth Tx adapter using ``DEV_TX_OFFLOAD_MT_LOCKFREE``
   capability while maintaining receive packet order.
 - Full Rx/Tx offload support defined through ethdev queue configuration.
+- HW managed event vectorization on CN10K for packets enqueued from ethdev to
+  eventdev, configurable per Rx queue in the Rx adapter.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index e462f770c5..e85fa4785d 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -610,7 +610,8 @@ cn10k_sso_rx_adapter_caps_get(const struct rte_eventdev *event_dev,
 	else
 		*caps = RTE_EVENT_ETH_RX_ADAPTER_CAP_INTERNAL_PORT |
 			RTE_EVENT_ETH_RX_ADAPTER_CAP_MULTI_EVENTQ |
-			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID;
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_OVERRIDE_FLOW_ID |
+			RTE_EVENT_ETH_RX_ADAPTER_CAP_EVENT_VECTOR;
 
 	return 0;
 }
@@ -671,6 +672,105 @@ cn10k_sso_rx_adapter_queue_del(const struct rte_eventdev *event_dev,
 	return cnxk_sso_rx_adapter_queue_del(event_dev, eth_dev, rx_queue_id);
 }
 
+static int
+cn10k_sso_rx_adapter_vector_limits(
+	const struct rte_eventdev *dev, const struct rte_eth_dev *eth_dev,
+	struct rte_event_eth_rx_adapter_vector_limits *limits)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev;
+	int ret;
+
+	RTE_SET_USED(dev);
+	ret = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (ret)
+		return -ENOTSUP;
+
+	cnxk_eth_dev = cnxk_eth_pmd_priv(eth_dev);
+	limits->log2_sz = true;
+	limits->min_sz = 1 << ROC_NIX_VWQE_MIN_SIZE_LOG2;
+	limits->max_sz = 1 << ROC_NIX_VWQE_MAX_SIZE_LOG2;
+	limits->min_timeout_ns =
+		(roc_nix_get_vwqe_interval(&cnxk_eth_dev->nix) + 1) * 100;
+	limits->max_timeout_ns = BITMASK_ULL(8, 0) * limits->min_timeout_ns;
+
+	return 0;
+}
+
+static int
+cnxk_sso_rx_adapter_vwqe_enable(struct cnxk_eth_dev *cnxk_eth_dev,
+				uint16_t port_id, uint16_t rq_id, uint16_t sz,
+				uint64_t tmo_ns, struct rte_mempool *vmp)
+{
+	struct roc_nix_rq *rq;
+
+	rq = &cnxk_eth_dev->rqs[rq_id];
+
+	if (!rq->sso_ena)
+		return -EINVAL;
+	if (rq->flow_tag_width == 0)
+		return -EINVAL;
+
+	rq->vwqe_ena = 1;
+	rq->vwqe_first_skip = 0;
+	rq->vwqe_aura_handle = roc_npa_aura_handle_to_aura(vmp->pool_id);
+	rq->vwqe_max_sz_exp = rte_log2_u32(sz);
+	rq->vwqe_wait_tmo =
+		tmo_ns /
+		((roc_nix_get_vwqe_interval(&cnxk_eth_dev->nix) + 1) * 100);
+	rq->tag_mask = (port_id & 0xF) << 20;
+	rq->tag_mask |=
+		(((port_id >> 4) & 0xF) | (RTE_EVENT_TYPE_ETHDEV_VECTOR << 4))
+		<< 24;
+
+	return roc_nix_rq_modify(&cnxk_eth_dev->nix, rq, 0);
+}
+
+static int
+cn10k_sso_rx_adapter_vector_config(
+	const struct rte_eventdev *event_dev, const struct rte_eth_dev *eth_dev,
+	int32_t rx_queue_id,
+	const struct rte_event_eth_rx_adapter_event_vector_config *config)
+{
+	struct cnxk_eth_dev *cnxk_eth_dev;
+	struct cnxk_sso_evdev *dev;
+	int i, rc;
+
+	rc = strncmp(eth_dev->device->driver->name, "net_cn10k", 9);
+	if (rc)
+		return -EINVAL;
+
+	dev = cnxk_sso_pmd_priv(event_dev);
+	cnxk_eth_dev = cnxk_eth_pmd_priv(eth_dev);
+	if (rx_queue_id < 0) {
+		for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
+			cnxk_sso_updt_xae_cnt(dev, config->vector_mp,
+					      RTE_EVENT_TYPE_ETHDEV_VECTOR);
+			rc = cnxk_sso_xae_reconfigure(
+				(struct rte_eventdev *)(uintptr_t)event_dev);
+			rc |= cnxk_sso_rx_adapter_vwqe_enable(
+				cnxk_eth_dev, eth_dev->data->port_id, i,
+				config->vector_sz, config->vector_timeout_ns,
+				config->vector_mp);
+			if (rc)
+				return -EINVAL;
+		}
+	} else {
+
+		cnxk_sso_updt_xae_cnt(dev, config->vector_mp,
+				      RTE_EVENT_TYPE_ETHDEV_VECTOR);
+		rc = cnxk_sso_xae_reconfigure(
+			(struct rte_eventdev *)(uintptr_t)event_dev);
+		rc |= cnxk_sso_rx_adapter_vwqe_enable(
+			cnxk_eth_dev, eth_dev->data->port_id, rx_queue_id,
+			config->vector_sz, config->vector_timeout_ns,
+			config->vector_mp);
+		if (rc)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int
 cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
 			      const struct rte_eth_dev *eth_dev, uint32_t *caps)
@@ -739,6 +839,10 @@ static struct rte_eventdev_ops cn10k_sso_dev_ops = {
 	.eth_rx_adapter_start = cnxk_sso_rx_adapter_start,
 	.eth_rx_adapter_stop = cnxk_sso_rx_adapter_stop,
 
+	.eth_rx_adapter_vector_limits_get = cn10k_sso_rx_adapter_vector_limits,
+	.eth_rx_adapter_event_vector_config =
+		cn10k_sso_rx_adapter_vector_config,
+
 	.eth_tx_adapter_caps_get = cn10k_sso_tx_adapter_caps_get,
 	.eth_tx_adapter_queue_add = cn10k_sso_tx_adapter_queue_add,
 	.eth_tx_adapter_queue_del = cn10k_sso_tx_adapter_queue_del,
diff --git a/drivers/event/cnxk/cnxk_eventdev.h b/drivers/event/cnxk/cnxk_eventdev.h
index 458fdc8d92..3783e0c95b 100644
--- a/drivers/event/cnxk/cnxk_eventdev.h
+++ b/drivers/event/cnxk/cnxk_eventdev.h
@@ -96,6 +96,8 @@ struct cnxk_sso_evdev {
 	uint16_t tim_adptr_ring_cnt;
 	uint16_t *timer_adptr_rings;
 	uint64_t *timer_adptr_sz;
+	uint16_t vec_pool_cnt;
+	uint64_t *vec_pools;
 	/* Dev args */
 	uint32_t xae_cnt;
 	uint8_t qos_queue_cnt;
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 548d7b81ce..c4c4f5a7f4 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -40,6 +40,31 @@ cnxk_sso_updt_xae_cnt(struct cnxk_sso_evdev *dev, void *data,
 		dev->adptr_xae_cnt += rxq->qconf.mp->size;
 		break;
 	}
+	case RTE_EVENT_TYPE_ETHDEV_VECTOR: {
+		struct rte_mempool *mp = data;
+		uint64_t *old_ptr;
+
+		for (i = 0; i < dev->vec_pool_cnt; i++) {
+			if ((uint64_t)mp == dev->vec_pools[i])
+				return;
+		}
+
+		dev->vec_pool_cnt++;
+		old_ptr = dev->vec_pools;
+		dev->vec_pools =
+			rte_realloc(dev->vec_pools,
+				    sizeof(uint64_t) * dev->vec_pool_cnt, 0);
+		if (dev->vec_pools == NULL) {
+			dev->adptr_xae_cnt += mp->size;
+			dev->vec_pools = old_ptr;
+			dev->vec_pool_cnt--;
+			return;
+		}
+		dev->vec_pools[dev->vec_pool_cnt - 1] = (uint64_t)mp;
+
+		dev->adptr_xae_cnt += mp->size;
+		break;
+	}
 	case RTE_EVENT_TYPE_TIMER: {
 		struct cnxk_tim_ring *timr = data;
 		uint16_t *old_ring_ptr;
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 4eead03905..2528b3cdaa 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -238,7 +238,7 @@ struct cnxk_eth_txq_sp {
 } __plt_cache_aligned;
 
 static inline struct cnxk_eth_dev *
-cnxk_eth_pmd_priv(struct rte_eth_dev *eth_dev)
+cnxk_eth_pmd_priv(const struct rte_eth_dev *eth_dev)
 {
 	return eth_dev->data->dev_private;
 }
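
For a feel of the vwqe_wait_tmo quantization in
cnxk_sso_rx_adapter_vwqe_enable(): the HW counts the timeout in steps of
(roc_nix_get_vwqe_interval() + 1) * 100 ns, so the driver divides the
requested nanoseconds by that step. A back-of-envelope helper; the
interval value 9 used in the comments is an assumed example:

static uint64_t
vwqe_tmo_regval(uint64_t tmo_ns, uint64_t hw_interval)
{
	uint64_t step_ns = (hw_interval + 1) * 100; /* 1 us when interval is 9 */

	return tmo_ns / step_ns; /* e.g. 64000 ns -> 64 steps */
}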
-- 
2.17.1



* [dpdk-dev] [PATCH v3 12/13] event/cnxk: add Rx event vector fastpath
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (9 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 11/13] event/cnxk: add Rx adapter vector support pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 13/13] event/cnxk: add Tx " pbhagavatula
                       ` (3 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add Rx event vector fastpath to convert HW-defined metadata into
rte_mbuf and rte_event_vector.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/rel_notes/release_21_08.rst |   1 +
 drivers/event/cnxk/cn10k_worker.h      |  56 +++++++
 drivers/net/cnxk/cn10k_rx.h            | 200 +++++++++++++++----------
 drivers/net/cnxk/cn10k_rx_vec.c        |   2 +-
 drivers/net/cnxk/cn10k_rx_vec_mseg.c   |   5 +-
 5 files changed, 179 insertions(+), 85 deletions(-)

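On the consuming side, a worker now sees a single event whose payload is a
vector of mbufs rather than one mbuf. A hedged sketch of handling it; the
evdev/port ids are made up, and the free-back-to-pool step assumes the
vector came from a mempool as created by rte_event_vector_pool_create():

#include <rte_eventdev.h>
#include <rte_mbuf.h>
#include <rte_mempool.h>

static void
worker_handle_vector(uint8_t evdev, uint8_t port)
{
	struct rte_event ev;
	uint16_t i;

	if (!rte_event_dequeue_burst(evdev, port, &ev, 1, 0))
		return;
	if (ev.event_type == RTE_EVENT_TYPE_ETHDEV_VECTOR) {
		struct rte_event_vector *vec = ev.vec;

		for (i = 0; i < vec->nb_elem; i++)
			rte_pktmbuf_free(vec->mbufs[i]); /* app work here */
		rte_mempool_put(rte_mempool_from_obj(vec), vec);
	}
}
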
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 80ff93269c..11ccc9bcb5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -64,6 +64,7 @@ New Features
 
   * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
     is net/cnxk.
+  * Added event vectorization support for the Rx adapter.
 
 
 Removed Items
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 3c90c85009..7a48a6b17d 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -5,6 +5,8 @@
 #ifndef __CN10K_WORKER_H__
 #define __CN10K_WORKER_H__
 
+#include <rte_vect.h>
+
 #include "cnxk_ethdev.h"
 #include "cnxk_eventdev.h"
 #include "cnxk_worker.h"
@@ -101,6 +103,49 @@ cn10k_wqe_to_mbuf(uint64_t wqe, const uint64_t mbuf, uint8_t port_id,
 			      mbuf_init | ((uint64_t)port_id) << 48, flags);
 }
 
+static __rte_always_inline void
+cn10k_process_vwqe(uintptr_t vwqe, uint16_t port_id, const uint32_t flags,
+		   void *lookup_mem, void *tstamp)
+{
+	uint64_t mbuf_init = 0x100010000ULL | RTE_PKTMBUF_HEADROOM |
+			     (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 8 : 0);
+	struct rte_event_vector *vec;
+	uint16_t nb_mbufs, non_vec;
+	uint64_t **wqe;
+
+	mbuf_init |= ((uint64_t)port_id) << 48;
+	vec = (struct rte_event_vector *)vwqe;
+	wqe = vec->u64s;
+
+	nb_mbufs = RTE_ALIGN_FLOOR(vec->nb_elem, NIX_DESCS_PER_LOOP);
+	nb_mbufs = cn10k_nix_recv_pkts_vector(&mbuf_init, vec->mbufs, nb_mbufs,
+					      flags | NIX_RX_VWQE_F, lookup_mem,
+					      tstamp);
+	wqe += nb_mbufs;
+	non_vec = vec->nb_elem - nb_mbufs;
+
+	while (non_vec) {
+		struct nix_cqe_hdr_s *cqe = (struct nix_cqe_hdr_s *)wqe[0];
+		struct rte_mbuf *mbuf;
+		uint64_t tstamp_ptr;
+
+		mbuf = (struct rte_mbuf *)((char *)cqe -
+					   sizeof(struct rte_mbuf));
+		cn10k_nix_cqe_to_mbuf(cqe, cqe->tag, mbuf, lookup_mem,
+				      mbuf_init, flags);
+		/* Extract tstamp, if PTP is enabled */
+		tstamp_ptr = *(uint64_t *)(((struct nix_wqe_hdr_s *)cqe) +
+					   CNXK_SSO_WQE_SG_PTR);
+		cnxk_nix_mbuf_to_tstamp((struct rte_mbuf *)mbuf, tstamp,
+					flags & NIX_RX_OFFLOAD_TSTAMP_F,
+					flags & NIX_RX_MULTI_SEG_F,
+					(uint64_t *)tstamp_ptr);
+		wqe[0] = (uint64_t *)mbuf;
+		non_vec--;
+		wqe++;
+	}
+}
+
 static __rte_always_inline uint16_t
 cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		       const uint32_t flags, void *lookup_mem)
@@ -152,6 +197,17 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 						flags & NIX_RX_MULTI_SEG_F,
 						(uint64_t *)tstamp_ptr);
 			gw.u64[1] = mbuf;
+		} else if (CNXK_EVENT_TYPE_FROM_TAG(gw.u64[0]) ==
+			   RTE_EVENT_TYPE_ETHDEV_VECTOR) {
+			uint8_t port = CNXK_SUB_EVENT_FROM_TAG(gw.u64[0]);
+			__uint128_t vwqe_hdr = *(__uint128_t *)gw.u64[1];
+
+			vwqe_hdr = ((vwqe_hdr >> 64) & 0xFFF) | BIT_ULL(31) |
+				   ((vwqe_hdr & 0xFFFF) << 48) |
+				   ((uint64_t)port << 32);
+			*(uint64_t *)gw.u64[1] = (uint64_t)vwqe_hdr;
+			cn10k_process_vwqe(gw.u64[1], port, flags, lookup_mem,
+					   ws->tstamp);
 		}
 	}
 
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index d9572b19e7..a506a867ca 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -21,6 +21,7 @@
  * Defining it from backwards to denote its been
  * not used as offload flags to pick function
  */
+#define NIX_RX_VWQE_F	   BIT(14)
 #define NIX_RX_MULTI_SEG_F BIT(15)
 
 #define CNXK_NIX_CQ_ENTRY_SZ 128
@@ -28,6 +29,11 @@
 #define CQE_CAST(x)	     ((struct nix_cqe_hdr_s *)(x))
 #define CQE_SZ(x)	     ((x) * CNXK_NIX_CQ_ENTRY_SZ)
 
+#define CQE_PTR_OFF(b, i, o, f)                                                \
+	(((f) & NIX_RX_VWQE_F) ?                                               \
+		       (uint64_t *)(((uintptr_t)((uint64_t *)(b))[i]) + (o)) : \
+		       (uint64_t *)(((uintptr_t)(b)) + CQE_SZ(i) + (o)))
+
 union mbuf_initializer {
 	struct {
 		uint16_t data_off;
@@ -317,61 +323,87 @@ nix_qinq_update(const uint64_t w2, uint64_t ol_flags, struct rte_mbuf *mbuf)
 }
 
 static __rte_always_inline uint16_t
-cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
-			   uint16_t pkts, const uint16_t flags)
+cn10k_nix_recv_pkts_vector(void *args, struct rte_mbuf **mbufs, uint16_t pkts,
+			   const uint16_t flags, void *lookup_mem,
+			   struct cnxk_timesync_info *tstamp)
 {
-	struct cn10k_eth_rxq *rxq = rx_queue;
-	uint16_t packets = 0;
+	struct cn10k_eth_rxq *rxq = args;
+	const uint64_t mbuf_initializer = (flags & NIX_RX_VWQE_F) ?
+							*(uint64_t *)args :
+							rxq->mbuf_initializer;
+	const uint64x2_t data_off = flags & NIX_RX_VWQE_F ?
+						  vdupq_n_u64(0x80ULL) :
+						  vdupq_n_u64(rxq->data_off);
+	const uint32_t qmask = flags & NIX_RX_VWQE_F ? 0 : rxq->qmask;
+	const uint64_t wdata = flags & NIX_RX_VWQE_F ? 0 : rxq->wdata;
+	const uintptr_t desc = flags & NIX_RX_VWQE_F ? 0 : rxq->desc;
 	uint64x2_t cq0_w8, cq1_w8, cq2_w8, cq3_w8, mbuf01, mbuf23;
-	const uint64_t mbuf_initializer = rxq->mbuf_initializer;
-	const uint64x2_t data_off = vdupq_n_u64(rxq->data_off);
 	uint64_t ol_flags0, ol_flags1, ol_flags2, ol_flags3;
 	uint64x2_t rearm0 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm1 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm2 = vdupq_n_u64(mbuf_initializer);
 	uint64x2_t rearm3 = vdupq_n_u64(mbuf_initializer);
 	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3;
-	const uint16_t *lookup_mem = rxq->lookup_mem;
-	const uint32_t qmask = rxq->qmask;
-	const uint64_t wdata = rxq->wdata;
-	const uintptr_t desc = rxq->desc;
 	uint8x16_t f0, f1, f2, f3;
-	uint32_t head = rxq->head;
+	uint16_t packets = 0;
 	uint16_t pkts_left;
-
-	pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
-	pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
-
-	/* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
-	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	uint32_t head;
+	uintptr_t cq0;
+
+	if (!(flags & NIX_RX_VWQE_F)) {
+		lookup_mem = rxq->lookup_mem;
+		head = rxq->head;
+
+		pkts = nix_rx_nb_pkts(rxq, wdata, pkts, qmask);
+		pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
+		/* Packets has to be floor-aligned to NIX_DESCS_PER_LOOP */
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F)
+			tstamp = rxq->tstamp;
+	} else {
+		RTE_SET_USED(head);
+	}
 
 	while (packets < pkts) {
-		/* Exit loop if head is about to wrap and become unaligned */
-		if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
-		    NIX_DESCS_PER_LOOP) {
-			pkts_left += (pkts - packets);
-			break;
-		}
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Exit loop if head is about to wrap and become
+			 * unaligned.
+			 */
+			if (((head + NIX_DESCS_PER_LOOP - 1) & qmask) <
+			    NIX_DESCS_PER_LOOP) {
+				pkts_left += (pkts - packets);
+				break;
+			}
 
-		const uintptr_t cq0 = desc + CQE_SZ(head);
+			cq0 = desc + CQE_SZ(head);
+		} else {
+			cq0 = (uintptr_t)&mbufs[packets];
+		}
 
 		/* Prefetch N desc ahead */
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(8)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(9)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(10)));
-		rte_prefetch_non_temporal((void *)(cq0 + CQE_SZ(11)));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 8, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 9, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 10, 0, flags));
+		rte_prefetch_non_temporal(CQE_PTR_OFF(cq0, 11, 0, flags));
 
 		/* Get NIX_RX_SG_S for size and buffer pointer */
-		cq0_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(0) + 64));
-		cq1_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(1) + 64));
-		cq2_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(2) + 64));
-		cq3_w8 = vld1q_u64((uint64_t *)(cq0 + CQE_SZ(3) + 64));
-
-		/* Extract mbuf from NIX_RX_SG_S */
-		mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
-		mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
-		mbuf01 = vqsubq_u64(mbuf01, data_off);
-		mbuf23 = vqsubq_u64(mbuf23, data_off);
+		cq0_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 0, 64, flags));
+		cq1_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 1, 64, flags));
+		cq2_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 2, 64, flags));
+		cq3_w8 = vld1q_u64(CQE_PTR_OFF(cq0, 3, 64, flags));
+
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Extract mbuf from NIX_RX_SG_S */
+			mbuf01 = vzip2q_u64(cq0_w8, cq1_w8);
+			mbuf23 = vzip2q_u64(cq2_w8, cq3_w8);
+			mbuf01 = vqsubq_u64(mbuf01, data_off);
+			mbuf23 = vqsubq_u64(mbuf23, data_off);
+		} else {
+			mbuf01 =
+				vsubq_u64(vld1q_u64((uint64_t *)cq0), data_off);
+			mbuf23 = vsubq_u64(vld1q_u64((uint64_t *)(cq0 + 16)),
+					   data_off);
+		}
 
 		/* Move mbufs to scalar registers for future use */
 		mbuf0 = (struct rte_mbuf *)vgetq_lane_u64(mbuf01, 0);
@@ -395,14 +427,14 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		f3 = vqtbl1q_u8(cq3_w8, shuf_msk);
 
 		/* Load CQE word0 and word 1 */
-		uint64_t cq0_w0 = ((uint64_t *)(cq0 + CQE_SZ(0)))[0];
-		uint64_t cq0_w1 = ((uint64_t *)(cq0 + CQE_SZ(0)))[1];
-		uint64_t cq1_w0 = ((uint64_t *)(cq0 + CQE_SZ(1)))[0];
-		uint64_t cq1_w1 = ((uint64_t *)(cq0 + CQE_SZ(1)))[1];
-		uint64_t cq2_w0 = ((uint64_t *)(cq0 + CQE_SZ(2)))[0];
-		uint64_t cq2_w1 = ((uint64_t *)(cq0 + CQE_SZ(2)))[1];
-		uint64_t cq3_w0 = ((uint64_t *)(cq0 + CQE_SZ(3)))[0];
-		uint64_t cq3_w1 = ((uint64_t *)(cq0 + CQE_SZ(3)))[1];
+		const uint64_t cq0_w0 = *CQE_PTR_OFF(cq0, 0, 0, flags);
+		const uint64_t cq0_w1 = *CQE_PTR_OFF(cq0, 0, 1, flags);
+		const uint64_t cq1_w0 = *CQE_PTR_OFF(cq0, 1, 0, flags);
+		const uint64_t cq1_w1 = *CQE_PTR_OFF(cq0, 1, 1, flags);
+		const uint64_t cq2_w0 = *CQE_PTR_OFF(cq0, 2, 0, flags);
+		const uint64_t cq2_w1 = *CQE_PTR_OFF(cq0, 2, 1, flags);
+		const uint64_t cq3_w0 = *CQE_PTR_OFF(cq0, 3, 0, flags);
+		const uint64_t cq3_w1 = *CQE_PTR_OFF(cq0, 3, 1, flags);
 
 		if (flags & NIX_RX_OFFLOAD_RSS_F) {
 			/* Fill rss in the rx_descriptor_fields1 */
@@ -459,17 +491,17 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 		if (flags & NIX_RX_OFFLOAD_MARK_UPDATE_F) {
 			ol_flags0 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(0) + 38), ol_flags0,
-				mbuf0);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 0, 38, flags),
+				ol_flags0, mbuf0);
 			ol_flags1 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(1) + 38), ol_flags1,
-				mbuf1);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 1, 38, flags),
+				ol_flags1, mbuf1);
 			ol_flags2 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(2) + 38), ol_flags2,
-				mbuf2);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 2, 38, flags),
+				ol_flags2, mbuf2);
 			ol_flags3 = nix_update_match_id(
-				*(uint16_t *)(cq0 + CQE_SZ(3) + 38), ol_flags3,
-				mbuf3);
+				*(uint16_t *)CQE_PTR_OFF(cq0, 3, 38, flags),
+				ol_flags3, mbuf3);
 		}
 
 		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
@@ -488,7 +520,7 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 						  RTE_PTYPE_L2_ETHER_TIMESYNC};
 			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
 						PKT_RX_IEEE1588_TMST |
-						rxq->tstamp->rx_tstamp_dynflag;
+						tstamp->rx_tstamp_dynflag;
 			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
 			uint64x2_t ts01, ts23, mask;
 			uint64_t ts[4];
@@ -526,14 +558,10 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			ts[3] = vgetq_lane_u64(ts23, 1);
 
 			/* Store timestamp into dynfield. */
-			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
-				ts[0];
-			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
-				ts[1];
-			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
-				ts[2];
-			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
-				ts[3];
+			*cnxk_nix_timestamp_dynfield(mbuf0, tstamp) = ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, tstamp) = ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, tstamp) = ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, tstamp) = ts[3];
 
 			/* Generate ptype mask to filter L2 ether timesync */
 			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
@@ -559,9 +587,8 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				/* Update Rxq timestamp with the latest
 				 * timestamp.
 				 */
-				rxq->tstamp->rx_ready = 1;
-				rxq->tstamp->rx_tstamp =
-					ts[31 - __builtin_clz(res)];
+				tstamp->rx_ready = 1;
+				tstamp->rx_tstamp = ts[31 - __builtin_clz(res)];
 			}
 		}
 
@@ -584,25 +611,25 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
 
 		/* Store the mbufs to rx_pkts */
-		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
-		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
+		vst1q_u64((uint64_t *)&mbufs[packets], mbuf01);
+		vst1q_u64((uint64_t *)&mbufs[packets + 2], mbuf23);
 
 		if (flags & NIX_RX_MULTI_SEG_F) {
 			/* Multi segment is enable build mseg list for
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 0, 8, flags)),
+					    mbuf0, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 1, 8, flags)),
+					    mbuf1, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 2, 8, flags)),
+					    mbuf2, mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-						(cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer, flags);
+					    (CQE_PTR_OFF(cq0, 3, 8, flags)),
+					    mbuf3, mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
@@ -623,12 +650,18 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		__mempool_check_cookies(mbuf2->pool, (void **)&mbuf2, 1, 1);
 		__mempool_check_cookies(mbuf3->pool, (void **)&mbuf3, 1, 1);
 
-		/* Advance head pointer and packets */
-		head += NIX_DESCS_PER_LOOP;
-		head &= qmask;
 		packets += NIX_DESCS_PER_LOOP;
+
+		if (!(flags & NIX_RX_VWQE_F)) {
+			/* Advance head pointer and packets */
+			head += NIX_DESCS_PER_LOOP;
+			head &= qmask;
+		}
 	}
 
+	if (flags & NIX_RX_VWQE_F)
+		return packets;
+
 	rxq->head = head;
 	rxq->available -= packets;
 
@@ -637,8 +670,8 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 	plt_write64((rxq->wdata | packets), rxq->cq_door);
 
 	if (unlikely(pkts_left))
-		packets += cn10k_nix_recv_pkts(rx_queue, &rx_pkts[packets],
-					       pkts_left, flags);
+		packets += cn10k_nix_recv_pkts(args, &mbufs[packets], pkts_left,
+					       flags);
 
 	return packets;
 }
@@ -647,12 +680,15 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 static inline uint16_t
 cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
-			   uint16_t pkts, const uint16_t flags)
+			   uint16_t pkts, const uint16_t flags,
+			   void *lookup_mem, void *tstamp)
 {
+	RTE_SET_USED(lookup_mem);
 	RTE_SET_USED(rx_queue);
 	RTE_SET_USED(rx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(flags);
+	RTE_SET_USED(tstamp);
 
 	return 0;
 }
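
A note on the control flow above: the same vector routine now serves both
the ethdev Rx path and the event Rx (vector work queue entry) path,
selected at compile time via `flags`. Under NIX_RX_VWQE_F the mbuf array
comes from the event vector rather than a CQ ring, so the head/doorbell
bookkeeping is skipped entirely. A condensed sketch of the tail of the
burst loop, as reorganised by this patch:

	/* Condensed sketch; `args` is the rxq on the ethdev path and the
	 * event-vector argument on the VWQE path.
	 */
	if (flags & NIX_RX_VWQE_F)
		return packets;            /* no CQ state to update */

	rxq->head = head;                  /* ethdev path: advance CQ ring */
	rxq->available -= packets;
	plt_write64(rxq->wdata | packets, rxq->cq_door);
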
diff --git a/drivers/net/cnxk/cn10k_rx_vec.c b/drivers/net/cnxk/cn10k_rx_vec.c
index 93528a44f9..166735ad59 100644
--- a/drivers/net/cnxk/cn10k_rx_vec.c
+++ b/drivers/net/cnxk/cn10k_rx_vec.c
@@ -12,7 +12,7 @@
 					       uint16_t pkts)                  \
 	{                                                                      \
 		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
-						  (flags));		       \
+						  (flags), NULL, NULL);        \
 	}
 
 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
index 04d1e46c82..1f44dddddd 100644
--- a/drivers/net/cnxk/cn10k_rx_vec_mseg.c
+++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
@@ -9,8 +9,9 @@
 	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
 	{                                                                      \
-		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
-					  (flags) | NIX_RX_MULTI_SEG_F);       \
+		return cn10k_nix_recv_pkts_vector(                             \
+			rx_queue, rx_pkts, pkts, (flags) | NIX_RX_MULTI_SEG_F, \
+			NULL, NULL);                                           \
 	}
 
 NIX_RX_FASTPATH_MODES
-- 
2.17.1



* [dpdk-dev] [PATCH v3 13/13] event/cnxk: add Tx event vector fastpath
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (10 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 12/13] event/cnxk: add Rx event vector fastpath pbhagavatula
@ 2021-06-20 20:29     ` pbhagavatula
  2021-06-27  6:57     ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine Jerin Jacob
                       ` (2 subsequent siblings)
  14 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-20 20:29 UTC (permalink / raw)
  To: jerinj, Pavan Nikhilesh, Shijith Thotton, Nithin Dabilpuram,
	Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add the Tx event vector fastpath and integrate the event vector Tx
routine into the Tx burst path.
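
For context on the fastpath below: the transmit side decodes a 64-bit
metadata word stored at the head of the event vector (see the decode in
cn10k_sso_hws_event_tx()). A minimal sketch of that layout, with
hypothetical field names inferred from the decode:

	#include <stdint.h>

	struct vec_meta {                 /* hypothetical helper view  */
		uint16_t nb_mbufs;        /* meta & 0xFFFF             */
		int      has_txq;         /* meta & BIT(31): all mbufs
					   * share one (port, queue)  */
		uint16_t port;            /* meta >> 32                */
		uint16_t queue;           /* meta >> 48                */
	};

	static inline struct vec_meta decode_vec_meta(uint64_t meta)
	{
		struct vec_meta m = {
			.nb_mbufs = (uint16_t)(meta & 0xFFFF),
			.has_txq = !!(meta & (1ULL << 31)),
			.port = (uint16_t)(meta >> 32),
			.queue = (uint16_t)(meta >> 48),
		};
		return m;
	}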

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/eventdevs/cnxk.rst          |   1 +
 doc/guides/rel_notes/release_21_08.rst |   2 +-
 drivers/common/cnxk/roc_sso.h          |  23 ++++++
 drivers/event/cnxk/cn10k_eventdev.c    |   3 +-
 drivers/event/cnxk/cn10k_worker.h      | 104 +++++++++++++++++++++++--
 drivers/event/cnxk/cn9k_worker.h       |   4 +-
 drivers/event/cnxk/cnxk_worker.h       |  22 ------
 drivers/net/cnxk/cn10k_tx.c            |   2 +-
 drivers/net/cnxk/cn10k_tx.h            |  52 +++++++++----
 drivers/net/cnxk/cn10k_tx_mseg.c       |   3 +-
 drivers/net/cnxk/cn10k_tx_vec.c        |   2 +-
 drivers/net/cnxk/cn10k_tx_vec_mseg.c   |   2 +-
 12 files changed, 167 insertions(+), 53 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index 0297cd3d5f..53560d3830 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -47,6 +47,7 @@ Features of the OCTEON cnxk SSO PMD are:
 - Full Rx/Tx offload support defined through ethdev queue configuration.
 - HW managed event vectorization on CN10K for packets enqueued from ethdev to
   eventdev configurable per each Rx queue in Rx adapter.
+- Event vector transmission via Tx adapter.
 
 Prerequisites and Compilation procedure
 ---------------------------------------
diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index 11ccc9bcb5..9e49cb27d7 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -64,7 +64,7 @@ New Features
 
   * Added Rx/Tx adapter support for event/cnxk when the ethernet device requested
     is net/cnxk.
-  * Add support for event vectorization for Rx adapter.
+  * Add support for event vectorization for Rx/Tx adapter.
 
 
 Removed Items
diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
index a6030e7d8a..316c6ccd59 100644
--- a/drivers/common/cnxk/roc_sso.h
+++ b/drivers/common/cnxk/roc_sso.h
@@ -44,6 +44,29 @@ struct roc_sso {
 	uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
 } __plt_cache_aligned;
 
+static __rte_always_inline void
+roc_sso_hws_head_wait(uintptr_t tag_op)
+{
+#ifdef RTE_ARCH_ARM64
+	uint64_t tag;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldr %[tag], [%[tag_op]]	\n"
+		     "		tbnz %[tag], 35, done%=		\n"
+		     "		sevl				\n"
+		     "rty%=:	wfe				\n"
+		     "		ldr %[tag], [%[tag_op]]	\n"
+		     "		tbz %[tag], 35, rty%=		\n"
+		     "done%=:					\n"
+		     : [tag] "=&r"(tag)
+		     : [tag_op] "r"(tag_op));
+#else
+	/* Wait for the SWTAG/SWTAG_FULL operation */
+	while (!(plt_read64(tag_op) & BIT_ULL(35)))
+		;
+#endif
+}
+
 /* SSO device initialization */
 int __roc_api roc_sso_dev_init(struct roc_sso *roc_sso);
 int __roc_api roc_sso_dev_fini(struct roc_sso *roc_sso);
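
roc_sso_hws_head_wait() polls bit 35 of the word at tag_op (the GWS tag
register) until it reads as set; note the polarity is inverted relative to
cnxk_sso_hws_head_wait(), which is removed later in this patch and waited
for the bit to clear. The arm64 variant parks the core with SEVL/WFE
instead of busy-spinning; the generic branch is the plain read loop shown
above. The callers added in this patch follow one pattern: an ORDERED
worker (sched_type == 0) waits until it reaches the head of its context
before triggering the LMTST, so transmit ordering is preserved:

	if (!ev->sched_type)
		roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
	roc_lmt_submit_steorl(lmt_id, pa);
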
diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index e85fa4785d..6f37c5bd23 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -782,7 +782,8 @@ cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
 	if (ret)
 		*caps = 0;
 	else
-		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
+		*caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT |
+			RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR;
 
 	return 0;
 }
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 7a48a6b17d..9cc0992063 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -308,29 +308,120 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
 NIX_RX_FASTPATH_MODES
 #undef R
 
-static __rte_always_inline const struct cn10k_eth_txq *
+static __rte_always_inline struct cn10k_eth_txq *
 cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
 			  const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
 {
-	return (const struct cn10k_eth_txq *)
+	return (struct cn10k_eth_txq *)
 		txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
 }
 
+static __rte_always_inline void
+cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
+			uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
+			uint8_t sched_type, uintptr_t base,
+			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
+			const uint32_t flags)
+{
+	uint16_t port[4], queue[4];
+	struct cn10k_eth_txq *txq;
+	uint16_t i, j;
+	uintptr_t pa;
+
+	for (i = 0; i < nb_mbufs; i += 4) {
+		port[0] = mbufs[i]->port;
+		port[1] = mbufs[i + 1]->port;
+		port[2] = mbufs[i + 2]->port;
+		port[3] = mbufs[i + 3]->port;
+
+		queue[0] = rte_event_eth_tx_adapter_txq_get(mbufs[i]);
+		queue[1] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 1]);
+		queue[2] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 2]);
+		queue[3] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 3]);
+
+		if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
+		    ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
+
+			for (j = 0; j < 4; j++) {
+				struct rte_mbuf *m = mbufs[i + j];
+
+				txq = (struct cn10k_eth_txq *)
+					txq_data[port[j]][queue[j]];
+				cn10k_nix_tx_skeleton(txq, cmd, flags);
+				/* Perform header writes before barrier
+				 * for TSO
+				 */
+				if (flags & NIX_TX_OFFLOAD_TSO_F)
+					cn10k_nix_xmit_prepare_tso(m, flags);
+
+				cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags,
+						       txq->lso_tun_fmt);
+				if (flags & NIX_TX_MULTI_SEG_F) {
+					const uint16_t segdw =
+						cn10k_nix_prepare_mseg(
+							m, (uint64_t *)lmt_addr,
+							flags);
+					pa = txq->io_addr | ((segdw - 1) << 4);
+				} else {
+					pa = txq->io_addr |
+					     (cn10k_nix_tx_ext_subs(flags) + 1)
+						     << 4;
+				}
+				if (!sched_type)
+					roc_sso_hws_head_wait(base +
+							      SSOW_LF_GWS_TAG);
+
+				roc_lmt_submit_steorl(lmt_id, pa);
+			}
+		} else {
+			txq = (struct cn10k_eth_txq *)
+				txq_data[port[0]][queue[0]];
+			cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd, base
+					+ SSOW_LF_GWS_TAG,
+						   flags | NIX_TX_VWQE_F);
+		}
+	}
+}
+
 static __rte_always_inline uint16_t
 cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		       uint64_t *cmd,
 		       const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
 		       const uint32_t flags)
 {
-	const struct cn10k_eth_txq *txq;
-	struct rte_mbuf *m = ev->mbuf;
-	uint16_t ref_cnt = m->refcnt;
+	struct cn10k_eth_txq *txq;
+	struct rte_mbuf *m;
 	uintptr_t lmt_addr;
+	uint16_t ref_cnt;
 	uint16_t lmt_id;
 	uintptr_t pa;
 
 	lmt_addr = ws->lmt_base;
 	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
+
+	if (ev->event_type & RTE_EVENT_TYPE_VECTOR) {
+		struct rte_mbuf **mbufs = ev->vec->mbufs;
+		uint64_t meta = *(uint64_t *)ev->vec;
+
+		if (meta & BIT(31)) {
+			txq = (struct cn10k_eth_txq *)
+				txq_data[meta >> 32][meta >> 48];
+
+			cn10k_nix_xmit_pkts_vector(
+				txq, mbufs, meta & 0xFFFF, cmd,
+				ws->tx_base + SSOW_LF_GWS_TAG,
+				flags | NIX_TX_VWQE_F);
+		} else {
+			cn10k_sso_vwqe_split_tx(
+				mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
+				ev->sched_type, ws->tx_base, txq_data, flags);
+		}
+		rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
+		return (meta & 0xFFFF);
+	}
+
+	m = ev->mbuf;
+	ref_cnt = m->refcnt;
 	txq = cn10k_sso_hws_xtract_meta(m, txq_data);
 	cn10k_nix_tx_skeleton(txq, cmd, flags);
 	/* Perform header writes before barrier for TSO */
@@ -346,7 +437,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 		pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
 	}
 	if (!ev->sched_type)
-		cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
+		roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
 
 	roc_lmt_submit_steorl(lmt_id, pa);
 
@@ -357,7 +448,6 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
 
 	cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
 				 ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
-
 	return 1;
 }
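
cn10k_sso_vwqe_split_tx() above walks the vector four mbufs at a time:
when a quad resolves to a single (port, queue) pair it is handed to
cn10k_nix_xmit_pkts_vector() in one call, otherwise each mbuf is submitted
individually through the scalar LMTST path. The patch uses a cheap XOR/AND
test that forces the scalar path only when both halves of the quad differ;
for comparison, a strict all-equal check would look like this hypothetical
helper:

	static inline int
	quad_same_txq(const uint16_t port[4], const uint16_t queue[4])
	{
		return port[0] == port[1] && port[1] == port[2] &&
		       port[2] == port[3] && queue[0] == queue[1] &&
		       queue[1] == queue[2] && queue[2] == queue[3];
	}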
 
diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 5aa053c586..ef1e83741a 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -458,7 +458,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
-			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
 						       txq->io_addr, segdw);
@@ -469,7 +469,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	} else {
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
 			cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
-			cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
+			roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
 			if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
 				cn9k_nix_xmit_one(cmd, txq->lmt_addr,
 						  txq->io_addr, flags);
diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
index 4eb46ae162..945132b748 100644
--- a/drivers/event/cnxk/cnxk_worker.h
+++ b/drivers/event/cnxk/cnxk_worker.h
@@ -75,27 +75,5 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
 #endif
 }
 
-static __rte_always_inline void
-cnxk_sso_hws_head_wait(uintptr_t tag_op)
-{
-#ifdef RTE_ARCH_ARM64
-	uint64_t swtp;
-
-	asm volatile(PLT_CPU_FEATURE_PREAMBLE
-		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbz %[swtb], 35, done%=		\n"
-		     "		sevl				\n"
-		     "rty%=:	wfe				\n"
-		     "		ldr %[swtb], [%[swtp_loc]]	\n"
-		     "		tbnz %[swtb], 35, rty%=		\n"
-		     "done%=:					\n"
-		     : [swtb] "=&r"(swtp)
-		     : [swtp_loc] "r"(tag_op));
-#else
-	/* Wait for the SWTAG/SWTAG_FULL operation */
-	while (plt_read64(tag_op) & BIT_ULL(35))
-		;
-#endif
-}
 
 #endif
diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 1f30bab59a..0e1276c60b 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -16,7 +16,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd,       \
-					   flags);			       \
+					   0, flags);			       \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 532b53b319..d2a24120ef 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -18,6 +18,7 @@
  * Defined from the end (backwards) to denote that it is
  * not used as an offload flag when picking the Tx function
  */
+#define NIX_TX_VWQE_F	   BIT(14)
 #define NIX_TX_MULTI_SEG_F BIT(15)
 
 #define NIX_TX_NEED_SEND_HDR_W1                                                \
@@ -519,7 +520,7 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
-		    uint64_t *cmd, const uint16_t flags)
+		    uint64_t *cmd, uintptr_t base, const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	const rte_iova_t io_addr = txq->io_addr;
@@ -528,14 +529,15 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 	uint64_t lso_tun_fmt;
 	uint64_t data;
 
-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	}
 
 	/* Get cmd skeleton */
 	cn10k_nix_tx_skeleton(txq, cmd, flags);
 
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
-
 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		lso_tun_fmt = txq->lso_tun_fmt;
 
@@ -558,6 +560,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
 	}
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	/* Trigger LMTST */
 	if (burst > 16) {
 		data = cn10k_nix_tx_steor_data(flags);
@@ -604,7 +609,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
-			 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			 uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			 const uint16_t flags)
 {
 	struct cn10k_eth_txq *txq = tx_queue;
 	uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
@@ -652,6 +658,9 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 		shft += 3;
 	}
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	data0 = (uint64_t)data128;
 	data1 = (uint64_t)(data128 >> 64);
 	/* Make data0 similar to data1 */
@@ -984,7 +993,8 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
 
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			   const uint16_t flags)
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
@@ -1013,13 +1023,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint64_t data[2];
 	} wd;
 
-	NIX_XMIT_FC_OR_RETURN(txq, pkts);
-
-	scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
-	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	if (!(flags & NIX_TX_VWQE_F)) {
+		NIX_XMIT_FC_OR_RETURN(txq, pkts);
+		scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+		/* Reduce the cached count */
+		txq->fc_cache_pkts -= pkts;
+	} else {
+		scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
+		pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
+	}
 
-	/* Reduce the cached count */
-	txq->fc_cache_pkts -= pkts;
 	/* Perform header writes before barrier for TSO */
 	if (flags & NIX_TX_OFFLOAD_TSO_F) {
 		for (i = 0; i < pkts; i++)
@@ -1972,6 +1986,9 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (flags & NIX_TX_MULTI_SEG_F)
 		wd.data[0] >>= 16;
 
+	if (flags & NIX_TX_VWQE_F)
+		roc_sso_hws_head_wait(base);
+
 	/* Trigger LMTST */
 	if (lnum > 16) {
 		if (!(flags & NIX_TX_MULTI_SEG_F))
@@ -2028,10 +2045,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	if (unlikely(scalar)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
 			pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
-							 scalar, cmd, flags);
+							 scalar, cmd, base,
+							 flags);
 		else
 			pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
-						    cmd, flags);
+						    cmd, base, flags);
 	}
 
 	return pkts;
@@ -2040,13 +2058,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 #else
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
-			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
+			   uint16_t pkts, uint64_t *cmd, uintptr_t base,
+			   const uint16_t flags)
 {
 	RTE_SET_USED(tx_queue);
 	RTE_SET_USED(tx_pkts);
 	RTE_SET_USED(pkts);
 	RTE_SET_USED(cmd);
 	RTE_SET_USED(flags);
+	RTE_SET_USED(base);
 	return 0;
 }
 #endif
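
The NIX_TX_VWQE_F handling repeats across cn10k_nix_xmit_pkts(), _mseg()
and _vector() above with the same two-part pattern: on the event path the
SQ flow-control check and fc_cache accounting are skipped, and the head
wait is issued immediately before the LMTST trigger. Condensed sketch of
the pattern added in the hunks above:

	if (!(flags & NIX_TX_VWQE_F)) {
		NIX_XMIT_FC_OR_RETURN(txq, pkts); /* ethdev path only */
		txq->fc_cache_pkts -= pkts;
	}

	/* ... build LMT lines for the burst ... */

	if (flags & NIX_TX_VWQE_F)
		roc_sso_hws_head_wait(base); /* keep ordered flows in order */

	/* ... trigger LMTST as before ... */
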
diff --git a/drivers/net/cnxk/cn10k_tx_mseg.c b/drivers/net/cnxk/cn10k_tx_mseg.c
index 33f6754722..4ea4c8a4e5 100644
--- a/drivers/net/cnxk/cn10k_tx_mseg.c
+++ b/drivers/net/cnxk/cn10k_tx_mseg.c
@@ -18,7 +18,8 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,  \
-						(flags) | NIX_TX_MULTI_SEG_F); \
+						0, (flags)		       \
+							| NIX_TX_MULTI_SEG_F); \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 34e3737501..a0350496ab 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -18,7 +18,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
-						  (flags));                    \
+						  0, (flags));                 \
 	}
 
 NIX_TX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
index 1fad81dbad..7f98f79b97 100644
--- a/drivers/net/cnxk/cn10k_tx_vec_mseg.c
+++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
@@ -16,7 +16,7 @@
 		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(                             \
-			tx_queue, tx_pkts, pkts, cmd,                          \
+			tx_queue, tx_pkts, pkts, cmd, 0,                       \
 			(flags) | NIX_TX_MULTI_SEG_F);                         \
 	}
 
-- 
2.17.1



* Re: [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (11 preceding siblings ...)
  2021-06-20 20:29     ` [dpdk-dev] [PATCH v3 13/13] event/cnxk: add Tx " pbhagavatula
@ 2021-06-27  6:57     ` Jerin Jacob
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
  2021-06-28 19:52     ` [dpdk-dev] [PATCH v4 1/7] event/cnxk: add Rx adapter support pbhagavatula
  14 siblings, 0 replies; 93+ messages in thread
From: Jerin Jacob @ 2021-06-27  6:57 UTC (permalink / raw)
  To: Pavan Nikhilesh
  Cc: Jerin Jacob, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, dpdk-dev

On Mon, Jun 21, 2021 at 1:59 AM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add a multi-segment Rx vector routine: form the primary mbufs using the
> vector path and switch to the scalar path when extracting segments.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  Depends-on: http://patches.dpdk.org/project/dpdk/list/?series=17394


Now that the net/cnxk related changes are merged to dpdk-next-net-mrvl/for-next-net,
could you rebase and send a separate series based on
dpdk-next-net-mrvl/for-next-net for the net/cnxk related changes?


>
>  v3 Changes:
>  - Spell check.
>
>  drivers/net/cnxk/cn10k_rx.c          | 31 +++++++++++------
>  drivers/net/cnxk/cn10k_rx.h          | 51 +++++++++++++++++++++-------
>  drivers/net/cnxk/cn10k_rx_vec_mseg.c | 17 ++++++++++
>  drivers/net/cnxk/cn9k_rx.c           | 31 +++++++++++------
>  drivers/net/cnxk/cn9k_rx.h           | 51 +++++++++++++++++++++-------
>  drivers/net/cnxk/cn9k_rx_vec_mseg.c  | 18 ++++++++++
>  drivers/net/cnxk/meson.build         |  2 ++
>  7 files changed, 157 insertions(+), 44 deletions(-)
>  create mode 100644 drivers/net/cnxk/cn10k_rx_vec_mseg.c
>  create mode 100644 drivers/net/cnxk/cn9k_rx_vec_mseg.c
>
> diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
> index 5c956c06b4..3a9fd71309 100644
> --- a/drivers/net/cnxk/cn10k_rx.c
> +++ b/drivers/net/cnxk/cn10k_rx.c
> @@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
> +
> +       rte_atomic_thread_fence(__ATOMIC_RELEASE);
>  }
>
>  void
> @@ -60,20 +62,29 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
>  #undef R
>         };
>
> -       /* For PTP enabled, scalar rx function should be chosen as most of the
> -        * PTP apps are implemented to rx burst 1 pkt.
> -        */
> -       if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
> -               pick_rx_func(eth_dev, nix_eth_rx_burst);
> -       else
> -               pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
> +       const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
> +#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
> +       [f5][f4][f3][f2][f1][f0] = cn10k_nix_recv_pkts_vec_mseg_##name,
>
> -       if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> -               pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
> +               NIX_RX_FASTPATH_MODES
> +#undef R
> +       };
>
>         /* Copy multi seg version with no offload for tear down sequence */
>         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
>                 dev->rx_pkt_burst_no_offload =
>                         nix_eth_rx_burst_mseg[0][0][0][0][0][0];
> -       rte_mb();
> +
> +       /* For PTP enabled, scalar rx function should be chosen as most of the
> +        * PTP apps are implemented to rx burst 1 pkt.
> +        */
> +       if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
> +               if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> +                       return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
> +               return pick_rx_func(eth_dev, nix_eth_rx_burst);
> +       }
> +
> +       if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> +               return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
> +       return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
>  }
> diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
> index 1cc37cbaa0..5926ff7f46 100644
> --- a/drivers/net/cnxk/cn10k_rx.h
> +++ b/drivers/net/cnxk/cn10k_rx.h
> @@ -119,8 +119,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
>
>         sg = *(const uint64_t *)(rx + 1);
>         nb_segs = (sg >> 48) & 0x3;
> -       mbuf->nb_segs = nb_segs;
> +
> +       if (nb_segs == 1) {
> +               mbuf->next = NULL;
> +               return;
> +       }
> +
> +       mbuf->pkt_len = rx->pkt_lenm1 + 1;
>         mbuf->data_len = sg & 0xFFFF;
> +       mbuf->nb_segs = nb_segs;
>         sg = sg >> 16;
>
>         eol = ((const rte_iova_t *)(rx + 1) + ((rx->desc_sizem1 + 1) << 1));
> @@ -195,15 +202,14 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
>                 ol_flags = nix_update_match_id(rx->match_id, ol_flags, mbuf);
>
>         mbuf->ol_flags = ol_flags;
> -       *(uint64_t *)(&mbuf->rearm_data) = val;
>         mbuf->pkt_len = len;
> +       mbuf->data_len = len;
> +       *(uint64_t *)(&mbuf->rearm_data) = val;
>
> -       if (flag & NIX_RX_MULTI_SEG_F) {
> +       if (flag & NIX_RX_MULTI_SEG_F)
>                 nix_cqe_xtract_mseg(rx, mbuf, val);
> -       } else {
> -               mbuf->data_len = len;
> +       else
>                 mbuf->next = NULL;
> -       }
>  }
>
>  static inline uint16_t
> @@ -481,16 +487,34 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
>                 vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
>                 vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
>
> -               /* Update that no more segments */
> -               mbuf0->next = NULL;
> -               mbuf1->next = NULL;
> -               mbuf2->next = NULL;
> -               mbuf3->next = NULL;
> -
>                 /* Store the mbufs to rx_pkts */
>                 vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
>                 vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
>
> +               if (flags & NIX_RX_MULTI_SEG_F) {
> +                       /* Multi segment is enabled; build mseg lists for
> +                        * individual mbufs in scalar mode.
> +                        */
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(0) + 8), mbuf0,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(1) + 8), mbuf1,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(2) + 8), mbuf2,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(3) + 8), mbuf3,
> +                                           mbuf_initializer);
> +               } else {
> +                       /* Update that no more segments */
> +                       mbuf0->next = NULL;
> +                       mbuf1->next = NULL;
> +                       mbuf2->next = NULL;
> +                       mbuf3->next = NULL;
> +               }
> +
>                 /* Prefetch mbufs */
>                 roc_prefetch_store_keep(mbuf0);
>                 roc_prefetch_store_keep(mbuf1);
> @@ -645,6 +669,9 @@ R(vlan_ts_mark_cksum_ptype_rss,     1, 1, 1, 1, 1, 1,                              \
>                 void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
>                                                                                \
>         uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_##name(      \
> +               void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
> +                                                                              \
> +       uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
>                 void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);
>
>  NIX_RX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
> new file mode 100644
> index 0000000000..04d1e46c82
> --- /dev/null
> +++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2021 Marvell.
> + */
> +
> +#include "cn10k_ethdev.h"
> +#include "cn10k_rx.h"
> +
> +#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
> +       uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
> +               void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
> +       {                                                                      \
> +               return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
> +                                         (flags) | NIX_RX_MULTI_SEG_F);       \
> +       }
> +
> +NIX_RX_FASTPATH_MODES
> +#undef R
> diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
> index 0acedd0a1f..d293d4eac3 100644
> --- a/drivers/net/cnxk/cn9k_rx.c
> +++ b/drivers/net/cnxk/cn9k_rx.c
> @@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
>                 [!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
> +
> +       rte_atomic_thread_fence(__ATOMIC_RELEASE);
>  }
>
>  void
> @@ -60,20 +62,29 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
>  #undef R
>         };
>
> -       /* For PTP enabled, scalar rx function should be chosen as most of the
> -        * PTP apps are implemented to rx burst 1 pkt.
> -        */
> -       if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
> -               pick_rx_func(eth_dev, nix_eth_rx_burst);
> -       else
> -               pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
> +       const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
> +#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
> +       [f5][f4][f3][f2][f1][f0] = cn9k_nix_recv_pkts_vec_mseg_##name,
>
> -       if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> -               pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
> +               NIX_RX_FASTPATH_MODES
> +#undef R
> +       };
>
>         /* Copy multi seg version with no offload for tear down sequence */
>         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
>                 dev->rx_pkt_burst_no_offload =
>                         nix_eth_rx_burst_mseg[0][0][0][0][0][0];
> -       rte_mb();
> +
> +       /* For PTP enabled, scalar rx function should be chosen as most of the
> +        * PTP apps are implemented to rx burst 1 pkt.
> +        */
> +       if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
> +               if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> +                       return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
> +               return pick_rx_func(eth_dev, nix_eth_rx_burst);
> +       }
> +
> +       if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
> +               return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
> +       return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
>  }
> diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
> index 10ef5c6905..5ae9e8195c 100644
> --- a/drivers/net/cnxk/cn9k_rx.h
> +++ b/drivers/net/cnxk/cn9k_rx.h
> @@ -120,8 +120,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
>
>         sg = *(const uint64_t *)(rx + 1);
>         nb_segs = (sg >> 48) & 0x3;
> -       mbuf->nb_segs = nb_segs;
> +
> +       if (nb_segs == 1) {
> +               mbuf->next = NULL;
> +               return;
> +       }
> +
> +       mbuf->pkt_len = rx->pkt_lenm1 + 1;
>         mbuf->data_len = sg & 0xFFFF;
> +       mbuf->nb_segs = nb_segs;
>         sg = sg >> 16;
>
>         eol = ((const rte_iova_t *)(rx + 1) +
> @@ -198,15 +205,14 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
>                         nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);
>
>         mbuf->ol_flags = ol_flags;
> -       *(uint64_t *)(&mbuf->rearm_data) = val;
>         mbuf->pkt_len = len;
> +       mbuf->data_len = len;
> +       *(uint64_t *)(&mbuf->rearm_data) = val;
>
> -       if (flag & NIX_RX_MULTI_SEG_F) {
> +       if (flag & NIX_RX_MULTI_SEG_F)
>                 nix_cqe_xtract_mseg(rx, mbuf, val);
> -       } else {
> -               mbuf->data_len = len;
> +       else
>                 mbuf->next = NULL;
> -       }
>  }
>
>  static inline uint16_t
> @@ -484,16 +490,34 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
>                 vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
>                 vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);
>
> -               /* Update that no more segments */
> -               mbuf0->next = NULL;
> -               mbuf1->next = NULL;
> -               mbuf2->next = NULL;
> -               mbuf3->next = NULL;
> -
>                 /* Store the mbufs to rx_pkts */
>                 vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
>                 vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);
>
> +               if (flags & NIX_RX_MULTI_SEG_F) {
> +                       /* Multi segment is enabled; build mseg lists for
> +                        * individual mbufs in scalar mode.
> +                        */
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(0) + 8), mbuf0,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(1) + 8), mbuf1,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(2) + 8), mbuf2,
> +                                           mbuf_initializer);
> +                       nix_cqe_xtract_mseg((union nix_rx_parse_u *)
> +                                           (cq0 + CQE_SZ(3) + 8), mbuf3,
> +                                           mbuf_initializer);
> +               } else {
> +                       /* Update that no more segments */
> +                       mbuf0->next = NULL;
> +                       mbuf1->next = NULL;
> +                       mbuf2->next = NULL;
> +                       mbuf3->next = NULL;
> +               }
> +
>                 /* Prefetch mbufs */
>                 roc_prefetch_store_keep(mbuf0);
>                 roc_prefetch_store_keep(mbuf1);
> @@ -647,6 +671,9 @@ R(vlan_ts_mark_cksum_ptype_rss,     1, 1, 1, 1, 1, 1,                              \
>                 void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
>                                                                                \
>         uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
> +               void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
> +                                                                              \
> +       uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
>                 void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);
>
>  NIX_RX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn9k_rx_vec_mseg.c b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
> new file mode 100644
> index 0000000000..e46d8a4749
> --- /dev/null
> +++ b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
> @@ -0,0 +1,18 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2021 Marvell.
> + */
> +
> +#include "cn9k_ethdev.h"
> +#include "cn9k_rx.h"
> +
> +#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
> +       uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
> +               void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
> +       {                                                                      \
> +               return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
> +                                                (flags) |                     \
> +                                                        NIX_RX_MULTI_SEG_F);  \
> +       }
> +
> +NIX_RX_FASTPATH_MODES
> +#undef R
> diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
> index 2071d0dcb2..aa8c7253fb 100644
> --- a/drivers/net/cnxk/meson.build
> +++ b/drivers/net/cnxk/meson.build
> @@ -23,6 +23,7 @@ sources += files('cn9k_ethdev.c',
>                  'cn9k_rx.c',
>                  'cn9k_rx_mseg.c',
>                  'cn9k_rx_vec.c',
> +                'cn9k_rx_vec_mseg.c',
>                  'cn9k_tx.c',
>                  'cn9k_tx_mseg.c',
>                  'cn9k_tx_vec.c')
> @@ -32,6 +33,7 @@ sources += files('cn10k_ethdev.c',
>                  'cn10k_rx.c',
>                  'cn10k_rx_mseg.c',
>                  'cn10k_rx_vec.c',
> +                'cn10k_rx_vec_mseg.c',
>                  'cn10k_tx.c',
>                  'cn10k_tx_mseg.c',
>                  'cn10k_tx_vec.c')
> --
> 2.17.1
>


* [dpdk-dev] [PATCH v4 1/6] net/cnxk: add multi seg Rx vector routine
  2021-06-20 20:28   ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine pbhagavatula
                       ` (12 preceding siblings ...)
  2021-06-27  6:57     ` [dpdk-dev] [PATCH v3 01/13] net/cnxk: add multi seg Rx vector routine Jerin Jacob
@ 2021-06-28 19:41     ` pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 2/6] net/cnxk: enable ptp processing in vector Rx pbhagavatula
                         ` (5 more replies)
  2021-06-28 19:52     ` [dpdk-dev] [PATCH v4 1/7] event/cnxk: add Rx adapter support pbhagavatula
  14 siblings, 6 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add a multi-segment Rx vector routine: form the primary mbufs using the
vector path and switch to the scalar path when extracting segments.
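
The core of the change is in nix_cqe_xtract_mseg(): the vector loop builds
the primary mbufs, then calls this scalar helper per mbuf, and the helper
now returns early in the common single-segment case. In outline, taken
from the hunks below:

	uint64_t sg = *(const uint64_t *)(rx + 1);
	uint16_t nb_segs = (sg >> 48) & 0x3;

	if (nb_segs == 1) {
		mbuf->next = NULL;   /* single segment: nothing to chain */
		return;
	}

	mbuf->pkt_len = rx->pkt_lenm1 + 1; /* total length from parse word */
	mbuf->data_len = sg & 0xFFFF;      /* first-segment length */
	mbuf->nb_segs = nb_segs;
	/* ... then walk the iova list to chain the remaining segments. */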

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v4 Changes:
 - Split patches for easier merge.
 - Rebase on dpdk-next-net-mrvl.
 v3 Changes:
 - Spell check.

 drivers/net/cnxk/cn10k_rx.c          | 31 +++++++++++------
 drivers/net/cnxk/cn10k_rx.h          | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn10k_rx_vec_mseg.c | 17 ++++++++++
 drivers/net/cnxk/cn9k_rx.c           | 31 +++++++++++------
 drivers/net/cnxk/cn9k_rx.h           | 51 +++++++++++++++++++++-------
 drivers/net/cnxk/cn9k_rx_vec_mseg.c  | 18 ++++++++++
 drivers/net/cnxk/meson.build         |  2 ++
 7 files changed, 157 insertions(+), 44 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_rx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_rx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 5c956c06b..3a9fd7130 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 1cc37cbaa..5926ff7f4 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -119,8 +119,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) + ((rx->desc_sizem1 + 1) << 1));
@@ -195,15 +202,14 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		ol_flags = nix_update_match_id(rx->match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -481,16 +487,34 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled; build mseg lists for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -645,6 +669,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_##name(      \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn10k_rx_vec_mseg.c b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
new file mode 100644
index 000000000..04d1e46c8
--- /dev/null
+++ b/drivers/net/cnxk/cn10k_rx_vec_mseg.c
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn10k_ethdev.h"
+#include "cn10k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn10k_nix_recv_pkts_vec_mseg_##name( \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
+					  (flags) | NIX_RX_MULTI_SEG_F);       \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
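
The new _vec_mseg files follow the driver's existing code-generation
convention: NIX_RX_FASTPATH_MODES expands R() once per Rx offload-flag
combination, emitting one specialized burst function each, and
cn*_eth_set_rx_function() indexes the resulting table by the same flag
bits. A self-contained toy version of the pattern (all names
hypothetical):

	#include <stdint.h>

	/* Toy version of the R()-macro pattern (names hypothetical). */
	#define TOY_MODES                                                      \
		R(plain, 0)                                                    \
		R(cksum, 1)

	static uint16_t recv_generic(uint16_t n, int flags)
	{
		return flags ? n : 0; /* stand-in for the real burst body */
	}

	#define R(name, f0)                                                    \
		static uint16_t recv_##name(uint16_t n)                        \
		{                                                              \
			return recv_generic(n, f0);                            \
		}
	TOY_MODES
	#undef R

	/* Table indexed by the same flag bit, mirroring the driver's
	 * [2][2][2][2][2][2] tables.
	 */
	static uint16_t (*const recv_fns[2])(uint16_t) = {
	#define R(name, f0) [f0] = recv_##name,
		TOY_MODES
	#undef R
	};
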
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index 0acedd0a1..d293d4eac 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -29,6 +29,8 @@ pick_rx_func(struct rte_eth_dev *eth_dev,
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_CHECKSUM_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F)]
 		[!!(dev->rx_offload_flags & NIX_RX_OFFLOAD_RSS_F)];
+
+	rte_atomic_thread_fence(__ATOMIC_RELEASE);
 }

 void
@@ -60,20 +62,29 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 #undef R
 	};

-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP)
-		pick_rx_func(eth_dev, nix_eth_rx_burst);
-	else
-		pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
+	const eth_rx_burst_t nix_eth_rx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	[f5][f4][f3][f2][f1][f0] = cn9k_nix_recv_pkts_vec_mseg_##name,

-	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
-		pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		NIX_RX_FASTPATH_MODES
+#undef R
+	};

 	/* Copy multi seg version with no offload for tear down sequence */
 	if (rte_eal_process_type() == RTE_PROC_PRIMARY)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
-	rte_mb();
+
+	/* For PTP enabled, scalar rx function should be chosen as most of the
+	 * PTP apps are implemented to rx burst 1 pkt.
+	 */
+	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
+		return pick_rx_func(eth_dev, nix_eth_rx_burst);
+	}
+
+	if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
+		return pick_rx_func(eth_dev, nix_eth_rx_vec_burst_mseg);
+	return pick_rx_func(eth_dev, nix_eth_rx_vec_burst);
 }
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 10ef5c690..5ae9e8195 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -120,8 +120,15 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,

 	sg = *(const uint64_t *)(rx + 1);
 	nb_segs = (sg >> 48) & 0x3;
-	mbuf->nb_segs = nb_segs;
+
+	if (nb_segs == 1) {
+		mbuf->next = NULL;
+		return;
+	}
+
+	mbuf->pkt_len = rx->pkt_lenm1 + 1;
 	mbuf->data_len = sg & 0xFFFF;
+	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;

 	eol = ((const rte_iova_t *)(rx + 1) +
@@ -198,15 +205,14 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 			nix_update_match_id(rx->cn9k.match_id, ol_flags, mbuf);

 	mbuf->ol_flags = ol_flags;
-	*(uint64_t *)(&mbuf->rearm_data) = val;
 	mbuf->pkt_len = len;
+	mbuf->data_len = len;
+	*(uint64_t *)(&mbuf->rearm_data) = val;

-	if (flag & NIX_RX_MULTI_SEG_F) {
+	if (flag & NIX_RX_MULTI_SEG_F)
 		nix_cqe_xtract_mseg(rx, mbuf, val);
-	} else {
-		mbuf->data_len = len;
+	else
 		mbuf->next = NULL;
-	}
 }

 static inline uint16_t
@@ -484,16 +490,34 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 		vst1q_u64((uint64_t *)mbuf2->rearm_data, rearm2);
 		vst1q_u64((uint64_t *)mbuf3->rearm_data, rearm3);

-		/* Update that no more segments */
-		mbuf0->next = NULL;
-		mbuf1->next = NULL;
-		mbuf2->next = NULL;
-		mbuf3->next = NULL;
-
 		/* Store the mbufs to rx_pkts */
 		vst1q_u64((uint64_t *)&rx_pkts[packets], mbuf01);
 		vst1q_u64((uint64_t *)&rx_pkts[packets + 2], mbuf23);

+		if (flags & NIX_RX_MULTI_SEG_F) {
+			/* Multi segment is enabled; build mseg lists for
+			 * individual mbufs in scalar mode.
+			 */
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer);
+			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
+					    (cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer);
+		} else {
+			/* Update that no more segments */
+			mbuf0->next = NULL;
+			mbuf1->next = NULL;
+			mbuf2->next = NULL;
+			mbuf3->next = NULL;
+		}
+
 		/* Prefetch mbufs */
 		roc_prefetch_store_keep(mbuf0);
 		roc_prefetch_store_keep(mbuf1);
@@ -647,6 +671,9 @@ R(vlan_ts_mark_cksum_ptype_rss,	1, 1, 1, 1, 1, 1,			       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
 									       \
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);     \
+									       \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts);

 NIX_RX_FASTPATH_MODES
diff --git a/drivers/net/cnxk/cn9k_rx_vec_mseg.c b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
new file mode 100644
index 000000000..e46d8a474
--- /dev/null
+++ b/drivers/net/cnxk/cn9k_rx_vec_mseg.c
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 Marvell.
+ */
+
+#include "cn9k_ethdev.h"
+#include "cn9k_rx.h"
+
+#define R(name, f5, f4, f3, f2, f1, f0, flags)                                 \
+	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_mseg_##name(  \
+		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
+	{                                                                      \
+		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
+						 (flags) |                     \
+							 NIX_RX_MULTI_SEG_F);  \
+	}
+
+NIX_RX_FASTPATH_MODES
+#undef R
diff --git a/drivers/net/cnxk/meson.build b/drivers/net/cnxk/meson.build
index 2071d0dcb..aa8c7253f 100644
--- a/drivers/net/cnxk/meson.build
+++ b/drivers/net/cnxk/meson.build
@@ -23,6 +23,7 @@ sources += files('cn9k_ethdev.c',
 		 'cn9k_rx.c',
 		 'cn9k_rx_mseg.c',
 		 'cn9k_rx_vec.c',
+		 'cn9k_rx_vec_mseg.c',
 		 'cn9k_tx.c',
 		 'cn9k_tx_mseg.c',
 		 'cn9k_tx_vec.c')
@@ -32,6 +33,7 @@ sources += files('cn10k_ethdev.c',
 		 'cn10k_rx.c',
 		 'cn10k_rx_mseg.c',
 		 'cn10k_rx_vec.c',
+		 'cn10k_rx_vec_mseg.c',
 		 'cn10k_tx.c',
 		 'cn10k_tx_mseg.c',
 		 'cn10k_tx_vec.c')
--
2.17.1



* [dpdk-dev] [PATCH v4 2/6] net/cnxk: enable ptp processing in vector Rx
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
@ 2021-06-28 19:41       ` pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 3/6] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
                         ` (4 subsequent siblings)
  5 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Rx burst function: use the vector path
for processing mbufs and switch to the scalar path only when extracting
the timestamp.
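
Per mbuf, the vector path now performs the equivalent of the following
scalar steps (a hypothetical helper sketched against the driver types used
in the hunks below):

	#include <rte_byteorder.h>

	static inline void
	fill_rx_tstamp(struct rte_mbuf *m, struct cnxk_timesync_info *tstamp,
		       uint64_t data_off)
	{
		/* HW prepends an 8-byte big-endian timestamp to the data. */
		const uint64_t *ts_ptr =
			(const uint64_t *)((uintptr_t)m + data_off);
		uint64_t ts = rte_be_to_cpu_64(*ts_ptr);

		*cnxk_nix_timestamp_dynfield(m, tstamp) = ts;
		/* In the vector code, pkt_len/data_len are reduced by
		 * CNXK_NIX_TIMESYNC_RX_OFFSET via a per-lane subtract, and
		 * ol_flags gain the IEEE1588 bits only for packets whose
		 * ptype matches L2 ether timesync.
		 */
	}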

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_ethdev.c |   1 -
 drivers/net/cnxk/cn10k_rx.c     |   5 +-
 drivers/net/cnxk/cn10k_rx.h     | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_rx_vec.c |   3 -
 drivers/net/cnxk/cn9k_ethdev.c  |   1 -
 drivers/net/cnxk/cn9k_rx.c      |   5 +-
 drivers/net/cnxk/cn9k_rx.h      | 124 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_rx_vec.c  |   3 -
 drivers/net/cnxk/cnxk_ethdev.h  |  19 ++---
 9 files changed, 232 insertions(+), 53 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_ethdev.c b/drivers/net/cnxk/cn10k_ethdev.c
index b079edbd3..7caec6cf1 100644
--- a/drivers/net/cnxk/cn10k_ethdev.c
+++ b/drivers/net/cnxk/cn10k_ethdev.c
@@ -301,7 +301,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn10k_rx.c b/drivers/net/cnxk/cn10k_rx.c
index 3a9fd7130..69e767ac3 100644
--- a/drivers/net/cnxk/cn10k_rx.c
+++ b/drivers/net/cnxk/cn10k_rx.c
@@ -75,10 +75,7 @@ cn10k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 5926ff7f4..d9572b19e 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -109,7 +109,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -125,8 +125,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -207,7 +209,7 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -272,8 +274,9 @@ cn10k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				      flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -469,6 +472,99 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15*/
+				0,			     /* pktlen 16:32 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from be to cpu byteorder. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert the vector mask to a scalar mask */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -496,17 +592,17 @@ cn10k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn10k_rx_vec.c b/drivers/net/cnxk/cn10k_rx_vec.c
index 65ffa9784..93528a44f 100644
--- a/drivers/net/cnxk/cn10k_rx_vec.c
+++ b/drivers/net/cnxk/cn10k_rx_vec.c
@@ -11,9 +11,6 @@
 					       struct rte_mbuf **rx_pkts,      \
 					       uint16_t pkts)                  \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn10k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,     \
 						  (flags));		       \
 	}
diff --git a/drivers/net/cnxk/cn9k_ethdev.c b/drivers/net/cnxk/cn9k_ethdev.c
index 994fdb7c3..115e67891 100644
--- a/drivers/net/cnxk/cn9k_ethdev.c
+++ b/drivers/net/cnxk/cn9k_ethdev.c
@@ -309,7 +309,6 @@ nix_ptp_enable_vf(struct rte_eth_dev *eth_dev)
 	if (nix_recalc_mtu(eth_dev))
 		plt_err("Failed to set MTU size for ptp");
 
-	dev->scalar_ena = true;
 	dev->rx_offload_flags |= NIX_RX_OFFLOAD_TSTAMP_F;
 
 	/* Setting up the function pointers as per new offload flags */
diff --git a/drivers/net/cnxk/cn9k_rx.c b/drivers/net/cnxk/cn9k_rx.c
index d293d4eac..7d9f1bd61 100644
--- a/drivers/net/cnxk/cn9k_rx.c
+++ b/drivers/net/cnxk/cn9k_rx.c
@@ -75,10 +75,7 @@ cn9k_eth_set_rx_function(struct rte_eth_dev *eth_dev)
 		dev->rx_pkt_burst_no_offload =
 			nix_eth_rx_burst_mseg[0][0][0][0][0][0];
 
-	/* For PTP enabled, scalar rx function should be chosen as most of the
-	 * PTP apps are implemented to rx burst 1 pkt.
-	 */
-	if (dev->scalar_ena || dev->rx_offloads & DEV_RX_OFFLOAD_TIMESTAMP) {
+	if (dev->scalar_ena) {
 		if (dev->rx_offloads & DEV_RX_OFFLOAD_SCATTER)
 			return pick_rx_func(eth_dev, nix_eth_rx_burst_mseg);
 		return pick_rx_func(eth_dev, nix_eth_rx_burst);
diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 5ae9e8195..beb52f39d 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -110,7 +110,7 @@ nix_update_match_id(const uint16_t match_id, uint64_t ol_flags,
 
 static __rte_always_inline void
 nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
-		    uint64_t rearm)
+		    uint64_t rearm, const uint16_t flags)
 {
 	const rte_iova_t *iova_list;
 	struct rte_mbuf *head;
@@ -126,8 +126,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct rte_mbuf *mbuf,
 		return;
 	}
 
-	mbuf->pkt_len = rx->pkt_lenm1 + 1;
-	mbuf->data_len = sg & 0xFFFF;
+	mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					       CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+	mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
+					  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
 	mbuf->nb_segs = nb_segs;
 	sg = sg >> 16;
 
@@ -210,7 +212,7 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 	*(uint64_t *)(&mbuf->rearm_data) = val;
 
 	if (flag & NIX_RX_MULTI_SEG_F)
-		nix_cqe_xtract_mseg(rx, mbuf, val);
+		nix_cqe_xtract_mseg(rx, mbuf, val, flag);
 	else
 		mbuf->next = NULL;
 }
@@ -275,8 +277,9 @@ cn9k_nix_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts,
 				     flags);
 		cnxk_nix_mbuf_to_tstamp(mbuf, rxq->tstamp,
 					(flags & NIX_RX_OFFLOAD_TSTAMP_F),
-					(uint64_t *)((uint8_t *)mbuf + data_off)
-					);
+					(flags & NIX_RX_MULTI_SEG_F),
+					(uint64_t *)((uint8_t *)mbuf
+								+ data_off));
 		rx_pkts[packets++] = mbuf;
 		roc_prefetch_store_keep(mbuf);
 		head++;
@@ -472,6 +475,99 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 				mbuf3);
 		}
 
+		if (flags & NIX_RX_OFFLOAD_TSTAMP_F) {
+			const uint16x8_t len_off = {
+				0,			     /* ptype   0:15 */
+				0,			     /* ptype  16:31 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* pktlen  0:15 */
+				0,			     /* pktlen 16:31 */
+				CNXK_NIX_TIMESYNC_RX_OFFSET, /* datalen 0:15 */
+				0,
+				0,
+				0};
+			const uint32x4_t ptype = {RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC,
+						  RTE_PTYPE_L2_ETHER_TIMESYNC};
+			const uint64_t ts_olf = PKT_RX_IEEE1588_PTP |
+						PKT_RX_IEEE1588_TMST |
+						rxq->tstamp->rx_tstamp_dynflag;
+			const uint32x4_t and_mask = {0x1, 0x2, 0x4, 0x8};
+			uint64x2_t ts01, ts23, mask;
+			uint64_t ts[4];
+			uint8_t res;
+
+			/* Subtract timesync length from total pkt length. */
+			f0 = vsubq_u16(f0, len_off);
+			f1 = vsubq_u16(f1, len_off);
+			f2 = vsubq_u16(f2, len_off);
+			f3 = vsubq_u16(f3, len_off);
+
+			/* Get the address of actual timestamp. */
+			ts01 = vaddq_u64(mbuf01, data_off);
+			ts23 = vaddq_u64(mbuf23, data_off);
+			/* Load timestamp from address. */
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  0),
+					      ts01, 0);
+			ts01 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts01,
+									  1),
+					      ts01, 1);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  0),
+					      ts23, 0);
+			ts23 = vsetq_lane_u64(*(uint64_t *)vgetq_lane_u64(ts23,
+									  1),
+					      ts23, 1);
+			/* Convert from be to cpu byteorder. */
+			ts01 = vrev64q_u8(ts01);
+			ts23 = vrev64q_u8(ts23);
+			/* Store timestamp into scalar for later use. */
+			ts[0] = vgetq_lane_u64(ts01, 0);
+			ts[1] = vgetq_lane_u64(ts01, 1);
+			ts[2] = vgetq_lane_u64(ts23, 0);
+			ts[3] = vgetq_lane_u64(ts23, 1);
+
+			/* Store timestamp into dynfield. */
+			*cnxk_nix_timestamp_dynfield(mbuf0, rxq->tstamp) =
+				ts[0];
+			*cnxk_nix_timestamp_dynfield(mbuf1, rxq->tstamp) =
+				ts[1];
+			*cnxk_nix_timestamp_dynfield(mbuf2, rxq->tstamp) =
+				ts[2];
+			*cnxk_nix_timestamp_dynfield(mbuf3, rxq->tstamp) =
+				ts[3];
+
+			/* Generate ptype mask to filter L2 ether timesync */
+			mask = vdupq_n_u32(vgetq_lane_u32(f0, 0));
+			mask = vsetq_lane_u32(vgetq_lane_u32(f1, 0), mask, 1);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f2, 0), mask, 2);
+			mask = vsetq_lane_u32(vgetq_lane_u32(f3, 0), mask, 3);
+
+			/* Match against L2 ether timesync. */
+			mask = vceqq_u32(mask, ptype);
+			/* Convert the vector mask to a scalar mask */
+			res = vaddvq_u32(vandq_u32(mask, and_mask));
+			res &= 0xF;
+
+			if (res) {
+				/* Fill in the ol_flags for any packets that
+				 * matched.
+				 */
+				ol_flags0 |= ((res & 0x1) ? ts_olf : 0);
+				ol_flags1 |= ((res & 0x2) ? ts_olf : 0);
+				ol_flags2 |= ((res & 0x4) ? ts_olf : 0);
+				ol_flags3 |= ((res & 0x8) ? ts_olf : 0);
+
+				/* Update Rxq timestamp with the latest
+				 * timestamp.
+				 */
+				rxq->tstamp->rx_ready = 1;
+				rxq->tstamp->rx_tstamp =
+					ts[31 - __builtin_clz(res)];
+			}
+		}
+
 		/* Form rearm_data with ol_flags */
 		rearm0 = vsetq_lane_u64(ol_flags0, rearm0, 1);
 		rearm1 = vsetq_lane_u64(ol_flags1, rearm1, 1);
@@ -499,17 +595,17 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			 * individual mbufs in scalar mode.
 			 */
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(0) + 8), mbuf0,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(0) + 8), mbuf0,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(1) + 8), mbuf1,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(1) + 8), mbuf1,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(2) + 8), mbuf2,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(2) + 8), mbuf2,
+					    mbuf_initializer, flags);
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
-					    (cq0 + CQE_SZ(3) + 8), mbuf3,
-					    mbuf_initializer);
+						(cq0 + CQE_SZ(3) + 8), mbuf3,
+					    mbuf_initializer, flags);
 		} else {
 			/* Update that no more segments */
 			mbuf0->next = NULL;
diff --git a/drivers/net/cnxk/cn9k_rx_vec.c b/drivers/net/cnxk/cn9k_rx_vec.c
index e61c2225c..ef5f771ef 100644
--- a/drivers/net/cnxk/cn9k_rx_vec.c
+++ b/drivers/net/cnxk/cn9k_rx_vec.c
@@ -9,9 +9,6 @@
 	uint16_t __rte_noinline __rte_hot cn9k_nix_recv_pkts_vec_##name(       \
 		void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t pkts)      \
 	{                                                                      \
-		/* TSTMP is not supported by vector */                         \
-		if ((flags) & NIX_RX_OFFLOAD_TSTAMP_F)                         \
-			return 0;                                              \
 		return cn9k_nix_recv_pkts_vector(rx_queue, rx_pkts, pkts,      \
 						 (flags));                     \
 	}
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 67b1f4253..4eead0390 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -136,13 +136,12 @@ struct cnxk_eth_qconf {
 };
 
 struct cnxk_timesync_info {
+	uint8_t rx_ready;
+	uint64_t rx_tstamp;
 	uint64_t rx_tstamp_dynflag;
+	int tstamp_dynfield_offset;
 	rte_iova_t tx_tstamp_iova;
 	uint64_t *tx_tstamp;
-	uint64_t rx_tstamp;
-	int tstamp_dynfield_offset;
-	uint8_t tx_ready;
-	uint8_t rx_ready;
 } __plt_cache_aligned;
 
 struct cnxk_eth_dev {
@@ -465,13 +464,15 @@ cnxk_nix_timestamp_dynfield(struct rte_mbuf *mbuf,
 
 static __rte_always_inline void
 cnxk_nix_mbuf_to_tstamp(struct rte_mbuf *mbuf,
-			struct cnxk_timesync_info *tstamp, bool ts_enable,
+			struct cnxk_timesync_info *tstamp,
+			const uint8_t ts_enable, const uint8_t mseg_enable,
 			uint64_t *tstamp_ptr)
 {
-	if (ts_enable &&
-	    (mbuf->data_off ==
-	     RTE_PKTMBUF_HEADROOM + CNXK_NIX_TIMESYNC_RX_OFFSET)) {
-		mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+	if (ts_enable) {
+		if (!mseg_enable) {
+			mbuf->pkt_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+			mbuf->data_len -= CNXK_NIX_TIMESYNC_RX_OFFSET;
+		}
 
 		/* Reading the rx timestamp inserted by CGX, viz at
 		 * starting of the packet data.
-- 
2.17.1
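
A scalar model of the lane-mask logic used in the vector Rx path above
may help when reading the NEON version. This is a minimal sketch, not
driver code; the array and flag parameters are stand-ins for the
per-mbuf and per-queue state the patch touches.

#include <stdint.h>

/* Sketch of the `if (res)` block in cn10k/cn9k_nix_recv_pkts_vector():
 * ts[] holds the four byte-swapped timestamps, ts_olf the ol_flags to
 * set, res one bit per lane whose ptype matched
 * RTE_PTYPE_L2_ETHER_TIMESYNC.
 */
static inline void
rx_tstamp_model(uint64_t ol_flags[4], uint64_t *rxq_tstamp,
		uint8_t *rx_ready, const uint64_t ts[4], uint64_t ts_olf,
		uint8_t res)
{
	int i;

	for (i = 0; i < 4; i++)
		ol_flags[i] |= (res & (1u << i)) ? ts_olf : 0;

	if (res) {
		*rx_ready = 1;
		/* 31 - clz picks the highest set bit, i.e. the last
		 * (most recent) matching packet of the four.
		 */
		*rxq_tstamp = ts[31 - __builtin_clz(res)];
	}
}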


* [dpdk-dev] [PATCH v4 3/6] net/cnxk: enable VLAN processing in vector Tx
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 2/6] net/cnxk: enable ptp processing in vector Rx pbhagavatula
@ 2021-06-28 19:41       ` pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 4/6] net/cnxk: enable ptp " pbhagavatula
                         ` (3 subsequent siblings)
  5 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable VLAN offload in the vector Tx burst function.
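
The vector path below packs both VLAN TCIs and their insertion-enable
bits into the SEND_EXT W1 word, two packets at a time. The per-packet
layout is easier to see in scalar form; a minimal sketch follows, with
the bit positions taken from the masks in the patch (the function
itself is illustrative, not part of the driver):

#include <stdint.h>

#define BIT_ULL(x) (1ULL << (x))

/* Sketch of one packet's SEND_EXT W1: the outer TCI (VLAN0) sits at
 * bits 8..23 with its enable at bit 48, the inner TCI (VLAN1) at bits
 * 32..47 with its enable at bit 49.  The NEON code builds this word
 * for two packets per vector and derives the enable bits from
 * PKT_TX_QINQ/PKT_TX_VLAN via vtstq_u64().
 */
static inline uint64_t
ext_w1_vlan(uint16_t outer_tci, uint16_t inner_tci,
	    int qinq_requested, int vlan_requested)
{
	uint64_t w1 = ((uint64_t)outer_tci << 8) |
		      ((uint64_t)inner_tci << 32);

	w1 |= qinq_requested ? BIT_ULL(48) : 0; /* enable VLAN0 insert */
	w1 |= vlan_requested ? BIT_ULL(49) : 0; /* enable VLAN1 insert */
	return w1;
}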

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.c     |   3 +-
 drivers/net/cnxk/cn10k_tx.h     | 125 +++++++++++++++++++++++++++----
 drivers/net/cnxk/cn10k_tx_vec.c |   3 +-
 drivers/net/cnxk/cn9k_tx.c      |   3 +-
 drivers/net/cnxk/cn9k_tx.h      | 128 ++++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   3 +-
 6 files changed, 227 insertions(+), 38 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 18694dc70..05bc163a4 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -69,8 +69,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8b1446f25..1e1697858 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -62,9 +62,14 @@ cn10k_nix_tx_ext_subs(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 {
-	RTE_SET_USED(flags);
-	/* We can pack up to 4 packets per LMTLINE if there are no offloads. */
-	return 4 << ROC_LMT_LINES_PER_CORE_LOG2;
+	return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
+	       << ROC_LMT_LINES_PER_CORE_LOG2;
+}
+
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line(const uint16_t flags)
+{
+	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
 }
 
 static __rte_always_inline uint64_t
@@ -98,10 +103,9 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
-	const uint64_t dw_m1 = 0x7;
+	const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
 	uint64_t data;
 
-	RTE_SET_USED(flags);
 	/* This will be moved to addr area */
 	data = dw_m1;
 	/* 15 vector sizes for single seg */
@@ -690,11 +694,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -720,6 +727,14 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
 	sgdesc23_w0 = sgdesc01_w0;
 
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+	}
+
 	/* Get LMT base address and LMT ID as lcore id */
 	ROC_LMT_BASE_ID_GET(laddr, lmt_id);
 	left = pkts;
@@ -738,6 +753,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1303,6 +1325,52 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for VLAN. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1381,16 +1449,41 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		/* Store the prepared send desc to LMT lines */
-		vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
-		vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
-		lnum += 1;
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+			lnum += 1;
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			lnum += 1;
+		} else {
+			/* Store the prepared send desc to LMT lines */
+			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
+			vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
+			lnum += 1;
+		}
 
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 7453f3bc9..beb5c649b 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index b80260607..4b43cdaff 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -68,8 +68,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 
 	if (dev->scalar_ena ||
 	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
-	      NIX_TX_OFFLOAD_TSO_F)))
+	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 1899d6670..d5715bb52 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -552,10 +552,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 {
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
-	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
+	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
+		cmd2[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
+	uint64x2_t sendext01_w0, sendext23_w0;
+	uint64x2_t sendext01_w1, sendext23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -585,8 +588,19 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	senddesc23_w0 = senddesc01_w0;
 	senddesc01_w1 = vdupq_n_u64(0);
 	senddesc23_w1 = senddesc01_w1;
-	sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
-	sgdesc23_w0 = sgdesc01_w0;
+
+	/* Load command defaults into vector variables. */
+	if (flags & NIX_TX_NEED_EXT_HDR) {
+		sendext01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sendext23_w0 = sendext01_w0;
+		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
+		sendext23_w1 = sendext01_w1;
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
+		sgdesc23_w0 = sgdesc01_w0;
+	} else {
+		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+		sgdesc23_w0 = sgdesc01_w0;
+	}
 
 	for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
 		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
@@ -597,6 +611,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc23_w0 = senddesc01_w0;
 		sgdesc23_w0 = sgdesc01_w0;
 
+		/* Clear vlan enables. */
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			sendext01_w1 = vbicq_u64(sendext01_w1,
+						 vdupq_n_u64(0x3FFFF00FFFF00));
+			sendext23_w1 = sendext01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1162,6 +1183,52 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
 		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
 
+		if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
+			/* Tx ol_flag for VLAN. */
+			const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
+			/* Bit enable for VLAN1 */
+			const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
+			/* Tx ol_flag for QinQ. */
+			const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
+			/* Bit enable for VLAN0 */
+			const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
+			/* Load VLAN values from the packet; outer is VLAN0. */
+			uint64x2_t ext01 = {
+				((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[0]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[1]->vlan_tci) << 32,
+			};
+			uint64x2_t ext23 = {
+				((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[2]->vlan_tci) << 32,
+				((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
+					((uint64_t)tx_pkts[3]->vlan_tci) << 32,
+			};
+
+			/* Get ol_flags of the packets. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* ORR vlan outer/inner values into cmd. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
+
+			/* Test for offload enable bits and generate masks. */
+			xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(xtmp128, olq),
+						      mlq));
+			ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
+						      mlv),
+					    vandq_u64(vtstq_u64(ytmp128, olq),
+						      mlq));
+
+			/* Set vlan enable bits into cmd based on mask. */
+			sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
+			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1247,17 +1314,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
 		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
 
-		do {
-			vst1q_u64(lmt_addr, cmd0[0]);
-			vst1q_u64(lmt_addr + 2, cmd1[0]);
-			vst1q_u64(lmt_addr + 4, cmd0[1]);
-			vst1q_u64(lmt_addr + 6, cmd1[1]);
-			vst1q_u64(lmt_addr + 8, cmd0[2]);
-			vst1q_u64(lmt_addr + 10, cmd1[2]);
-			vst1q_u64(lmt_addr + 12, cmd0[3]);
-			vst1q_u64(lmt_addr + 14, cmd1[3]);
-			lmt_status = roc_lmt_submit_ldeor(io_addr);
-		} while (lmt_status == 0);
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
+			cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
+			cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
+			cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
+		}
+
+		if (flags & NIX_TX_NEED_EXT_HDR) {
+			/* With an ext header in the command we can no longer
+			 * send all 4 packets together: the LMTLINE is 128 bytes.
+			 * Split and Tx twice.
+			 */
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd2[0]);
+				vst1q_u64(lmt_addr + 4, cmd1[0]);
+				vst1q_u64(lmt_addr + 6, cmd0[1]);
+				vst1q_u64(lmt_addr + 8, cmd2[1]);
+				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+
+			do {
+				vst1q_u64(lmt_addr, cmd0[2]);
+				vst1q_u64(lmt_addr + 2, cmd2[2]);
+				vst1q_u64(lmt_addr + 4, cmd1[2]);
+				vst1q_u64(lmt_addr + 6, cmd0[3]);
+				vst1q_u64(lmt_addr + 8, cmd2[3]);
+				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		} else {
+			do {
+				vst1q_u64(lmt_addr, cmd0[0]);
+				vst1q_u64(lmt_addr + 2, cmd1[0]);
+				vst1q_u64(lmt_addr + 4, cmd0[1]);
+				vst1q_u64(lmt_addr + 6, cmd1[1]);
+				vst1q_u64(lmt_addr + 8, cmd0[2]);
+				vst1q_u64(lmt_addr + 10, cmd1[2]);
+				vst1q_u64(lmt_addr + 12, cmd0[3]);
+				vst1q_u64(lmt_addr + 14, cmd1[3]);
+				lmt_status = roc_lmt_submit_ldeor(io_addr);
+			} while (lmt_status == 0);
+		}
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}
 
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index a6e7c9e54..5842facb5 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -14,8 +14,7 @@
 		uint64_t cmd[sz];                                              \
 									       \
 		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
+		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
 		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
-- 
2.17.1
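
Why the cn9k path in this patch must split the burst when an ext
header is present comes down to simple arithmetic; a self-contained
check follows (a sketch, with byte counts inferred from the stores in
the patch and the 128-byte LMTLINE noted in its comment):

#include <assert.h>

/* Each vst1q_u64() writes 16 bytes (2 dwords).  Without the ext header
 * a packet needs SEND_HDR + SG = 4 dwords; with it, SEND_HDR + EXT +
 * SG = 6 dwords.  A cn9k LMTLINE is 128 bytes.
 */
int main(void)
{
	const int lmtline = 128, dword = 8;

	assert(lmtline / (4 * dword) == 4); /* 4 pkts fit: one submit */
	assert(lmtline / (6 * dword) == 2); /* only 2 fit: Tx twice   */
	return 0;
}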


* [dpdk-dev] [PATCH v4 4/6] net/cnxk: enable ptp processing in vector Tx
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 2/6] net/cnxk: enable ptp processing in vector Rx pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 3/6] net/cnxk: enable VLAN processing in vector Tx pbhagavatula
@ 2021-06-28 19:41       ` pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 5/6] net/cnxk: enable TSO " pbhagavatula
                         ` (2 subsequent siblings)
  5 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable PTP offload in the vector Tx burst function. Since we can
no longer use a single LMT line for a burst of 4, split the LMT
line into two and transmit twice.
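
The per-packet enable in the vector path is branch-free: the SEND_MEM
sub-descriptor stays in every command, and for packets that did not
request a timestamp its alg and address fields are patched so it no
longer captures one. A scalar sketch of that fix-up follows, with the
bit positions taken from the patch; the function name and parameters
are illustrative only:

#include <stdint.h>

#define BIT_ULL(x) (1ULL << (x))

/* Sketch of the inverted-mask trick: when the timestamp flag is absent
 * the mask is all ones, so the alg field flips from SETTSTMP to SUB
 * (bit 59 of W0) and the store address moves 8 bytes away from the
 * live timestamp slot.  When the flag is set, the defaults are left
 * alone.
 */
static inline void
sendmem_fixup(uint64_t *w0, uint64_t *w1, uint64_t ol_flags,
	      uint64_t tstamp_flag)
{
	const uint64_t mask = (ol_flags & tstamp_flag) ? 0 : ~0ULL;

	*w1 += mask & 0x8;	   /* redirect the write            */
	*w0 |= mask & BIT_ULL(59); /* SETTSTMP -> SUB when disabled */
}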

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.c     |   4 +-
 drivers/net/cnxk/cn10k_tx.h     | 109 +++++++++++++++++++++++++++-----
 drivers/net/cnxk/cn10k_tx_vec.c |   5 +-
 drivers/net/cnxk/cn9k_tx.c      |   4 +-
 drivers/net/cnxk/cn9k_tx.h      | 105 ++++++++++++++++++++++++++----
 drivers/net/cnxk/cn9k_tx_vec.c  |   5 +-
 6 files changed, 192 insertions(+), 40 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index 05bc163a4..c4c3e6570 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,9 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 1e1697858..8af6799ff 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -69,7 +69,9 @@ cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
 static __rte_always_inline uint8_t
 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
 {
-	return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
+	return (flags & NIX_TX_NEED_EXT_HDR) ?
+			     ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
+			     8;
 }
 
 static __rte_always_inline uint64_t
@@ -695,13 +697,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint16_t left, scalar, burst, i, lmt_id;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn10k_eth_txq *txq = tx_queue;
@@ -733,6 +737,12 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w0 = sendext01_w0;
 		sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
 		sendext23_w1 = sendext01_w1;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	}
 
 	/* Get LMT base address and LMT ID as lcore id */
@@ -760,6 +770,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB. */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1371,6 +1392,44 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if a timestamp is requested and generate an
+			 * inverted mask, as the default cmd value needs no
+			 * change for packets that do request one.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1458,19 +1517,39 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 		if (flags & NIX_TX_NEED_EXT_HDR) {
 			/* Store the prepared send desc to LMT lines */
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
-			lnum += 1;
-			vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
-			vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
+			} else {
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
+				lnum += 1;
+				vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
+				vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
+			}
 			lnum += 1;
 		} else {
 			/* Store the prepared send desc to LMT lines */
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index beb5c649b..0b4a4c7ba 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index 4b43cdaff..c32681ed4 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,9 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena ||
-	    (dev->tx_offload_flags &
-	     (NIX_TX_OFFLOAD_TSTAMP_F | NIX_TX_OFFLOAD_TSO_F)))
+	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index d5715bb52..cb574a1c1 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -553,12 +553,14 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
 	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
 	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
-		cmd2[NIX_DESCS_PER_LOOP];
+		cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
 	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
 	uint64x2_t senddesc01_w0, senddesc23_w0;
 	uint64x2_t senddesc01_w1, senddesc23_w1;
 	uint64x2_t sendext01_w0, sendext23_w0;
 	uint64x2_t sendext01_w1, sendext23_w1;
+	uint64x2_t sendmem01_w0, sendmem23_w0;
+	uint64x2_t sendmem01_w1, sendmem23_w1;
 	uint64x2_t sgdesc01_w0, sgdesc23_w0;
 	uint64x2_t sgdesc01_w1, sgdesc23_w1;
 	struct cn9k_eth_txq *txq = tx_queue;
@@ -597,6 +599,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		sendext23_w1 = sendext01_w1;
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[4]);
 		sgdesc23_w0 = sgdesc01_w0;
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			sendmem01_w0 = vld1q_dup_u64(&txq->cmd[6]);
+			sendmem23_w0 = sendmem01_w0;
+			sendmem01_w1 = vld1q_dup_u64(&txq->cmd[7]);
+			sendmem23_w1 = sendmem01_w1;
+		}
 	} else {
 		sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
 		sgdesc23_w0 = sgdesc01_w0;
@@ -618,6 +626,17 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = sendext01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Reset send mem alg to SETTSTMP from SUB. */
+			sendmem01_w0 = vbicq_u64(sendmem01_w0,
+						 vdupq_n_u64(BIT_ULL(59)));
+			/* Reset send mem address to default. */
+			sendmem01_w1 =
+				vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
+			sendmem23_w0 = sendmem01_w0;
+			sendmem23_w1 = sendmem01_w1;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1229,6 +1248,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+			/* Tx ol_flag for timestamp. */
+			const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
+						PKT_TX_IEEE1588_TMST};
+			/* Set send mem alg to SUB. */
+			const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
+			/* Increment send mem address by 8. */
+			const uint64x2_t addr = {0x8, 0x8};
+
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Check if a timestamp is requested and generate an
+			 * inverted mask, as the default cmd value needs no
+			 * change for packets that do request one.
+			 */
+			xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
+			ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
+
+			/* Change send mem address to an 8 byte offset when
+			 * TSTMP is disabled.
+			 */
+			sendmem01_w1 = vaddq_u64(sendmem01_w1,
+						 vandq_u64(xtmp128, addr));
+			sendmem23_w1 = vaddq_u64(sendmem23_w1,
+						 vandq_u64(ytmp128, addr));
+			/* Change send mem alg to SUB when TSTMP is disabled. */
+			sendmem01_w0 = vorrq_u64(sendmem01_w0,
+						 vandq_u64(xtmp128, alg));
+			sendmem23_w0 = vorrq_u64(sendmem23_w0,
+						 vandq_u64(ytmp128, alg));
+
+			cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
+			cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
+			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
@@ -1327,22 +1384,44 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			 * Split and Tx twice.
 			 */
 			do {
-				vst1q_u64(lmt_addr, cmd0[0]);
-				vst1q_u64(lmt_addr + 2, cmd2[0]);
-				vst1q_u64(lmt_addr + 4, cmd1[0]);
-				vst1q_u64(lmt_addr + 6, cmd0[1]);
-				vst1q_u64(lmt_addr + 8, cmd2[1]);
-				vst1q_u64(lmt_addr + 10, cmd1[1]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd3[0]);
+					vst1q_u64(lmt_addr + 8, cmd0[1]);
+					vst1q_u64(lmt_addr + 10, cmd2[1]);
+					vst1q_u64(lmt_addr + 12, cmd1[1]);
+					vst1q_u64(lmt_addr + 14, cmd3[1]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[0]);
+					vst1q_u64(lmt_addr + 2, cmd2[0]);
+					vst1q_u64(lmt_addr + 4, cmd1[0]);
+					vst1q_u64(lmt_addr + 6, cmd0[1]);
+					vst1q_u64(lmt_addr + 8, cmd2[1]);
+					vst1q_u64(lmt_addr + 10, cmd1[1]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 
 			do {
-				vst1q_u64(lmt_addr, cmd0[2]);
-				vst1q_u64(lmt_addr + 2, cmd2[2]);
-				vst1q_u64(lmt_addr + 4, cmd1[2]);
-				vst1q_u64(lmt_addr + 6, cmd0[3]);
-				vst1q_u64(lmt_addr + 8, cmd2[3]);
-				vst1q_u64(lmt_addr + 10, cmd1[3]);
+				if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd3[2]);
+					vst1q_u64(lmt_addr + 8, cmd0[3]);
+					vst1q_u64(lmt_addr + 10, cmd2[3]);
+					vst1q_u64(lmt_addr + 12, cmd1[3]);
+					vst1q_u64(lmt_addr + 14, cmd3[3]);
+				} else {
+					vst1q_u64(lmt_addr, cmd0[2]);
+					vst1q_u64(lmt_addr + 2, cmd2[2]);
+					vst1q_u64(lmt_addr + 4, cmd1[2]);
+					vst1q_u64(lmt_addr + 6, cmd0[3]);
+					vst1q_u64(lmt_addr + 8, cmd2[3]);
+					vst1q_u64(lmt_addr + 10, cmd1[3]);
+				}
 				lmt_status = roc_lmt_submit_ldeor(io_addr);
 			} while (lmt_status == 0);
 		} else {
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 5842facb5..9ade66db2 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,9 +13,8 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* VLAN, TSTMP, TSO is not supported by vec */                 \
-		if ((flags) & NIX_TX_OFFLOAD_TSTAMP_F ||		       \
-		    (flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* TSO is not supported by vec */                              \
+		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
-- 
2.17.1


* [dpdk-dev] [PATCH v4 5/6] net/cnxk: enable TSO processing in vector Tx
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
                         ` (2 preceding siblings ...)
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 4/6] net/cnxk: enable ptp " pbhagavatula
@ 2021-06-28 19:41       ` pbhagavatula
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 6/6] net/cnxk: add multi seg Tx vector routine pbhagavatula
  2021-06-29  7:44       ` [dpdk-dev] [PATCH v5 1/6] net/cnxk: add multi seg Rx " pbhagavatula
  5 siblings, 0 replies; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Enable TSO offload in the vector Tx burst function.
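
The core of the cn9k/cn10k_nix_prepare_tso() helpers added below is
the branchless choice between the outer and inner L4 pointer when
computing where segmentation starts. In scalar form (a sketch
mirroring the patch; field widths are assumed):

#include <stdint.h>

/* Sketch of the lso_sb computation: il3type == 0 means there is no
 * inner L3 header, so the mask becomes all ones and the outer L4
 * pointer is used; otherwise the inner one.  l4_len is then added so
 * segmentation starts after the TCP header.
 */
static inline uint16_t
lso_start(uint16_t ol4ptr, uint16_t il4ptr, uint8_t il3type,
	  uint16_t l4_len)
{
	const uint64_t mask = -(uint64_t)(il3type == 0); /* 0 or ~0 */

	return (mask & ol4ptr) + (~mask & il4ptr) + l4_len;
}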

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.c     |  2 +-
 drivers/net/cnxk/cn10k_tx.h     | 97 +++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn10k_tx_vec.c |  5 +-
 drivers/net/cnxk/cn9k_tx.c      |  2 +-
 drivers/net/cnxk/cn9k_tx.h      | 94 ++++++++++++++++++++++++++++++++
 drivers/net/cnxk/cn9k_tx_vec.c  |  5 +-
 6 files changed, 199 insertions(+), 6 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index c4c3e6570..d06879163 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,7 +67,7 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8af6799ff..26797581e 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -689,6 +689,46 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		      union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		      const uint64_t flags, const uint64_t lso_tun_fmt)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel TSO */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+		uint8_t shift = is_udp_tun ? 32 : 0;
+
+		shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
+		shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+		w0->lso_format = (lso_tun_fmt >> shift);
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -723,6 +763,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
 
 	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
 	senddesc23_w0 = senddesc01_w0;
@@ -781,6 +826,13 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1430,6 +1482,51 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			const uint64_t lso_fmt = txq->lso_tun_fmt;
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn10k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
+
+			cn10k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
index 0b4a4c7ba..34e373750 100644
--- a/drivers/net/cnxk/cn10k_tx_vec.c
+++ b/drivers/net/cnxk/cn10k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO inner checksum is a must */                         \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&			       \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))		       \
 			return 0;                                              \
 		return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
 						  (flags));                    \
diff --git a/drivers/net/cnxk/cn9k_tx.c b/drivers/net/cnxk/cn9k_tx.c
index c32681ed4..735e21cc6 100644
--- a/drivers/net/cnxk/cn9k_tx.c
+++ b/drivers/net/cnxk/cn9k_tx.c
@@ -66,7 +66,7 @@ cn9k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena || (dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F))
+	if (dev->scalar_ena)
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
 	else
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index cb574a1c1..dca732a9f 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -545,6 +545,43 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 
 #if defined(RTE_ARCH_ARM64)
 
+static __rte_always_inline void
+cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
+		     union nix_send_ext_w0_u *w0, uint64_t ol_flags,
+		     uint64_t flags)
+{
+	uint16_t lso_sb;
+	uint64_t mask;
+
+	if (!(ol_flags & PKT_TX_TCP_SEG))
+		return;
+
+	mask = -(!w1->il3type);
+	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+
+	w0->u |= BIT(14);
+	w0->lso_sb = lso_sb;
+	w0->lso_mps = m->tso_segsz;
+	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
+	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
+
+	/* Handle tunnel TSO */
+	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
+	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
+		const uint8_t is_udp_tun =
+			(CNXK_NIX_UDP_TUN_BITMASK >>
+			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
+			0x1;
+
+		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
+		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
+		/* Update format for UDP tunneled packet */
+		w0->lso_format += is_udp_tun ? 2 : 6;
+
+		w0->lso_format += !!(ol_flags & PKT_TX_OUTER_IPV6) << 1;
+	}
+}
+
 #define NIX_DESCS_PER_LOOP 4
 static __rte_always_inline uint16_t
 cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
@@ -580,6 +617,12 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;
 
+	/* Perform header writes before barrier for TSO */
+	if (flags & NIX_TX_OFFLOAD_TSO_F) {
+		for (i = 0; i < pkts; i++)
+			cn9k_nix_xmit_prepare_tso(tx_pkts[i], flags);
+	}
+
 	/* Lets commit any changes in the packet here as no further changes
 	 * to the packet will be done unless no fast free is enabled.
 	 */
@@ -637,6 +680,13 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			sendmem23_w1 = sendmem01_w1;
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			/* Clear the LSO enable bit. */
+			sendext01_w0 = vbicq_u64(sendext01_w0,
+						 vdupq_n_u64(BIT_ULL(14)));
+			sendext23_w0 = sendext01_w0;
+		}
+
 		/* Move mbufs to iova */
 		mbuf0 = (uint64_t *)tx_pkts[0];
 		mbuf1 = (uint64_t *)tx_pkts[1];
@@ -1286,6 +1336,50 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
 		}
 
+		if (flags & NIX_TX_OFFLOAD_TSO_F) {
+			uint64_t sx_w0[NIX_DESCS_PER_LOOP];
+			uint64_t sd_w1[NIX_DESCS_PER_LOOP];
+
+			/* Extract SD W1 as we need to set L4 types. */
+			vst1q_u64(sd_w1, senddesc01_w1);
+			vst1q_u64(sd_w1 + 2, senddesc23_w1);
+
+			/* Extract SX W0 as we need to set LSO fields. */
+			vst1q_u64(sx_w0, sendext01_w0);
+			vst1q_u64(sx_w0 + 2, sendext23_w0);
+
+			/* Extract ol_flags. */
+			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
+			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
+
+			/* Prepare individual mbufs. */
+			cn9k_nix_prepare_tso(tx_pkts[0],
+				(union nix_send_hdr_w1_u *)&sd_w1[0],
+				(union nix_send_ext_w0_u *)&sx_w0[0],
+				vgetq_lane_u64(xtmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[1],
+				(union nix_send_hdr_w1_u *)&sd_w1[1],
+				(union nix_send_ext_w0_u *)&sx_w0[1],
+				vgetq_lane_u64(xtmp128, 1), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[2],
+				(union nix_send_hdr_w1_u *)&sd_w1[2],
+				(union nix_send_ext_w0_u *)&sx_w0[2],
+				vgetq_lane_u64(ytmp128, 0), flags);
+
+			cn9k_nix_prepare_tso(tx_pkts[3],
+				(union nix_send_hdr_w1_u *)&sd_w1[3],
+				(union nix_send_ext_w0_u *)&sx_w0[3],
+				vgetq_lane_u64(ytmp128, 1), flags);
+
+			senddesc01_w1 = vld1q_u64(sd_w1);
+			senddesc23_w1 = vld1q_u64(sd_w1 + 2);
+
+			sendext01_w0 = vld1q_u64(sx_w0);
+			sendext23_w0 = vld1q_u64(sx_w0 + 2);
+		}
+
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			/* Set don't free bit if reference count > 1 */
 			xmask01 = vdupq_n_u64(0);
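
The TSO block above follows a spill/patch/reload pattern: the SEND HDR W1
and SEND EXT W0 vectors are stored to stack arrays with vst1q_u64(),
patched one packet at a time by the scalar helper, then reloaded with
vld1q_u64(). A self-contained sketch of the pattern, with a stand-in fixup
in place of cn9k_nix_prepare_tso():

#include <arm_neon.h>
#include <stdint.h>

/* Stand-in for the per-packet scalar fixup. */
static inline void
patch_lane(uint64_t *w)
{
	*w |= 1ULL << 14; /* e.g. set the LSO enable bit */
}

static inline uint64x2_t
spill_patch_reload(uint64x2_t v)
{
	uint64_t tmp[2];

	vst1q_u64(tmp, v);     /* spill both lanes to the stack */
	patch_lane(&tmp[0]);   /* scalar, per-lane modification */
	patch_lane(&tmp[1]);
	return vld1q_u64(tmp); /* reload the patched lanes */
}

The round trip through memory is only taken when NIX_TX_OFFLOAD_TSO_F is
set, so the non-TSO fastpath variants are unaffected.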
diff --git a/drivers/net/cnxk/cn9k_tx_vec.c b/drivers/net/cnxk/cn9k_tx_vec.c
index 9ade66db2..56a3e2514 100644
--- a/drivers/net/cnxk/cn9k_tx_vec.c
+++ b/drivers/net/cnxk/cn9k_tx_vec.c
@@ -13,8 +13,9 @@
 	{                                                                      \
 		uint64_t cmd[sz];                                              \
 									       \
-		/* TSO is not supported by vec */                              \
-		if ((flags) & NIX_TX_OFFLOAD_TSO_F)			       \
+		/* For TSO, inner L3/L4 checksum offload is mandatory */       \
+		if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                        \
+		    !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                  \
 			return 0;                                              \
 		return cn9k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, \
 						 (flags));		       \
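
Note that the guard above works because (flags) is a compile-time constant
in every generated variant, so the TSO-without-checksum combinations
collapse to a stub that transmits nothing. A hypothetical reduction of the
idea (flag values invented for the sketch):

#include <stdint.h>

#define TX_OFFLOAD_L3_L4_CSUM_F (1 << 0) /* hypothetical flag values */
#define TX_OFFLOAD_TSO_F        (1 << 4)

static inline uint16_t
xmit_variant(const uint16_t flags, uint16_t pkts)
{
	/* flags is constant per variant, so the compiler folds this away. */
	if ((flags & TX_OFFLOAD_TSO_F) && !(flags & TX_OFFLOAD_L3_L4_CSUM_F))
		return 0; /* TSO requires inner checksum offload */
	return pkts;      /* stand-in for the real vector transmit */
}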
-- 
2.17.1


^ permalink raw reply	[flat|nested] 93+ messages in thread

* [dpdk-dev] [PATCH v4 6/6] net/cnxk: add multi seg Tx vector routine
  2021-06-28 19:41     ` [dpdk-dev] [PATCH v4 1/6] " pbhagavatula
                         ` (3 preceding siblings ...)
  2021-06-28 19:41       ` [dpdk-dev] [PATCH v4 5/6] net/cnxk: enable TSO " pbhagavatula
@ 2021-06-28 19:41       ` pbhagavatula
  2021-06-29  7:25         ` Nithin Dabilpuram
  2021-06-29  7:44       ` [dpdk-dev] [PATCH v5 1/6] net/cnxk: add multi seg Rx " pbhagavatula
  5 siblings, 1 reply; 93+ messages in thread
From: pbhagavatula @ 2021-06-28 19:41 UTC (permalink / raw)
  To: jerinj, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori, Satha Rao
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add multi-segment Tx vector routine.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.c          |  20 +-
 drivers/net/cnxk/cn10k_tx.h          | 388 +++++++++++++++++++++++++--
 drivers/net/cnxk/cn10k_tx_vec_mseg.c |  24 ++
 drivers/net/cnxk/cn9k_tx.c           |  20 +-
 drivers/net/cnxk/cn9k_tx.h           | 272 ++++++++++++++++++-
 drivers/net/cnxk/cn9k_tx_vec_mseg.c  |  24 ++
 drivers/net/cnxk/meson.build         |   6 +-
 7 files changed, 709 insertions(+), 45 deletions(-)
 create mode 100644 drivers/net/cnxk/cn10k_tx_vec_mseg.c
 create mode 100644 drivers/net/cnxk/cn9k_tx_vec_mseg.c

diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
index d06879163..1f30bab59 100644
--- a/drivers/net/cnxk/cn10k_tx.c
+++ b/drivers/net/cnxk/cn10k_tx.c
@@ -67,13 +67,23 @@ cn10k_eth_set_tx_function(struct rte_eth_dev *eth_dev)
 #undef T
 	};
 
-	if (dev->scalar_ena)
+	const eth_tx_burst_t nix_eth_tx_vec_burst_mseg[2][2][2][2][2][2] = {
+#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
+	[f5][f4][f3][f2][f1][f0] = cn10k_nix_xmit_pkts_vec_mseg_##name,
+
+		NIX_TX_FASTPATH_MODES
+#undef T
+	};
+
+	if (dev->scalar_ena) {
 		pick_tx_func(eth_dev, nix_eth_tx_burst);
-	else
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+	} else {
 		pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
-
-	if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
-		pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
+		if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
+			pick_tx_func(eth_dev, nix_eth_tx_vec_burst_mseg);
+	}
 
 	rte_mb();
 }
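
The [2][2][2][2][2][2] tables above come from an X-macro:
NIX_TX_FASTPATH_MODES expands once per offload-flag combination to declare
a specialized burst function, and again to build the lookup table that
pick_tx_func() indexes at configuration time. A reduced two-flag sketch of
the pattern (all names here are hypothetical):

#include <stdint.h>

typedef uint16_t (*eth_tx_burst_t)(void *txq, void **pkts, uint16_t n);

/* Two flags for brevity; NIX_TX_FASTPATH_MODES enumerates six. */
#define TX_MODES(T)                                                           \
	T(none, 0, 0)                                                         \
	T(csum, 0, 1)                                                         \
	T(tso, 1, 0)                                                          \
	T(tso_csum, 1, 1)

/* Emit one specialized burst function per mode. */
#define T(name, f1, f0)                                                       \
	static uint16_t tx_##name(void *q, void **p, uint16_t n)              \
	{                                                                     \
		(void)q; (void)p;                                             \
		return n;                                                     \
	}
TX_MODES(T)
#undef T

/* Emit the lookup table, indexed by the individual flag bits. */
static const eth_tx_burst_t tx_burst[2][2] = {
#define T(name, f1, f0) [f1][f0] = tx_##name,
	TX_MODES(T)
#undef T
};

cn10k_eth_set_tx_function() does roughly the equivalent of indexing
tx_burst[f5][f4][f3][f2][f1][f0] with the per-device offload bits.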
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 26797581e..532b53b31 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -42,6 +42,13 @@
 		}                                                              \
 	} while (0)
 
+/* Macro to convert a number of segments to a number of dwords; each
+ * nb_segs value is encoded as a 4-bit nibble of the constant below.
+ */
+#define NIX_SEGDW_MAGIC 0x76654432210ULL
+
+#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
+
 #define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
 	(void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
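
NIX_SEGDW_MAGIC packs a small lookup table into a single 64-bit constant,
one 4-bit nibble per nb_segs value, and NIX_NB_SEGS_TO_SEGDW() extracts
entry x with a shift by 4*x and a mask. A quick standalone check of the
mechanics:

#include <stdint.h>
#include <stdio.h>

#define NIX_SEGDW_MAGIC 0x76654432210ULL
#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)

int
main(void)
{
	/* Entry x lives at bit offset 4 * x of the constant. */
	for (unsigned int segs = 1; segs <= 6; segs++)
		printf("%u segs -> %u dwords\n", segs,
		       (unsigned int)NIX_NB_SEGS_TO_SEGDW(segs));
	return 0;
}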
 
@@ -102,6 +109,14 @@ cn10k_nix_tx_steor_data(const uint16_t flags)
 	return data;
 }
 
+static __rte_always_inline uint8_t
+cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
+{
+	return (flags & NIX_TX_NEED_EXT_HDR) ?
+		       ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
+		       4;
+}
+
 static __rte_always_inline uint64_t
 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
 {
@@ -729,7 +744,244 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 	}
 }
 
+static __rte_always_inline void
+cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
+				union nix_send_hdr_w0_u *sh,
+				union nix_send_sg_s *sg, const uint32_t flags)
+{
+	struct rte_mbuf *m_next;
+	uint64_t *slist, sg_u;
+	uint16_t nb_segs;
+	int i = 1;
+
+	sh->total = m->pkt_len;
+	/* Clear sg->u header before use */
+	sg->u &= 0xFC00000000000000;
+	sg_u = sg->u;
+	slist = &cmd[0];
+
+	sg_u = sg_u | ((uint64_t)m->data_len);
+
+	nb_segs = m->nb_segs - 1;
+	m_next = m->next;
+
+	/* Set invert df if buffer is not to be freed by H/W */
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
+		/* Mark mempool object as "put" since it is freed by NIX */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+	if (!(sg_u & (1ULL << 55)))
+		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+	rte_io_wmb();
+#endif
+
+	m = m_next;
+	/* Fill mbuf segments */
+	do {
+		m_next = m->next;
+		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
+		*slist = rte_mbuf_data_iova(m);
+		/* Set invert df if buffer is not to be freed by H/W */
+		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
+			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
+			/* Mark mempool object as "put" since it is freed by NIX
+			 */
+#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
+		if (!(sg_u & (1ULL << (i + 55))))
+			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
+		rte_io_wmb();