DPDK patches and discussions
* [PATCH 1/5] examples/l3fwd: fix port group mask generation
@ 2022-08-29  9:44 pbhagavatula
  2022-08-29  9:44 ` [PATCH 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                   ` (4 more replies)
  0 siblings, 5 replies; 41+ messages in thread
From: pbhagavatula @ 2022-08-29  9:44 UTC (permalink / raw)
  To: jerinj, David Christensen; +Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec: vec_any_eq() returns
0 or 1, while port_groupx4() expects a per-lane comparison mask.
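
For reference, a minimal standalone sketch of the intended mask
construction (illustrative only, not part of the patch; the helper
name port_cmp_mask is hypothetical):

    #include <altivec.h>
    #include <stdint.h>

    /* Hypothetical helper: vec_cmpeq() yields 0xFFFF per equal lane
     * and 0x0000 otherwise; lanes 0..3 are folded into the 4-bit
     * index that gptbl[] expects, one bit per FWDSTEP lane. */
    static inline int32_t
    port_cmp_mask(__vector unsigned short dp1,
                  __vector unsigned short dp2)
    {
            union {
                    __vector unsigned short v;
                    uint16_t s[8];
            } res;

            res.v = (__vector unsigned short)vec_cmpeq(dp1, dp2);
            return (res.s[0] & 0x1) | (res.s[1] & 0x2) |
                   (res.s[2] & 0x4) | (res.s[3] & 0x8);
    }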

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/common/altivec/port_group.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..7a6ef390ff 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,19 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
+	union u_vec {
+		__vector unsigned short v_us;
+		unsigned short s[8];
+	};
 
+	union u_vec res;
 	int32_t v;
 
-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = vec_cmpeq(dp1, dp2);
+	res.v_us = dp1;
 
+	v = (res.s[0] & 0x1) | (res.s[1] & 0x2) | (res.s[2] & 0x4) |
+	    (res.s[3] & 0x8);
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;
 
-- 
2.25.1


* [PATCH 2/5] examples/l3fwd: split processing and send stages
  2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
@ 2022-08-29  9:44 ` pbhagavatula
  2022-08-29  9:44 ` [PATCH 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-08-29  9:44 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split the packet processing stage from the packet send stage, as the
send stage is not common to the poll and event modes.
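
After the split, the poll-mode wrapper reduces to the following
(annotated copy of the LPM code added below; the event path instead
calls the process stage with do_step3 == 1 and uses its own TX path):

    static inline void
    l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
                           uint16_t portid, struct lcore_conf *qconf)
    {
            uint16_t dst_port[MAX_PKT_BURST];

            /* Stage 1: classification only, no header rewrite. */
            l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid,
                                      dst_port, qconf, 0);
            /* Stage 2: poll-mode burst TX. */
            send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
    }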

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &pkts_burst[j]->port);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


* [PATCH 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-08-29  9:44 ` [PATCH 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-08-29  9:44 ` pbhagavatula
  2022-08-29  9:44 ` [PATCH 4/5] examples/l3fwd: use em " pbhagavatula
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-08-29  9:44 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the LPM vector path to process event vectors.
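
The per-arch process_dst_port() helpers added below reduce the
destination port array to a single common port, or BAD_PORT when the
ports differ. A scalar reference equivalent (BAD_PORT is defined in
l3fwd.h):

    static inline uint16_t
    process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
    {
            uint16_t i;

            /* If every packet resolves to the same port, the vector
             * attribute can carry it; otherwise signal BAD_PORT. */
            for (i = 0; i < nb_elem; i++)
                    if (dst_ports[i] != dst_ports[0])
                            return BAD_PORT;

            return dst_ports[0];
    }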

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_altivec.h | 28 ++++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 58 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 33 +++++++++----------
 examples/l3fwd/l3fwd_neon.h    | 43 +++++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 ++++++++++++++++++++++++++
 5 files changed, 190 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..00a80225cd 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,32 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp = vec_splats((short)dst_ports[0]);
+		__vector unsigned short dp1;
+
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..26c3254004 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -14,6 +14,14 @@
 
 #include "l3fwd.h"
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#endif
+
 #define L3FWD_EVENT_SINGLE     0x1
 #define L3FWD_EVENT_BURST      0x2
 #define L3FWD_EVENT_TX_DIRECT  0x4
@@ -103,7 +111,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
+
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
 
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index d1b850dd5b..3f67ab01d4 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,22 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +456,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +464,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +486,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..60e6a310e0 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,47 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vminvq_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vminv_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


* [PATCH 4/5] examples/l3fwd: use em vector path for event vector
  2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-08-29  9:44 ` [PATCH 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-08-29  9:44 ` [PATCH 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-08-29  9:44 ` pbhagavatula
  2022-08-29  9:44 ` [PATCH 5/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-08-29  9:44 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the EM vector path to process event vectors.
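
The dispatch mirrors the LPM path (annotated sketch of the code added
below): a valid vector attribute means every packet shares vec->port,
so one bulk call suffices; otherwise each mbuf is processed with its
own port:

    if (vec->attr_valid)
            /* Whole vector shares one source port. */
            l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
                                     vec->port, qconf, 1);
    else
            /* Per-mbuf port, one packet at a time. */
            for (i = 0; i < vec->nb_elem; i++)
                    l3fwd_em_process_packets(1, &vec->mbufs[i],
                                             &dst_port[i],
                                             vec->mbufs[i]->port,
                                             qconf, 1);

    process_event_vector(vec, dst_port);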

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em.c            | 10 ++--
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 4 files changed, 35 insertions(+), 93 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 10be24c61d..ac475073d7 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -852,10 +852,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -877,13 +882,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
 							     lconf);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 26c3254004..2844cc4dd6 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -90,27 +90,6 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


* [PATCH 5/5] examples/l3fwd: fix event vector processing in fib
  2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                   ` (2 preceding siblings ...)
  2022-08-29  9:44 ` [PATCH 4/5] examples/l3fwd: use em " pbhagavatula
@ 2022-08-29  9:44 ` pbhagavatula
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-08-29  9:44 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix a stack overflow that occurs when the event vector size is
greater than MAX_BURST_SIZE.
Add the missing MAC swap and RFC 1812 stages.
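
The overflow fix replaces the fixed-size MAX_PKT_BURST stack arrays
with a single heap allocation sized by the configured vector size,
carved into per-field scratch arrays (annotated copy of the layout
set up below):

    mem = (uintptr_t)rte_zmalloc("vector_fib",
            (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
             sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
             (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
                    evt_rsrc->vector_size,
            RTE_CACHE_LINE_SIZE);
    if (mem == 0)
            return;
    ipv4_arr = (uint32_t *)mem;                             /* IPv4 addrs */
    type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size]; /* v4/v6 flags */
    hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];  /* IPv4 hops */
    hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];    /* IPv6 hops */
    hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];      /* final hops */
    ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];    /* IPv6 addrs */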

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_fib.c | 85 +++++++++++++++++++++++++++-----------
 1 file changed, 62 insertions(+), 23 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index e02e4b3f5a..80f0330c69 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -261,7 +261,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +350,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh != FIB_DEFAULT_HOP ?
+						  nh :
+						  events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +424,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +467,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +479,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,7 +511,32 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
+		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE + i];
 
 	if (event_p_id < 0)
 		return;
@@ -519,10 +559,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
-- 
2.25.1


* [PATCH v2 1/5] examples/l3fwd: fix port group mask generation
  2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                   ` (3 preceding siblings ...)
  2022-08-29  9:44 ` [PATCH 5/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-09-02  9:18 ` pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                     ` (5 more replies)
  4 siblings, 6 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-02  9:18 UTC (permalink / raw)
  To: jerinj, David Christensen; +Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec: vec_any_eq() returns
0 or 1, while port_groupx4() expects a per-lane comparison mask.

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.

 examples/common/altivec/port_group.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..592ef80b7f 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,19 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
+	union u_vec {
+		__vector unsigned short v_us;
+		unsigned short s[8];
+	};

+	union u_vec res;
 	int32_t v;

-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+	res.v_us = dp1;

+	v = (res.s[0] & 0x1) | (res.s[1] & 0x2) | (res.s[2] & 0x4) |
+	    (res.s[3] & 0x8);
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;

--
2.25.1


* [PATCH v2 2/5] examples/l3fwd: split processing and send stages
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
@ 2022-09-02  9:18   ` pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-02  9:18 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split the packet processing stage from the packet send stage, as the
send stage is not common to the poll and event modes.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &pkts_burst[j]->port);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


* [PATCH v2 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-09-02  9:18   ` pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-02  9:18 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the LPM vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 38 ++++++++++--------
 examples/l3fwd/l3fwd_neon.h    | 45 +++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
 5 files changed, 211 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp1;
+		__vector unsigned short dp;
+
+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	int i;
+
+	for (i = 0; i < nb_elem; i++) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+	}
+
+	return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
 
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
+
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index d1b850dd5b..1652b7c470 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
+#endif
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..4d98288707 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,49 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+#if defined(RTE_ARCH_ARM64)
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vminvq_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vminv_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+#endif
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


* [PATCH v2 4/5] examples/l3fwd: fix event vector processing in fib
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-09-02  9:18   ` pbhagavatula
  2022-09-02  9:18   ` [PATCH v2 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-02  9:18 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix a stack overflow that occurs when the event vector size is
greater than MAX_BURST_SIZE.
Add the missing MAC swap and RFC 1812 stages.

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_fib.c | 124 ++++++++++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 37 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index e02e4b3f5a..ada5d0d430 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,38 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-	int32_t j;
 	struct rte_ether_hdr *eth_hdr;
 
-	for (j = 0; j < nb_tx; j++) {
-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-				pkts_burst[j], struct rte_ether_hdr *) + 1),
-				&hops[j], pkts_burst[j]->packet_type);
+	rfc1812_process(
+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+						pkt, struct rte_ether_hdr *) +
+					1),
+		hop, pkt->packet_type,
+		pkt->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK);
 #endif
 
-		/* Set MAC addresses. */
-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-				struct rte_ether_hdr *);
-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-				&eth_hdr->src_addr);
+	/* Set MAC addresses. */
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
+
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+	int32_t j;
 
+	for (j = 0; j < nb_tx; j++) {
+		process_packet(pkts_burst[j], &hops[j]);
+		if (hops[j] == BAD_PORT) {
+			rte_pktmbuf_free(pkts_burst[j]);
+			continue;
+		}
 		/* Send single packet. */
 		send_single_packet(qconf, pkts_burst[j], hops[j]);
 	}
@@ -261,7 +272,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +361,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh;
+			else
+				hops[i] = events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +435,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +478,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +490,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,7 +522,32 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
+		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE * i];
 
 	if (event_p_id < 0)
 		return;
@@ -519,10 +570,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 5/5] examples/l3fwd: use em vector path for event vector
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                     ` (2 preceding siblings ...)
  2022-09-02  9:18   ` [PATCH v2 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-09-02  9:18   ` pbhagavatula
  2022-09-08 18:33   ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation David Christensen
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
  5 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-02  9:18 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the em vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em.c            | 12 +++--
 examples/l3fwd/l3fwd_em.h            | 29 +++++------
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 5 files changed, 47 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 10be24c61d..e7b35cfbd9 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -852,10 +852,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -877,13 +882,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
-							     lconf);
+							     lconf, dst_ports);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
 	}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
 	struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
 	else
 		m->port = BAD_PORT;
+
+	return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-				     struct lcore_conf *qconf)
+				     struct lcore_conf *qconf,
+				     uint16_t *dst_ports)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-	/* Process first packet to init vector attributes */
-	l3fwd_em_simple_process(mbufs[0], qconf);
-	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
-	}
-
 	/*
 	 * Prefetch and forward already prefetched packets.
 	 */
-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
 		rte_prefetch0(
 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
 	}
 
 	/* Forward remaining prefetched packets */
-	for (; i < vec->nb_elem; i++) {
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	for (; i < vec->nb_elem; i++)
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 3fe38aada0..e21817c36b 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 }
 #endif
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 1/5] examples/l3fwd: fix port group mask generation
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                     ` (3 preceding siblings ...)
  2022-09-02  9:18   ` [PATCH v2 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-09-08 18:33   ` David Christensen
  2022-09-09  5:56     ` [EXT] " Pavan Nikhilesh Bhagavatula
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
  5 siblings, 1 reply; 41+ messages in thread
From: David Christensen @ 2022-09-08 18:33 UTC (permalink / raw)
  To: pbhagavatula, jerinj; +Cc: dev, stable



On 9/2/22 2:18 AM, pbhagavatula@marvell.com wrote:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Fix port group mask generation in altivec, vec_any_eq returns
> 0 or 1 while port_groupx4 expects comparison mask result.
> 
> Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>   v2 Changes:
>   - Fix PPC, RISC-V, aarch32 compilation.
> 
>   examples/common/altivec/port_group.h | 11 +++++++++--
>   1 file changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
> index 5e209b02fa..592ef80b7f 100644
> --- a/examples/common/altivec/port_group.h
> +++ b/examples/common/altivec/port_group.h
> @@ -26,12 +26,19 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
>   		uint16_t u16[FWDSTEP + 1];
>   		uint64_t u64;
>   	} *pnum = (void *)pn;
> +	union u_vec {
> +		__vector unsigned short v_us;
> +		unsigned short s[8];
> +	};
> 
> +	union u_vec res;
>   	int32_t v;
> 
> -	v = vec_any_eq(dp1, dp2);
> -
> +	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);

Altivec vec_cmpeq() is similar to Intel _mm_cmpeq_*(), so this looks 
right to me.
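
For anyone following along, the semantic difference in a minimal sketch
(assuming GCC/clang altivec.h on POWER):

#include <altivec.h>

/* vec_any_eq() folds the comparison into one scalar predicate (0 or 1),
 * while vec_cmpeq() keeps the per-lane mask (0xFFFF where lanes match)
 * that port_groupx4() needs to build its grouping-table index. */
int
any_eq(__vector unsigned short a, __vector unsigned short b)
{
	return vec_any_eq(a, b);	/* scalar: 1 if any lane matches */
}

__vector unsigned short
lane_mask(__vector unsigned short a, __vector unsigned short b)
{
	return (__vector unsigned short)vec_cmpeq(a, b); /* lane-wise mask */
}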

> +	res.v_us = dp1;
> 
> +	v = (res.s[0] & 0x1) | (res.s[1] & 0x2) | (res.s[2] & 0x4) |
> +	    (res.s[3] & 0x8);

This can be vectorized too.  The Intel _mm_unpacklo_epi16() intrinsic 
can be replaced with the following Altivec code:

extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
   return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
}

The Intel _mm_movemask_ps() intrinsic can be replaced with the following 
Altivec implementation:

/* Creates a 4-bit mask from the most significant bits of the SPFP 
values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
_mm_movemask_ps (__m128  __A)
{
   __vector unsigned long long result;
   static const __vector unsigned int perm_mask =
     {
#ifdef __LITTLE_ENDIAN__
         0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
       0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
     };

   result = ((__vector unsigned long long)
             vec_vbpermq ((__vector unsigned char) __A,
                          (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
   return result[1];
#else
   return result[0];
#endif
}

Dave

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [EXT] Re: [PATCH v2 1/5] examples/l3fwd: fix port group mask generation
  2022-09-08 18:33   ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation David Christensen
@ 2022-09-09  5:56     ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 41+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2022-09-09  5:56 UTC (permalink / raw)
  To: David Christensen, Jerin Jacob Kollanukkaran; +Cc: dev, stable

> On 9/2/22 2:18 AM, pbhagavatula@marvell.com wrote:
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Fix port group mask generation in altivec, vec_any_eq returns
> > 0 or 1 while port_groupx4 expects comparison mask result.
> >
> > Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on
> powerpc")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >   v2 Changes:
> >   - Fix PPC, RISC-V, aarch32 compilation.
> >
> >   examples/common/altivec/port_group.h | 11 +++++++++--
> >   1 file changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/examples/common/altivec/port_group.h
> b/examples/common/altivec/port_group.h
> > index 5e209b02fa..592ef80b7f 100644
> > --- a/examples/common/altivec/port_group.h
> > +++ b/examples/common/altivec/port_group.h
> > @@ -26,12 +26,19 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t
> *lp,
> >   		uint16_t u16[FWDSTEP + 1];
> >   		uint64_t u64;
> >   	} *pnum = (void *)pn;
> > +	union u_vec {
> > +		__vector unsigned short v_us;
> > +		unsigned short s[8];
> > +	};
> >
> > +	union u_vec res;
> >   	int32_t v;
> >
> > -	v = vec_any_eq(dp1, dp2);
> > -
> > +	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
> 
> Altivec vec_cmpeq() is similar to Intel _mm_cmpeq_*(), so this looks
> right to me.
> 
> > +	res.v_us = dp1;
> >
> > +	v = (res.s[0] & 0x1) | (res.s[1] & 0x2) | (res.s[2] & 0x4) |
> > +	    (res.s[3] & 0x8);
> 
> This can be vectorized too.  The Intel _mm_unpacklo_epi16() intrinsic
> can be replaced with the following Altivec code:
> 
> extern __inline __m128i __attribute__((__gnu_inline__,
> __always_inline__, __artificial__))
> _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
> {
>    return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
> }
> 
> The Intel _mm_movemask_ps() intrinsic can be replaced with the following
> Altivec implementation:
> 
> /* Creates a 4-bit mask from the most significant bits of the SPFP
> values.  */
> extern __inline int __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_movemask_ps (__m128  __A)
> {
>    __vector unsigned long long result;
>    static const __vector unsigned int perm_mask =
>      {
> #ifdef __LITTLE_ENDIAN__
>          0x00204060, 0x80808080, 0x80808080, 0x80808080
> #else
>        0x80808080, 0x80808080, 0x80808080, 0x00204060
> #endif
>      };
> 
>    result = ((__vector unsigned long long)
>              vec_vbpermq ((__vector unsigned char) __A,
>                           (__vector unsigned char) perm_mask));
> 
> #ifdef __LITTLE_ENDIAN__
>    return result[1];
> #else
>    return result[0];
> #endif
> }
> 

Sure, I will add this to the next version.

> Dave

Thanks, 
Pavan.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 1/5] examples/l3fwd: fix port group mask generation
  2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                     ` (4 preceding siblings ...)
  2022-09-08 18:33   ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation David Christensen
@ 2022-09-11 18:12   ` pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                       ` (4 more replies)
  5 siblings, 5 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-11 18:12 UTC (permalink / raw)
  To: jerinj, David Christensen; +Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec: vec_any_eq returns
0 or 1, while port_groupx4 expects a comparison mask result.

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v3 Changes:
 - PPC optimize port mask generation.
 - Fix aarch32 compilation.

 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.
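
 For reference, a scalar model of what the vec_mergeh() + vec_vbpermq()
 sequence in this version computes (endianness and exact bit numbering
 are glossed over here; vbpermq handles those via perm_mask):

#include <stdint.h>
#include <stdio.h>

/* perm_mask names bit indices 0x00/0x20/0x40/0x60, i.e. the MSB of each
 * 32-bit lane after vec_mergeh() doubles every 16-bit compare result,
 * so lane i's all-ones/all-zeros mask becomes bit i of v -- the same
 * value the SSE path derives with _mm_movemask_ps(). */
static int
movemask4_model(const uint16_t cmp[4])
{
	int i, v = 0;

	for (i = 0; i < 4; i++)
		v |= (cmp[i] >> 15) << i;	/* each lane's MSB -> bit i */
	return v;
}

int
main(void)
{
	const uint16_t cmp[4] = {0xFFFF, 0x0000, 0xFFFF, 0xFFFF};

	printf("v = 0x%x\n", movemask4_model(cmp));	/* prints v = 0xd */
	return 0;
}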

 examples/common/altivec/port_group.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..1c05bc025a 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
-
+	__vector unsigned long long result;
+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
+						 0x80808080, 0x80808080};
 	int32_t v;

-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+	dp1 = vec_mergeh(dp1, dp1);
+	result = (__vector unsigned long long)vec_vbpermq(
+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);

+	v = result[1];
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;

--
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 2/5] examples/l3fwd: split processing and send stages
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
@ 2022-09-11 18:12     ` pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-11 18:12 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split the packet processing and packet send stages, as the send
stage is not common to the poll and event modes.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
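 For reference, a standalone sketch of the do_step3 pattern introduced
 here (the lookup and step3 bodies are stand-ins, not l3fwd code):

#include <stdint.h>
#include <stdio.h>

/* Passing the flag as a const parameter of an inline helper lets the
 * compiler specialize each call site, so the poll-mode path
 * (do_step3 = 0) drops the event-mode work entirely. */
static inline void
process(uint16_t *dst, uint16_t n, const uint8_t do_step3)
{
	uint16_t i;

	for (i = 0; i < n; i++) {
		dst[i] = (uint16_t)(i & 1);	/* stand-in for LPM/EM lookup */
		if (do_step3)
			dst[i] ^= 1;	/* stand-in for MAC swap + rfc1812 */
	}
}

int
main(void)
{
	uint16_t dst[4];

	process(dst, 4, 0);	/* poll mode: step3 branch compiled out */
	process(dst, 4, 1);	/* event mode: step3 runs inline */
	printf("%u %u\n", dst[0], dst[1]);
	return 0;
}
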
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-09-11 18:12     ` pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-09-11 18:12 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the lpm vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
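 For reference, a scalar model of the process_dst_port() helpers this
 patch adds (BAD_PORT stands in for l3fwd's definition); the SSE/NEON/
 Altivec variants below vectorize exactly this uniformity scan:

#include <stdint.h>
#include <stdio.h>

#define BAD_PORT ((uint16_t)-1)	/* stand-in for l3fwd's definition */

/* Return the common destination port when every entry matches entry 0,
 * else BAD_PORT; a uniform vector keeps its port attribute and can be
 * enqueued to the Tx adapter in one shot. */
static uint16_t
process_dst_port_model(const uint16_t *dst_ports, uint16_t nb_elem)
{
	uint16_t i;

	for (i = 1; i < nb_elem; i++)
		if (dst_ports[i] != dst_ports[0])
			return BAD_PORT;
	return dst_ports[0];
}

int
main(void)
{
	uint16_t uniform[4] = {3, 3, 3, 3};
	uint16_t mixed[4] = {3, 3, 7, 3};

	printf("%u %u\n", process_dst_port_model(uniform, 4),
	       process_dst_port_model(mixed, 4));	/* prints "3 65535" */
	return 0;
}
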
 examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 38 ++++++++++--------
 examples/l3fwd/l3fwd_neon.h    | 47 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
 5 files changed, 213 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp1;
+		__vector unsigned short dp;
+
+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	int i;
+
+	for (i = 0; i < nb_elem; i++) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+	}
+
+	return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
 
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
+
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index d1b850dd5b..1652b7c470 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
+#endif
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..bf365341fb 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,51 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0;
+
+#if defined(RTE_ARCH_ARM64)
+	uint16_t res;
+
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vminvq_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vminv_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+#endif
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-09-11 18:12     ` [PATCH v3 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-09-11 18:12     ` pbhagavatula
  2022-10-07 20:03       ` [EXT] " Shijith Thotton
  2022-09-11 18:12     ` [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-09-11 18:12 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix stack overflow when event vector size is greater than
MAX_PKT_BURST.
Add the missing MAC swap and rfc1812 stages.

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
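 For reference, a standalone model of the scratch-memory carve-up done
 below (calloc() stands in for rte_zmalloc() and 16 for
 RTE_FIB6_IPV6_ADDR_SIZE; the layout assumes a vector size that keeps
 the 64-bit slices suitably aligned):

#include <stdint.h>
#include <stdlib.h>

/* One heap block sized by the runtime vector size replaces the fixed
 * MAX_PKT_BURST stack arrays; slicing it keeps the arrays adjacent and
 * makes teardown a single free(). */
static int
carve_scratch(uint16_t vec_sz)
{
	size_t sz = (sizeof(uint32_t) + sizeof(uint8_t) +
		     2 * sizeof(uint64_t) + sizeof(uint16_t) +
		     sizeof(uint8_t *) + 16) * vec_sz;
	uint8_t *mem = calloc(1, sz);
	uint16_t i;

	if (mem == NULL)
		return -1;
	uint32_t *ipv4_arr = (uint32_t *)mem;
	uint8_t *type_arr = (uint8_t *)&ipv4_arr[vec_sz];
	uint64_t *hopsv4 = (uint64_t *)&type_arr[vec_sz];
	uint64_t *hopsv6 = &hopsv4[vec_sz];
	uint16_t *hops = (uint16_t *)&hopsv6[vec_sz];
	uint8_t **ipv6_arr = (uint8_t **)&hops[vec_sz];
	uint8_t *ptr = (uint8_t *)&ipv6_arr[vec_sz];

	for (i = 0; i < vec_sz; i++)
		ipv6_arr[i] = &ptr[16 * i];	/* one 16-byte slot each */
	free(mem);
	return 0;
}

int
main(void)
{
	return carve_scratch(32);
}
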
 examples/l3fwd/l3fwd_fib.c | 123 ++++++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 37 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index e02e4b3f5a..c4a45bc7f3 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-	int32_t j;
 	struct rte_ether_hdr *eth_hdr;
 
-	for (j = 0; j < nb_tx; j++) {
-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-				pkts_burst[j], struct rte_ether_hdr *) + 1),
-				&hops[j], pkts_burst[j]->packet_type);
+	rfc1812_process(
+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+						pkt, struct rte_ether_hdr *) +
+					1),
+		hop, pkt->packet_type);
 #endif
 
-		/* Set MAC addresses. */
-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-				struct rte_ether_hdr *);
-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-				&eth_hdr->src_addr);
+	/* Set MAC addresses. */
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
+
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+	int32_t j;
 
+	for (j = 0; j < nb_tx; j++) {
+		process_packet(pkts_burst[j], &hops[j]);
+		if (hops[j] == BAD_PORT) {
+			rte_pktmbuf_free(pkts_burst[j]);
+			continue;
+		}
 		/* Send single packet. */
 		send_single_packet(qconf, pkts_burst[j], hops[j]);
 	}
@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh;
+			else
+				hops[i] = events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,7 +521,32 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
+		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE * i];
 
 	if (event_p_id < 0)
 		return;
@@ -519,10 +569,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
                       ` (2 preceding siblings ...)
  2022-09-11 18:12     ` [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-09-11 18:12     ` pbhagavatula
  2022-10-07 20:01       ` [EXT] " Shijith Thotton
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-09-11 18:12 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the em vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em.c            | 12 +++--
 examples/l3fwd/l3fwd_em.h            | 29 +++++------
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 5 files changed, 47 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 10be24c61d..e7b35cfbd9 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -852,10 +852,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -877,13 +882,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
-							     lconf);
+							     lconf, dst_ports);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
 	}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
 	struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
 	else
 		m->port = BAD_PORT;
+
+	return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-				     struct lcore_conf *qconf)
+				     struct lcore_conf *qconf,
+				     uint16_t *dst_ports)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-	/* Process first packet to init vector attributes */
-	l3fwd_em_simple_process(mbufs[0], qconf);
-	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
-	}
-
 	/*
 	 * Prefetch and forward already prefetched packets.
 	 */
-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
 		rte_prefetch0(
 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
 	}
 
 	/* Forward remaining prefetched packets */
-	for (; i < vec->nb_elem; i++) {
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	for (; i < vec->nb_elem; i++)
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 3fe38aada0..e21817c36b 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 }
 #endif
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [EXT] [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector
  2022-09-11 18:12     ` [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-07 20:01       ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-07 20:01 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
  Cc: dev, Pavan Nikhilesh Bhagavatula

>Use em vector path to process event vector.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>---
> examples/l3fwd/l3fwd_em.c            | 12 +++--
> examples/l3fwd/l3fwd_em.h            | 29 +++++------
> examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
> examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
> examples/l3fwd/l3fwd_event.h         | 21 --------
> 5 files changed, 47 insertions(+), 112 deletions(-)
>
>diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
>index 10be24c61d..e7b35cfbd9 100644
>--- a/examples/l3fwd/l3fwd_em.c
>+++ b/examples/l3fwd/l3fwd_em.c
>@@ -852,10 +852,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 	int i, nb_enq = 0, nb_deq = 0;
> 	struct lcore_conf *lconf;
> 	unsigned int lcore_id;
>+	uint16_t *dst_ports;
>
> 	if (event_p_id < 0)
> 		return;
>
>+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
>+				RTE_CACHE_LINE_SIZE);

Free missing: dst_ports allocated above is never released with rte_free().
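
A minimal sketch of the fix (placement assumed; names taken from the
patch):

	/* On loop exit, after the worker cleanup, release the
	 * per-lcore scratch array allocated above.
	 */
	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events,
				   nb_enq, nb_deq, 1);
	rte_free(dst_ports);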

>+	if (dst_ports == NULL)
>+		return;
> 	lcore_id = rte_lcore_id();
> 	lconf = &lcore_conf[lcore_id];
>
>@@ -877,13 +882,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 			}
>
> #if defined RTE_ARCH_X86 || defined __ARM_NEON
>-			l3fwd_em_process_event_vector(events[i].vec, lconf);
>+			l3fwd_em_process_event_vector(events[i].vec, lconf,
>+						      dst_ports);
> #else
> 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
>-							     lconf);
>+							     lconf, dst_ports);
> #endif
>-			if (flags & L3FWD_EVENT_TX_DIRECT)
>-				event_vector_txq_set(events[i].vec, 0);
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
>index fe2ee59f6a..7d051fc076 100644
>--- a/examples/l3fwd/l3fwd_em.h
>+++ b/examples/l3fwd/l3fwd_em.h
>@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
> 	}
> }
>
>-static __rte_always_inline void
>+static __rte_always_inline uint16_t
> l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
> {
> 	struct rte_ether_hdr *eth_hdr;
>@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
> 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
> 	else
> 		m->port = BAD_PORT;
>+
>+	return m->port;
> }
>
> /*
>@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
>
> static inline void
> l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
>-				     struct lcore_conf *qconf)
>+				     struct lcore_conf *qconf,
>+				     uint16_t *dst_ports)
> {
> 	struct rte_mbuf **mbufs = vec->mbufs;
> 	int32_t i;
>@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
> 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
> 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
>
>-	/* Process first packet to init vector attributes */
>-	l3fwd_em_simple_process(mbufs[0], qconf);
>-	if (vec->attr_valid) {
>-		if (mbufs[0]->port != BAD_PORT)
>-			vec->port = mbufs[0]->port;
>-		else
>-			vec->attr_valid = 0;
>-	}
>-
> 	/*
> 	 * Prefetch and forward already prefetched packets.
> 	 */
>-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
>+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
> 		rte_prefetch0(
> 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
>-		l3fwd_em_simple_process(mbufs[i], qconf);
>-		event_vector_attr_validate(vec, mbufs[i]);
>+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
> 	}
>
> 	/* Forward remaining prefetched packets */
>-	for (; i < vec->nb_elem; i++) {
>-		l3fwd_em_simple_process(mbufs[i], qconf);
>-		event_vector_attr_validate(vec, mbufs[i]);
>-	}
>+	for (; i < vec->nb_elem; i++)
>+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
>+
>+	process_event_vector(vec, dst_ports);
> }
>
> #endif /* __L3FWD_EM_H__ */
>diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
>index 12b997e477..2e11eefad7 100644
>--- a/examples/l3fwd/l3fwd_em_hlm.h
>+++ b/examples/l3fwd/l3fwd_em_hlm.h
>@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
>
> static inline void
> l3fwd_em_process_event_vector(struct rte_event_vector *vec,
>-			      struct lcore_conf *qconf)
>+			      struct lcore_conf *qconf, uint16_t *dst_port)
> {
>-	struct rte_mbuf **mbufs = vec->mbufs;
>-	uint16_t dst_port[MAX_PKT_BURST];
>-	int32_t i, j, n, pos;
>-
>-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
>-		rte_prefetch0(
>-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
>+	uint16_t i;
>
> 	if (vec->attr_valid)
>-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
>-
>-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
>-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
>-		uint32_t pkt_type =
>-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
>-		uint32_t l3_type, tcp_or_udp;
>-
>-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
>-			pkt_type &= mbufs[j + i]->packet_type;
>-
>-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
>-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>-
>-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
>-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
>-		     i++, pos++) {
>-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
>-						       struct rte_ether_hdr *) +
>-				      1);
>-		}
>-
>-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
>-						      &dst_port[j]);
>-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
>-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
>-						      &dst_port[j]);
>-		} else {
>-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
>-				mbufs[j + i]->port =
>-					em_get_dst_port(qconf, mbufs[j + i],
>-							mbufs[j + i]->port);
>-				process_packet(mbufs[j + i],
>-					       &mbufs[j + i]->port);
>-				event_vector_attr_validate(vec, mbufs[j + i]);
>-			}
>-			continue;
>-		}
>-		processx4_step3(&mbufs[j], &dst_port[j]);
>-
>-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
>-			mbufs[j + i]->port = dst_port[j + i];
>-			event_vector_attr_validate(vec, mbufs[j + i]);
>-		}
>-	}
>-
>-	for (; j < vec->nb_elem; j++) {
>-		mbufs[j]->port =
>-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
>-		process_packet(mbufs[j], &mbufs[j]->port);
>-		event_vector_attr_validate(vec, mbufs[j]);
>-	}
>+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
>+					 vec->port, qconf, 1);
>+	else
>+		for (i = 0; i < vec->nb_elem; i++)
>+			l3fwd_em_process_packets(1, &vec->mbufs[i],
>+						 &dst_port[i],
>+						 vec->mbufs[i]->port, qconf, 1);
>+
>+	process_event_vector(vec, dst_port);
> }
>
> #endif /* __L3FWD_EM_HLM_H__ */
>diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
>index d2f75edb8a..067f23889a 100644
>--- a/examples/l3fwd/l3fwd_em_sequential.h
>+++ b/examples/l3fwd/l3fwd_em_sequential.h
>@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
>
> 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
> 		struct rte_mbuf *mbuf = events[j]->mbuf;
>+		uint16_t port;
>
> 		if (i < nb_rx) {
> 			rte_prefetch0(rte_pktmbuf_mtod(
> 					events[i]->mbuf,
> 					struct rte_ether_hdr *) + 1);
> 		}
>+		port = mbuf->port;
> 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
> 		process_packet(mbuf, &mbuf->port);
>+		if (mbuf->port == BAD_PORT)
>+			mbuf->port = port;
> 	}
> }
>
> static inline void
> l3fwd_em_process_event_vector(struct rte_event_vector *vec,
>-			      struct lcore_conf *qconf)
>+			      struct lcore_conf *qconf, uint16_t *dst_ports)
> {
>+	const uint8_t attr_valid = vec->attr_valid;
> 	struct rte_mbuf **mbufs = vec->mbufs;
> 	int32_t i, j;
>
> 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
>
>-	if (vec->attr_valid)
>-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
>-
> 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
> 		if (j < vec->nb_elem)
> 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
> 						       struct rte_ether_hdr *) +
> 				      1);
>-		mbufs[i]->port =
>-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
>-		process_packet(mbufs[i], &mbufs[i]->port);
>-		event_vector_attr_validate(vec, mbufs[i]);
>+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
>+					       attr_valid ? vec->port :
>+							    mbufs[i]->port);
> 	}
>+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
>+
>+	for (i = 0; i != j; i += FWDSTEP)
>+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
>+	for (; i < vec->nb_elem; i++)
>+		process_packet(vec->mbufs[i], &dst_ports[i]);
>+
>+	process_event_vector(vec, dst_ports);
> }
>
> #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
>diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
>index 3fe38aada0..e21817c36b 100644
>--- a/examples/l3fwd/l3fwd_event.h
>+++ b/examples/l3fwd/l3fwd_event.h
>@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
> }
> #endif
>
>-static inline void
>-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
>-{
>-	/* l3fwd application only changes mbuf port while processing */
>-	if (vec->attr_valid && (vec->port != mbuf->port))
>-		vec->attr_valid = 0;
>-}
>-
>-static inline void
>-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
>-{
>-	if (vec->attr_valid) {
>-		vec->queue = txq;
>-	} else {
>-		int i;
>-
>-		for (i = 0; i < vec->nb_elem; i++)
>-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
>-	}
>-}
>-
> static inline uint16_t
> filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
> 		   uint16_t nb_pkts)
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [EXT] [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib
  2022-09-11 18:12     ` [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-10-07 20:03       ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-07 20:03 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
  Cc: dev, Pavan Nikhilesh Bhagavatula

>
>Fix stack overflow when event vector size is greater than
>MAX_BURST_SIZE.
>Add missing mac swap and rfc1812 stage.
>
>Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>---
> examples/l3fwd/l3fwd_fib.c | 123 ++++++++++++++++++++++++++-----------
> 1 file changed, 86 insertions(+), 37 deletions(-)
>
>diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
>index e02e4b3f5a..c4a45bc7f3 100644
>--- a/examples/l3fwd/l3fwd_fib.c
>+++ b/examples/l3fwd/l3fwd_fib.c
>@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
>  */
> #if !defined FIB_SEND_MULTI
> static inline void
>-fib_send_single(int nb_tx, struct lcore_conf *qconf,
>-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
>+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
> {
>-	int32_t j;
> 	struct rte_ether_hdr *eth_hdr;
>
>-	for (j = 0; j < nb_tx; j++) {
>-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
>+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
> #if defined DO_RFC_1812_CHECKS
>-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
>-				pkts_burst[j], struct rte_ether_hdr *) + 1),
>-				&hops[j], pkts_burst[j]->packet_type);
>+	rfc1812_process(
>+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
>+						pkt, struct rte_ether_hdr *) +
>+					1),
>+		hop, pkt->packet_type);
> #endif
>
>-		/* Set MAC addresses. */
>-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
>-				struct rte_ether_hdr *);
>-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
>-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
>-				&eth_hdr->src_addr);
>+	/* Set MAC addresses. */
>+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
>+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
>+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
>+}
>+
>+static inline void
>+fib_send_single(int nb_tx, struct lcore_conf *qconf,
>+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
>+{
>+	int32_t j;
>
>+	for (j = 0; j < nb_tx; j++) {
>+		process_packet(pkts_burst[j], &hops[j]);
>+		if (hops[j] == BAD_PORT) {
>+			rte_pktmbuf_free(pkts_burst[j]);
>+			continue;
>+		}
> 		/* Send single packet. */
> 		send_single_packet(qconf, pkts_burst[j], hops[j]);
> 	}
>@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
> 	uint32_t ipv4_arr[MAX_PKT_BURST];
> 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
> 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
>-	uint16_t nh;
>+	uint16_t nh, hops[MAX_PKT_BURST];
> 	uint8_t type_arr[MAX_PKT_BURST];
> 	uint32_t ipv4_cnt, ipv6_cnt;
> 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
>@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
> 			else
> 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
> 			if (nh != FIB_DEFAULT_HOP)
>-				events[i].mbuf->port = nh;
>+				hops[i] = nh != FIB_DEFAULT_HOP ?
>+						  nh :
>+						  events[i].mbuf->port;
>+			process_packet(events[i].mbuf, &hops[i]);
>+			events[i].mbuf->port = hops[i] != BAD_PORT ?
>+						       hops[i] :
>+						       events[i].mbuf->port;
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
> }
>
> static __rte_always_inline void
>-fib_process_event_vector(struct rte_event_vector *vec)
>+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
>+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
>+			 uint32_t *ipv4_arr, uint16_t *hops)
> {
>-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
>-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
> 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
> 	struct rte_mbuf **mbufs = vec->mbufs;
>-	uint32_t ipv4_arr[MAX_PKT_BURST];
>-	uint8_t type_arr[MAX_PKT_BURST];
> 	uint32_t ipv4_cnt, ipv6_cnt;
> 	struct lcore_conf *lconf;
> 	uint16_t nh;
>@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
>
> 	/* Lookup IPv6 hops if IPv6 packets are present. */
> 	if (ipv6_cnt > 0)
>-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
>-				     hopsv6, ipv6_cnt);
>-
>-	if (vec->attr_valid) {
>-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
>-		if (nh != FIB_DEFAULT_HOP)
>-			vec->port = nh;
>-		else
>-			vec->attr_valid = 0;
>-	}
>+		rte_fib6_lookup_bulk(
>+			lconf->ipv6_lookup_struct,
>+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
>+			ipv6_cnt);
>
> 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
> 	for (i = 0; i < vec->nb_elem; i++) {
>@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
> 		else
> 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
> 		if (nh != FIB_DEFAULT_HOP)
>-			mbufs[i]->port = nh;
>-		event_vector_attr_validate(vec, mbufs[i]);
>+			hops[i] = nh;
>+		else
>+			hops[i] = vec->attr_valid ? vec->port :
>+						    vec->mbufs[i]->port;
> 	}
>+
>+#if defined FIB_SEND_MULTI
>+	uint16_t k;
>+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
>+
>+	for (i = 0; i != k; i += FWDSTEP)
>+		processx4_step3(&vec->mbufs[i], &hops[i]);
>+	for (; i < vec->nb_elem; i++)
>+		process_packet(vec->mbufs[i], &hops[i]);
>+#else
>+	for (i = 0; i < vec->nb_elem; i++)
>+		process_packet(vec->mbufs[i], &hops[i]);
>+#endif
>+
>+	process_event_vector(vec, hops);
> }
>
> static __rte_always_inline void
>@@ -496,7 +521,32 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 	const uint8_t event_d_id = evt_rsrc->event_d_id;
> 	const uint16_t deq_len = evt_rsrc->deq_depth;
> 	struct rte_event events[MAX_PKT_BURST];
>+	uint8_t *type_arr, **ipv6_arr, *ptr;
> 	int nb_enq = 0, nb_deq = 0, i;
>+	uint64_t *hopsv4, *hopsv6;
>+	uint32_t *ipv4_arr;
>+	uint16_t *hops;
>+	uintptr_t mem;
>+
>+	mem = (uintptr_t)rte_zmalloc(
>+		"vector_fib",
>+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
>+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
>+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
>+			evt_rsrc->vector_size,
>+		RTE_CACHE_LINE_SIZE);
 
Free missing: the blob allocated into mem above is never released with rte_free().
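
A minimal sketch of the fix (placement assumed; names taken from the
patch):

	/* One rte_free() covers all the arrays carved out of the
	 * single zmalloc'd blob; it is needed on every exit path.
	 */
	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events,
				   nb_enq, nb_deq, 1);
	rte_free((void *)mem);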

>+	if (mem == 0)
>+		return;
>+	ipv4_arr = (uint32_t *)mem;
>+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
>+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
>+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
>+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
>+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
>+
>+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
>+	for (i = 0; i < evt_rsrc->vector_size; i++)
>+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE + i];
>
> 	if (event_p_id < 0)
> 		return;
>@@ -519,10 +569,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 				events[i].op = RTE_EVENT_OP_FORWARD;
> 			}
>
>-			fib_process_event_vector(events[i].vec);
>-
>-			if (flags & L3FWD_EVENT_TX_DIRECT)
>-				event_vector_txq_set(events[i].vec, 0);
>+			fib_process_event_vector(events[i].vec, type_arr,
>+						 ipv6_arr, hopsv4, hopsv6,
>+						 ipv4_arr, hops);
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4 1/5] examples/l3fwd: fix port group mask generation
  2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
                       ` (3 preceding siblings ...)
  2022-09-11 18:12     ` [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-11  9:08     ` pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                         ` (4 more replies)
  4 siblings, 5 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11  9:08 UTC (permalink / raw)
  To: jerinj, David Christensen; +Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec; vec_any_eq() returns
0 or 1, while port_groupx4() expects a per-lane comparison mask.

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v4 Changes:
 - Fix missing `rte_free`.

 v3 Changes:
 - PPC optimize port mask generation.
 - Fix aarch32 compilation.

 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.
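
 Note: a scalar sketch of the mask port_groupx4() expects (illustrative
 only, not part of the diff). gptbl[] must be indexed with a FWDSTEP-bit
 equality mask, not the single 0/1 that vec_any_eq() produces:

	uint16_t p1[FWDSTEP], p2[FWDSTEP];	/* lanes of dp1 and dp2 */
	int32_t v = 0;
	int i;

	/* Bit i is set when adjacent packets i and i + 1 share a port. */
	for (i = 0; i < FWDSTEP; i++)
		v |= (p1[i] == p2[i]) << i;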

 examples/common/altivec/port_group.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..1c05bc025a 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
-
+	__vector unsigned long long result;
+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
+						 0x80808080, 0x80808080};
 	int32_t v;

-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+	dp1 = vec_mergeh(dp1, dp1);
+	result = (__vector unsigned long long)vec_vbpermq(
+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);

+	v = result[1];
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;

--
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4 2/5] examples/l3fwd: split processing and send stages
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
@ 2022-10-11  9:08       ` pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                         ` (3 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11  9:08 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split the packet processing stage from the packet send stage, as the
send stage is not common to poll and event modes.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
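 Note: a usage sketch of the split (call sites are illustrative, not
 part of the diff); the LPM variant is shown, the EM one is analogous:

	uint16_t dst_port[MAX_PKT_BURST];

	/* Poll mode: lookup only, header rewrite and TX stay in
	 * send_packets_multi() (do_step3 == 0). */
	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port,
				  qconf, 0);
	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);

	/* Event mode: fold the header rewrite (step 3) into the
	 * processing stage, as there is no common send stage. */
	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port,
				  qconf, 1);
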
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-10-11  9:08       ` pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
                         ` (2 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11  9:08 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the LPM vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
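 Note: the generic fallback added to l3fwd_event.h below is the scalar
 reference for the SSE/NEON/altivec process_dst_port() variants;
 returning BAD_PORT makes process_event_vector() drop the vector port
 attribute and fall back to per-mbuf destinations:

	static inline uint16_t
	process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
	{
		int i;

		for (i = 0; i < nb_elem; i++) {
			if (dst_ports[i] != dst_ports[0])
				return BAD_PORT;
		}

		return dst_ports[0];
	}
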
 examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 39 +++++++++++--------
 examples/l3fwd/l3fwd_neon.h    | 47 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
 5 files changed, 214 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp1;
+		__vector unsigned short dp;
+
+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	int i;
+
+	for (i = 0; i < nb_elem; i++) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+	}
+
+	return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
 
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
+
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index 22d7f61a42..5172979c72 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
+#endif
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -510,6 +516,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_port_list);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..bf365341fb 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,51 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0;
+
+#if defined(RTE_ARCH_ARM64)
+	uint16_t res;
+
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vminvq_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vminv_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+#endif
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4 4/5] examples/l3fwd: fix event vector processing in fib
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-10-11  9:08       ` pbhagavatula
  2022-10-11  9:08       ` [PATCH v4 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11  9:08 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix stack overflow when the event vector size is greater than
MAX_BURST_SIZE.
Add the missing MAC swap and RFC1812 stages.

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
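 Note: layout of the single per-lcore scratch allocation (N =
 evt_rsrc->vector_size) that replaces the fixed MAX_PKT_BURST stack
 arrays; the offsets follow the pointer arithmetic in the diff:

	/*
	 * ipv4_arr : N * sizeof(uint32_t)   IPv4 addresses to look up
	 * type_arr : N * sizeof(uint8_t)    nonzero when packet is IPv4
	 * hopsv4   : N * sizeof(uint64_t)   IPv4 lookup results
	 * hopsv6   : N * sizeof(uint64_t)   IPv6 lookup results
	 * hops     : N * sizeof(uint16_t)   final per-packet next hops
	 * ipv6_arr : N * sizeof(uint8_t *)  pointers into the tail below
	 * tail     : N * RTE_FIB6_IPV6_ADDR_SIZE bytes of IPv6 addresses
	 */
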
 examples/l3fwd/l3fwd_fib.c | 130 ++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 39 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index b82e0c0354..407e9def71 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-	int32_t j;
 	struct rte_ether_hdr *eth_hdr;
 
-	for (j = 0; j < nb_tx; j++) {
-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-				pkts_burst[j], struct rte_ether_hdr *) + 1),
-				&hops[j], pkts_burst[j]->packet_type);
+	rfc1812_process(
+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+						pkt, struct rte_ether_hdr *) +
+					1),
+		hop, pkt->packet_type);
 #endif
 
-		/* Set MAC addresses. */
-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-				struct rte_ether_hdr *);
-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-				&eth_hdr->src_addr);
+	/* Set MAC addresses. */
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
 
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+	int32_t j;
+
+	for (j = 0; j < nb_tx; j++) {
+		process_packet(pkts_burst[j], &hops[j]);
+		if (hops[j] == BAD_PORT) {
+			rte_pktmbuf_free(pkts_burst[j]);
+			continue;
+		}
 		/* Send single packet. */
 		send_single_packet(qconf, pkts_burst[j], hops[j]);
 	}
@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh;
+			else
+				hops[i] = events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,10 +521,37 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
-
-	if (event_p_id < 0)
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
 		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE * i];
+
+	if (event_p_id < 0) {
+		rte_free((void *)mem);
+		return;
+	}
 
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__,
 		rte_lcore_id());
@@ -519,10 +571,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -546,6 +597,7 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free((void *)mem);
 }
 
 int __rte_noinline
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4 5/5] examples/l3fwd: use em vector path for event vector
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                         ` (2 preceding siblings ...)
  2022-10-11  9:08       ` [PATCH v4 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-10-11  9:08       ` pbhagavatula
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11  9:08 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use the EM vector path to process event vectors.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
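 Note: the gist of the new dispatch, condensed from the hlm hunk below.
 A vector with a valid port attribute takes one batched pass through
 the EM path; otherwise each mbuf is processed with its own port:

	if (vec->attr_valid)
		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
					 vec->port, qconf, 1);
	else
		for (i = 0; i < vec->nb_elem; i++)
			l3fwd_em_process_packets(1, &vec->mbufs[i],
						 &dst_port[i],
						 vec->mbufs[i]->port, qconf, 1);

	process_event_vector(vec, dst_port);
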
 examples/l3fwd/l3fwd_em.c            | 13 +++--
 examples/l3fwd/l3fwd_em.h            | 29 +++++------
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 5 files changed, 48 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index a203dc9e46..35de31157e 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -860,10 +860,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -885,13 +890,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
-							     lconf);
+							     lconf, dst_ports);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -915,6 +919,7 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_ports);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
 	}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
 	struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
 	else
 		m->port = BAD_PORT;
+
+	return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-				     struct lcore_conf *qconf)
+				     struct lcore_conf *qconf,
+				     uint16_t *dst_ports)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-	/* Process first packet to init vector attributes */
-	l3fwd_em_simple_process(mbufs[0], qconf);
-	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
-	}
-
 	/*
 	 * Prefetch and forward already prefetched packets.
 	 */
-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
 		rte_prefetch0(
 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
 	}
 
 	/* Forward remaining prefetched packets */
-	for (; i < vec->nb_elem; i++) {
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	for (; i < vec->nb_elem; i++)
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 3fe38aada0..e21817c36b 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 }
 #endif
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v5 1/5] examples/l3fwd: fix port group mask generation
  2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                         ` (3 preceding siblings ...)
  2022-10-11  9:08       ` [PATCH v4 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-11 10:12       ` pbhagavatula
  2022-10-11 10:12         ` [PATCH v5 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                           ` (5 more replies)
  4 siblings, 6 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-11 10:12 UTC (permalink / raw)
  To: jerinj, David Christensen; +Cc: dev, Pavan Nikhilesh, stable

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec: vec_any_eq returns
0 or 1, while port_groupx4 expects a comparison mask result.
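
For context, a minimal sketch of the semantics involved (illustrative
only, not part of the patch; the helper name is hypothetical and it
assumes GCC/clang vector subscripting). The patch itself gathers the
same per-lane bits more efficiently with vec_vbpermq:

#include <altivec.h>
#include <stdint.h>

/* Build the 4-bit lane mask that port_groupx4() expects from a
 * per-lane compare; vec_any_eq(dp1, dp2) would instead collapse the
 * whole compare to a single 0/1 value.
 */
static inline int32_t
port_mask_sketch(__vector unsigned short dp1, __vector unsigned short dp2)
{
	__vector unsigned short cmp;

	/* Equal lanes become 0xFFFF, unequal lanes 0x0000. */
	cmp = (__vector unsigned short)vec_cmpeq(dp1, dp2);
	return (cmp[0] & 0x1) | (cmp[1] & 0x2) | (cmp[2] & 0x4) |
	       (cmp[3] & 0x8);
}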

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 v5 Changes:
 - Fix compilation errors.

 v4 Changes:
 - Fix missing `rte_free`.

 v3 Changes:
 - PPC optimize port mask generation.
 - Fix aarch32 compilation.

 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.

 examples/common/altivec/port_group.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..1c05bc025a 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
-
+	__vector unsigned long long result;
+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
+						 0x80808080, 0x80808080};
 	int32_t v;

-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+	dp1 = vec_mergeh(dp1, dp1);
+	result = (__vector unsigned long long)vec_vbpermq(
+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);

+	v = result[1];
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;

--
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v5 2/5] examples/l3fwd: split processing and send stages
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
@ 2022-10-11 10:12         ` pbhagavatula
  2022-10-17 12:06           ` [EXT] " Shijith Thotton
  2022-10-11 10:12         ` [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                           ` (4 subsequent siblings)
  5 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-10-11 10:12 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split the packet processing stage from the packet send stage, as
the send stage is not common to poll and event mode.
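
In practice the poll-mode wrapper reduces to the following (condensed
from the diff below; the event path instead calls the process stage
with do_step3 = 1 and performs its own transmit):

static inline void
l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
		       uint16_t portid, struct lcore_conf *qconf)
{
	uint16_t dst_port[MAX_PKT_BURST];

	/* Stage 1: destination lookup only (do_step3 = 0), shared with
	 * event mode.
	 */
	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port,
				  qconf, 0);
	/* Stage 2: transmit, specific to poll mode. */
	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
}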

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &pkts_burst[j]->port);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-10-11 10:12         ` [PATCH v5 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-10-11 10:12         ` pbhagavatula
  2022-10-17 12:06           ` [EXT] " Shijith Thotton
  2022-10-11 10:12         ` [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
                           ` (3 subsequent siblings)
  5 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-10-11 10:12 UTC (permalink / raw)
  To: jerinj, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use lpm vector path to process event vector.
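
The central helper this relies on is process_dst_port(); its generic
(non-vector) form from the diff below shows the contract that the
SSE/NEON/AltiVec versions implement:

static inline uint16_t
process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
{
	int i;

	/* Return the destination port shared by every packet in the
	 * vector, or BAD_PORT if the ports diverge and the caller must
	 * fall back to per-mbuf attributes.
	 */
	for (i = 0; i < nb_elem; i++) {
		if (dst_ports[i] != dst_ports[0])
			return BAD_PORT;
	}

	return dst_ports[0];
}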

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 39 +++++++++++--------
 examples/l3fwd/l3fwd_neon.h    | 47 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
 5 files changed, 214 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp1;
+		__vector unsigned short dp;
+
+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	int i;
+
+	for (i = 0; i < nb_elem; i++) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+	}
+
+	return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
 
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
+
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index 22d7f61a42..5172979c72 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
+#endif
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -510,6 +516,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_port_list);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..bf365341fb 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,51 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0;
+
+#if defined(RTE_ARCH_ARM64)
+	uint16_t res;
+
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vminvq_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vminv_u16(dp1);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+#endif
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
  2022-10-11 10:12         ` [PATCH v5 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-10-11 10:12         ` [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-10-11 10:12         ` pbhagavatula
  2022-10-17 12:06           ` [EXT] " Shijith Thotton
  2022-10-11 10:12         ` [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
                           ` (2 subsequent siblings)
  5 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-10-11 10:12 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix stack overflow when the event vector size is greater than
MAX_BURST_SIZE.
Add the missing MAC swap and RFC1812 stages.
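
A condensed sketch of the corrected hop assignment (names follow the
diff below; the helper name and shortened loop indices are
illustrative). The FIB default hop no longer overwrites the packet's
port, and the lookup scratch arrays are sized by
evt_rsrc->vector_size instead of MAX_PKT_BURST:

static inline void
fib_assign_hops_sketch(struct rte_event_vector *vec,
		       const uint8_t *type_arr, const uint64_t *hopsv4,
		       const uint64_t *hopsv6, uint16_t *hops)
{
	uint32_t i, v4 = 0, v6 = 0;
	uint16_t nh;

	for (i = 0; i < vec->nb_elem; i++) {
		nh = type_arr[i] ? (uint16_t)hopsv4[v4++] :
				   (uint16_t)hopsv6[v6++];
		/* Keep the original port on a default-hop miss; hops[]
		 * then feeds process_packet()/process_event_vector().
		 */
		if (nh != FIB_DEFAULT_HOP)
			hops[i] = nh;
		else
			hops[i] = vec->attr_valid ? vec->port :
						    vec->mbufs[i]->port;
	}
}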

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_fib.c | 130 ++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 39 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index b82e0c0354..edc0dd69b9 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-	int32_t j;
 	struct rte_ether_hdr *eth_hdr;
 
-	for (j = 0; j < nb_tx; j++) {
-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-				pkts_burst[j], struct rte_ether_hdr *) + 1),
-				&hops[j], pkts_burst[j]->packet_type);
+	rfc1812_process(
+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+						pkt, struct rte_ether_hdr *) +
+					1),
+		hop, pkt->packet_type);
 #endif
 
-		/* Set MAC addresses. */
-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-				struct rte_ether_hdr *);
-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-				&eth_hdr->src_addr);
+	/* Set MAC addresses. */
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
 
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+	int32_t j;
+
+	for (j = 0; j < nb_tx; j++) {
+		process_packet(pkts_burst[j], &hops[j]);
+		if (hops[j] == BAD_PORT) {
+			rte_pktmbuf_free(pkts_burst[j]);
+			continue;
+		}
 		/* Send single packet. */
 		send_single_packet(qconf, pkts_burst[j], hops[j]);
 	}
@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh != FIB_DEFAULT_HOP ?
+						  nh :
+						  events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,10 +521,37 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
-
-	if (event_p_id < 0)
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
 		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE + i];
+
+	if (event_p_id < 0) {
+		rte_free((void *)mem);
+		return;
+	}
 
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__,
 		rte_lcore_id());
@@ -519,10 +571,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -546,6 +597,7 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free((void *)mem);
 }
 
 int __rte_noinline
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                           ` (2 preceding siblings ...)
  2022-10-11 10:12         ` [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-10-11 10:12         ` pbhagavatula
  2022-10-12  8:57           ` [EXT] " Shijith Thotton
  2022-10-17 12:05         ` [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation Shijith Thotton
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
  5 siblings, 1 reply; 41+ messages in thread
From: pbhagavatula @ 2022-10-11 10:12 UTC (permalink / raw)
  To: jerinj; +Cc: dev, Pavan Nikhilesh

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use em vector path to process event vector.
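
One detail worth noting (condensed from the diff below): the
destination-port scratch buffer is heap-allocated per worker because
evt_rsrc->vector_size may exceed MAX_PKT_BURST, so a stack array
would reintroduce the overflow fixed in the previous patch:

	uint16_t *dst_ports;

	/* Sized by the event vector size, not MAX_PKT_BURST. */
	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
				RTE_CACHE_LINE_SIZE);
	if (dst_ports == NULL)
		return;
	/* ... worker loop using dst_ports ... */
	rte_free(dst_ports);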

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 examples/l3fwd/l3fwd_em.c            | 13 +++--
 examples/l3fwd/l3fwd_em.h            | 29 +++++------
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 5 files changed, 48 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index a203dc9e46..35de31157e 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -860,10 +860,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -885,13 +890,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
-							     lconf);
+							     lconf, dst_ports);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -915,6 +919,7 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_ports);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
 	}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
 	struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
 	else
 		m->port = BAD_PORT;
+
+	return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-				     struct lcore_conf *qconf)
+				     struct lcore_conf *qconf,
+				     uint16_t *dst_ports)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-	/* Process first packet to init vector attributes */
-	l3fwd_em_simple_process(mbufs[0], qconf);
-	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
-	}
-
 	/*
 	 * Prefetch and forward already prefetched packets.
 	 */
-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
 		rte_prefetch0(
 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
 	}
 
 	/* Forward remaining prefetched packets */
-	for (; i < vec->nb_elem; i++) {
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	for (; i < vec->nb_elem; i++)
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 3fe38aada0..e21817c36b 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 }
 #endif
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [EXT] [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector
  2022-10-11 10:12         ` [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-12  8:57           ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-12  8:57 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
  Cc: dev, Pavan Nikhilesh Bhagavatula

>From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
>Use em vector path to process event vector.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>[...]
 
For the series:
Acked-by: Shijith Thotton <sthotton@marvell.com>



^ permalink raw reply	[flat|nested] 41+ messages in thread
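
A condensed restatement of the control flow the series converges on may
help when reading the hunks above: the lookup stages now fill a
dst_port[] array, and a single pass decides whether the whole vector can
keep one Tx attribute. The sketch below is distilled from the patch
(process_dst_port() and BAD_PORT come from the l3fwd headers); it is not
additional code in the series:

#include <rte_eventdev.h>

static inline void
handle_vector_sketch(struct rte_event_vector *vec, uint16_t *dst_port)
{
	/* process_dst_port() returns the common port when every
	 * dst_port[] entry matches, else BAD_PORT. */
	uint16_t port = process_dst_port(dst_port, vec->nb_elem);

	if (port != BAD_PORT) {
		/* One Tx attribute covers the whole vector. */
		vec->attr_valid = 1;
		vec->port = port;
		vec->queue = 0;
	} else {
		uint16_t i;

		/* Ports differ: fall back to per-mbuf destinations. */
		vec->attr_valid = 0;
		for (i = 0; i < vec->nb_elem; i++)
			vec->mbufs[i]->port = dst_port[i];
	}
}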

* RE: [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                           ` (3 preceding siblings ...)
  2022-10-11 10:12         ` [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-17 12:05         ` Shijith Thotton
  2022-10-20 16:15           ` Pavan Nikhilesh Bhagavatula
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
  5 siblings, 1 reply; 41+ messages in thread
From: Shijith Thotton @ 2022-10-17 12:05 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
	David Christensen
  Cc: dev, Pavan Nikhilesh Bhagavatula, stable

>
>Fix port group mask generation in altivec, vec_any_eq returns
>0 or 1 while port_groupx4 expects comparison mask result.
>
>Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
>Cc: stable@dpdk.org
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Shijith Thotton <sthotton@marvell.com>

>---
> v5 Changes:
> - Fix compilation errors.
>
> v4 Changes:
> - Fix missing `rte_free`.
>
> v3 Changes:
> - PPC optimize port mask generation.
> - Fix aarch32 compilation.
>
> v2 Changes:
> - Fix PPC, RISC-V, aarch32 compilation.
>
> examples/common/altivec/port_group.h | 11 ++++++++---
> 1 file changed, 8 insertions(+), 3 deletions(-)
>
>diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
>index 5e209b02fa..1c05bc025a 100644
>--- a/examples/common/altivec/port_group.h
>+++ b/examples/common/altivec/port_group.h
>@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
> 		uint16_t u16[FWDSTEP + 1];
> 		uint64_t u64;
> 	} *pnum = (void *)pn;
>-
>+	__vector unsigned long long result;
>+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
>+						 0x80808080, 0x80808080};
> 	int32_t v;
>
>-	v = vec_any_eq(dp1, dp2);
>-
>+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
>+	dp1 = vec_mergeh(dp1, dp1);
>+	result = (__vector unsigned long long)vec_vbpermq(
>+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);
>
>+	v = result[1];
> 	/* update last port counter. */
> 	lp[0] += gptbl[v].lpv;
>
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
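
The root cause is easy to miss if you have not used these intrinsics:
vec_any_eq() is a predicate that collapses the whole comparison into a
single 0/1 int, while the gptbl[] lookup needs one bit per lane. A small
self-contained illustration (arbitrary values, builds with -maltivec):

#include <altivec.h>
#include <stdio.h>

int
main(void)
{
	__vector unsigned short a = {1, 2, 3, 4, 5, 6, 7, 8};
	__vector unsigned short b = {1, 9, 3, 9, 9, 9, 9, 9};

	/* Predicate: a single int, 1 because at least one lane matches. */
	int any = vec_any_eq(a, b);

	/* Per-lane mask: 0xFFFF in matching lanes, 0x0000 elsewhere. */
	__vector unsigned short m = (__vector unsigned short)vec_cmpeq(a, b);

	printf("vec_any_eq=%d lane0=%#x lane1=%#x\n", any,
	       ((unsigned short *)&m)[0], ((unsigned short *)&m)[1]);
	return 0;
}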

* RE: [EXT] [PATCH v5 2/5] examples/l3fwd: split processing and send stages
  2022-10-11 10:12         ` [PATCH v5 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-10-17 12:06           ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-17 12:06 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
	David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh Bhagavatula

>
>Split packet processing from packet send stage, as send stage
>is not common for poll and event mode.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Shijith Thotton <sthotton@marvell.com>

>---
> examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
> examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
> examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
> examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
> 4 files changed, 95 insertions(+), 29 deletions(-)
>
>diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
>index e76f2760b0..12b997e477 100644
>--- a/examples/l3fwd/l3fwd_em_hlm.h
>+++ b/examples/l3fwd/l3fwd_em_hlm.h
>@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
> 	return portid;
> }
>
>-/*
>- * Buffer optimized handling of packets, invoked
>- * from main_loop.
>- */
> static inline void
>-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>-		uint16_t portid, struct lcore_conf *qconf)
>+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>+			 uint16_t *dst_port, uint16_t portid,
>+			 struct lcore_conf *qconf, const uint8_t do_step3)
> {
> 	int32_t i, j, pos;
>-	uint16_t dst_port[MAX_PKT_BURST];
>
> 	/*
> 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
>@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 				dst_port[j + i] = em_get_dst_port(qconf,
> 						pkts_burst[j + i], portid);
> 		}
>+
>+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
>+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
> 	}
>
>-	for (; j < nb_rx; j++)
>+	for (; j < nb_rx; j++) {
> 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &pkts_burst[j]->port);
>+	}
>+}
>
>-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>+/*
>+ * Buffer optimized handling of packets, invoked
>+ * from main_loop.
>+ */
>+static inline void
>+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
>+		      struct lcore_conf *qconf)
>+{
>+	uint16_t dst_port[MAX_PKT_BURST];
>
>+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
>+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> }
>
> /*
>@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
> 	 */
> 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
>
>-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
>+	for (j = 0; j < nb_rx; j++)
> 		pkts_burst[j] = ev[j]->mbuf;
>-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>-					       struct rte_ether_hdr *) + 1);
>-	}
>
> 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
>
>@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
> 			}
> 			continue;
> 		}
>-		processx4_step3(&pkts_burst[j], &dst_port[j]);
>+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
>+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
>
> 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
> 			pkts_burst[j + i]->port = dst_port[j + i];
>diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
>index 0c6852a7bb..adb82f1478 100644
>--- a/examples/l3fwd/l3fwd_lpm_altivec.h
>+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
>@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
>  * from main_loop.
>  */
> static inline void
>-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>-			uint8_t portid, struct lcore_conf *qconf)
>+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>+			  uint8_t portid, uint16_t *dst_port,
>+			  struct lcore_conf *qconf, const uint8_t do_step3)
> {
> 	int32_t j;
>-	uint16_t dst_port[MAX_PKT_BURST];
> 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
> 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
>@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 				ipv4_flag[j / FWDSTEP],
> 				portid, &pkts_burst[j], &dst_port[j]);
>
>+	if (do_step3)
>+		for (j = 0; j != k; j += FWDSTEP)
>+			processx4_step3(&pkts_burst[j], &dst_port[j]);
>+
> 	/* Classify last up to 3 packets one by one */
> 	switch (nb_rx % FWDSTEP) {
> 	case 3:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 		/* fall-through */
> 	case 2:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 		/* fall-through */
> 	case 1:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 		/* fall-through */
> 	}
>+}
>+
>+static inline void
>+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
>+		       struct lcore_conf *qconf)
>+{
>+	uint16_t dst_port[MAX_PKT_BURST];
>
>+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
>+				  0);
> 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> }
>
>diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
>index 78ee83b76c..2a68c4c15e 100644
>--- a/examples/l3fwd/l3fwd_lpm_neon.h
>+++ b/examples/l3fwd/l3fwd_lpm_neon.h
>@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
> 	}
> }
>
>-/*
>- * Buffer optimized handling of packets, invoked
>- * from main_loop.
>- */
> static inline void
>-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>-			uint16_t portid, struct lcore_conf *qconf)
>+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>+			  uint16_t portid, uint16_t *dst_port,
>+			  struct lcore_conf *qconf, const uint8_t do_step3)
> {
> 	int32_t i = 0, j = 0;
>-	uint16_t dst_port[MAX_PKT_BURST];
> 	int32x4_t dip;
> 	uint32_t ipv4_flag;
> 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
>@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
> 							void *));
> 		}
>-
> 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
> 			for (i = 0; i < FWDSTEP; i++) {
> 				rte_prefetch0(rte_pktmbuf_mtod(
>@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
> 			processx4_step2(qconf, dip, ipv4_flag, portid,
> 					&pkts_burst[j], &dst_port[j]);
>+			if (do_step3)
>+				processx4_step3(&pkts_burst[j], &dst_port[j]);
> 		}
>
> 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
> 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
> 				&dst_port[j]);
>+		if (do_step3)
>+			processx4_step3(&pkts_burst[j], &dst_port[j]);
>
> 		j += FWDSTEP;
> 	}
>@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 							void *));
> 			j++;
> 		}
>-
> 		j -= m;
> 		/* Classify last up to 3 packets one by one */
> 		switch (m) {
> 		case 3:
> 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> 						       portid);
>+			if (do_step3)
>+				process_packet(pkts_burst[j], &dst_port[j]);
> 			j++;
> 			/* fallthrough */
> 		case 2:
> 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> 						       portid);
>+			if (do_step3)
>+				process_packet(pkts_burst[j], &dst_port[j]);
> 			j++;
> 			/* fallthrough */
> 		case 1:
> 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> 						       portid);
>+			if (do_step3)
>+				process_packet(pkts_burst[j], &dst_port[j]);
> 		}
> 	}
>+}
>+
>+/*
>+ * Buffer optimized handling of packets, invoked
>+ * from main_loop.
>+ */
>+static inline void
>+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
>+		       struct lcore_conf *qconf)
>+{
>+	uint16_t dst_port[MAX_PKT_BURST];
>
>+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
>+				  0);
> 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> }
>
>diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
>index 3f637a23d1..db15030320 100644
>--- a/examples/l3fwd/l3fwd_lpm_sse.h
>+++ b/examples/l3fwd/l3fwd_lpm_sse.h
>@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
>  * from main_loop.
>  */
> static inline void
>-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>-			uint16_t portid, struct lcore_conf *qconf)
>+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>+			  uint16_t portid, uint16_t *dst_port,
>+			  struct lcore_conf *qconf, const uint8_t do_step3)
> {
> 	int32_t j;
>-	uint16_t dst_port[MAX_PKT_BURST];
> 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
> 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
>@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> 		processx4_step2(qconf, dip[j / FWDSTEP],
> 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
>
>+	if (do_step3)
>+		for (j = 0; j != k; j += FWDSTEP)
>+			processx4_step3(&pkts_burst[j], &dst_port[j]);
>+
> 	/* Classify last up to 3 packets one by one */
> 	switch (nb_rx % FWDSTEP) {
> 	case 3:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 		/* fall-through */
> 	case 2:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 		/* fall-through */
> 	case 1:
> 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
>+		if (do_step3)
>+			process_packet(pkts_burst[j], &dst_port[j]);
> 		j++;
> 	}
>+}
>+
>+static inline void
>+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
>+		       struct lcore_conf *qconf)
>+{
>+	uint16_t dst_port[MAX_PKT_BURST];
>
>+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
>+				  0);
> 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> }
>
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
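
One design point worth spelling out: do_step3 is always passed as a
literal constant, and the helpers are static inline, so the compiler
specializes each call site and drops the dead branch; poll mode keeps
the old code at no cost. A reduced sketch of the pattern, where
rewrite() stands in for the real MAC-swap/RFC1812 stage (names here are
illustrative, not from the patch):

#include <stdint.h>

static inline void
rewrite(uint16_t *dst) /* stands in for the MAC-swap/RFC1812 stage */
{
	(void)dst;
}

static inline void
process(uint16_t *dst_port, int nb_rx, const uint8_t do_step3)
{
	int i;

	for (i = 0; i < nb_rx; i++) {
		dst_port[i] = 1; /* stands in for the LPM/EM lookup */
		if (do_step3)    /* constant-folded at each call site */
			rewrite(&dst_port[i]);
	}
}

/* Poll mode keeps the old behaviour: the rewrite happens later, inside
 * send_packets_multi(). */
static inline void
poll_mode(uint16_t *dst_port, int nb_rx)
{
	process(dst_port, nb_rx, 0);
}

/* Event mode has no common send stage, so rewrite inline. */
static inline void
event_mode(uint16_t *dst_port, int nb_rx)
{
	process(dst_port, nb_rx, 1);
}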

* RE: [EXT] [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-10-11 10:12         ` [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-10-17 12:06           ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-17 12:06 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran,
	David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh Bhagavatula

>
>Use lpm vector path to process event vector.
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Shijith Thotton <sthotton@marvell.com>

>---
> examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
> examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
> examples/l3fwd/l3fwd_lpm.c     | 39 +++++++++++--------
> examples/l3fwd/l3fwd_neon.h    | 47 ++++++++++++++++++++++
> examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
> 5 files changed, 214 insertions(+), 16 deletions(-)
>
>diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
>index 87018f5dbe..e45e138e59 100644
>--- a/examples/l3fwd/l3fwd_altivec.h
>+++ b/examples/l3fwd/l3fwd_altivec.h
>@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
> 	}
> }
>
>+static __rte_always_inline uint16_t
>+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
>+{
>+	uint16_t i = 0, res;
>+
>+	while (nb_elem > 7) {
>+		__vector unsigned short dp1;
>+		__vector unsigned short dp;
>+
>+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
>+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
>+		res = vec_all_eq(dp1, dp);
>+		if (!res)
>+			return BAD_PORT;
>+
>+		nb_elem -= 8;
>+		i += 8;
>+	}
>+
>+	while (nb_elem) {
>+		if (dst_ports[i] != dst_ports[0])
>+			return BAD_PORT;
>+		nb_elem--;
>+		i++;
>+	}
>+
>+	return dst_ports[0];
>+}
>+
> #endif /* _L3FWD_ALTIVEC_H_ */
>diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
>index b93841a16f..3fe38aada0 100644
>--- a/examples/l3fwd/l3fwd_event.h
>+++ b/examples/l3fwd/l3fwd_event.h
>@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
> 	uint64_t vector_tmo_ns;
> };
>
>+#if defined(RTE_ARCH_X86)
>+#include "l3fwd_sse.h"
>+#elif defined __ARM_NEON
>+#include "l3fwd_neon.h"
>+#elif defined(RTE_ARCH_PPC_64)
>+#include "l3fwd_altivec.h"
>+#else
>+static inline uint16_t
>+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
>+{
>+	int i;
>+
>+	for (i = 0; i < nb_elem; i++) {
>+		if (dst_ports[i] != dst_ports[0])
>+			return BAD_PORT;
>+	}
>+
>+	return dst_ports[0];
>+}
>+#endif
>+
> static inline void
> event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
> {
>@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
> 	}
> }
>
>+static inline uint16_t
>+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
>+		   uint16_t nb_pkts)
>+{
>+	uint16_t *des_pos, free = 0;
>+	struct rte_mbuf **pos;
>+	int i;
>+
>+	/* Filter out and free bad packets */
>+	for (i = 0; i < nb_pkts; i++) {
>+		if (dst_port[i] == BAD_PORT) {
>+			rte_pktmbuf_free(mbufs[i]);
>+			if (!free) {
>+				pos = &mbufs[i];
>+				des_pos = &dst_port[i];
>+			}
>+			free++;
>+			continue;
>+		}
>+
>+		if (free) {
>+			*pos = mbufs[i];
>+			pos++;
>+			*des_pos = dst_port[i];
>+			des_pos++;
>+		}
>+	}
>
>+	return nb_pkts - free;
>+}
>+
>+static inline void
>+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
>+{
>+	uint16_t port, i;
>+
>+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
>+	/* Verify destination array */
>+	port = process_dst_port(dst_port, vec->nb_elem);
>+	if (port == BAD_PORT) {
>+		vec->attr_valid = 0;
>+		for (i = 0; i < vec->nb_elem; i++) {
>+			vec->mbufs[i]->port = dst_port[i];
>+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
>+		}
>+	} else {
>+		vec->attr_valid = 1;
>+		vec->port = port;
>+		vec->queue = 0;
>+	}
>+}
>
> struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
> void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
>diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
>index 22d7f61a42..5172979c72 100644
>--- a/examples/l3fwd/l3fwd_lpm.c
>+++ b/examples/l3fwd/l3fwd_lpm.c
>@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
> }
>
> static __rte_always_inline void
>-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
>+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
>+			 uint16_t *dst_port)
> {
> 	struct rte_mbuf **mbufs = vec->mbufs;
> 	int i;
>
>-	/* Process first packet to init vector attributes */
>-	lpm_process_event_pkt(lconf, mbufs[0]);
>+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
> 	if (vec->attr_valid) {
>-		if (mbufs[0]->port != BAD_PORT)
>-			vec->port = mbufs[0]->port;
>-		else
>-			vec->attr_valid = 0;
>+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
>+					  dst_port, lconf, 1);
>+	} else {
>+		for (i = 0; i < vec->nb_elem; i++)
>+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
>+						  &dst_port[i], lconf, 1);
> 	}
>+#else
>+	for (i = 0; i < vec->nb_elem; i++)
>+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
>+#endif
>
>-	for (i = 1; i < vec->nb_elem; i++) {
>-		lpm_process_event_pkt(lconf, mbufs[i]);
>-		event_vector_attr_validate(vec, mbufs[i]);
>-	}
>+	process_event_vector(vec, dst_port);
> }
>
> /* Same eventdev loop for single and burst of vector */
>@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 	struct rte_event events[MAX_PKT_BURST];
> 	int i, nb_enq = 0, nb_deq = 0;
> 	struct lcore_conf *lconf;
>+	uint16_t *dst_port_list;
> 	unsigned int lcore_id;
>
> 	if (event_p_id < 0)
>@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
>
> 	lcore_id = rte_lcore_id();
> 	lconf = &lcore_conf[lcore_id];
>-
>+	dst_port_list =
>+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
>+			    RTE_CACHE_LINE_SIZE);
>+	if (dst_port_list == NULL)
>+		return;
> 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
>
> 	while (!force_quit) {
>@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 				events[i].op = RTE_EVENT_OP_FORWARD;
> 			}
>
>-			lpm_process_event_vector(events[i].vec, lconf);
>-
>-			if (flags & L3FWD_EVENT_TX_DIRECT)
>-				event_vector_txq_set(events[i].vec, 0);
>+			lpm_process_event_vector(events[i].vec, lconf,
>+						 dst_port_list);
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>@@ -510,6 +516,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
>
> 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
> 				   nb_deq, 1);
>+	rte_free(dst_port_list);
> }
>
> int __rte_noinline
>diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
>index ce515e0bc4..bf365341fb 100644
>--- a/examples/l3fwd/l3fwd_neon.h
>+++ b/examples/l3fwd/l3fwd_neon.h
>@@ -194,4 +194,51 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
> 	}
> }
>
>+static __rte_always_inline uint16_t
>+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
>+{
>+	uint16_t i = 0;
>+
>+#if defined(RTE_ARCH_ARM64)
>+	uint16_t res;
>+
>+	while (nb_elem > 7) {
>+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
>+		uint16x8_t dp1;
>+
>+		dp1 = vld1q_u16(&dst_ports[i]);
>+		dp1 = vceqq_u16(dp1, dp);
>+		res = vminvq_u16(dp1);
>+		if (!res)
>+			return BAD_PORT;
>+
>+		nb_elem -= 8;
>+		i += 8;
>+	}
>+
>+	while (nb_elem > 3) {
>+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
>+		uint16x4_t dp1;
>+
>+		dp1 = vld1_u16(&dst_ports[i]);
>+		dp1 = vceq_u16(dp1, dp);
>+		res = vminv_u16(dp1);
>+		if (!res)
>+			return BAD_PORT;
>+
>+		nb_elem -= 4;
>+		i += 4;
>+	}
>+#endif
>+
>+	while (nb_elem) {
>+		if (dst_ports[i] != dst_ports[0])
>+			return BAD_PORT;
>+		nb_elem--;
>+		i++;
>+	}
>+
>+	return dst_ports[0];
>+}
>+
> #endif /* _L3FWD_NEON_H_ */
>diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
>index 0f0d0323a2..083729cdef 100644
>--- a/examples/l3fwd/l3fwd_sse.h
>+++ b/examples/l3fwd/l3fwd_sse.h
>@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
> 	}
> }
>
>+static __rte_always_inline uint16_t
>+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
>+{
>+	uint16_t i = 0, res;
>+
>+	while (nb_elem > 7) {
>+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
>+		__m128i dp1;
>+
>+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
>+		dp1 = _mm_cmpeq_epi16(dp1, dp);
>+		res = _mm_movemask_epi8(dp1);
>+		if (res != 0xFFFF)
>+			return BAD_PORT;
>+
>+		nb_elem -= 8;
>+		i += 8;
>+	}
>+
>+	while (nb_elem > 3) {
>+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
>+		__m128i dp1;
>+
>+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
>+		dp1 = _mm_cmpeq_epi16(dp1, dp);
>+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
>+		res = _mm_movemask_ps((__m128)dp1);
>+		if (res != 0xF)
>+			return BAD_PORT;
>+
>+		nb_elem -= 4;
>+		i += 4;
>+	}
>+
>+	while (nb_elem) {
>+		if (dst_ports[i] != dst_ports[0])
>+			return BAD_PORT;
>+		nb_elem--;
>+		i++;
>+	}
>+
>+	return dst_ports[0];
>+}
>+
> #endif /* _L3FWD_SSE_H_ */
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
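
filter_bad_packets() above is a stable in-place compaction over two
parallel arrays; the write cursor only starts trailing the read cursor
once the first BAD_PORT entry is freed. The same idea on a single array,
as a reference sketch (compact_u16 is a hypothetical name, not part of
the patch):

#include <stdint.h>

static uint16_t
compact_u16(uint16_t *arr, uint16_t n, uint16_t bad)
{
	uint16_t r, w = 0;

	for (r = 0; r < n; r++) {
		if (arr[r] == bad)
			continue; /* the patch frees the mbuf here */
		arr[w++] = arr[r]; /* keeps relative order intact */
	}
	return w; /* new length, as filter_bad_packets() returns */
}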

* RE: [EXT] [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib
  2022-10-11 10:12         ` [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-10-17 12:06           ` Shijith Thotton
  0 siblings, 0 replies; 41+ messages in thread
From: Shijith Thotton @ 2022-10-17 12:06 UTC (permalink / raw)
  To: Pavan Nikhilesh Bhagavatula, Jerin Jacob Kollanukkaran
  Cc: dev, Pavan Nikhilesh Bhagavatula

>
>Fix stack overflow when event vector size is greater than
>MAX_BURST_SIZE.
>Add missing mac swap and rfc1812 stage.
>
>Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")
>
>Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Shijith Thotton <sthotton@marvell.com>

>---
> examples/l3fwd/l3fwd_fib.c | 130 ++++++++++++++++++++++++++-----------
> 1 file changed, 91 insertions(+), 39 deletions(-)
>
>diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
>index b82e0c0354..edc0dd69b9 100644
>--- a/examples/l3fwd/l3fwd_fib.c
>+++ b/examples/l3fwd/l3fwd_fib.c
>@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
>  */
> #if !defined FIB_SEND_MULTI
> static inline void
>-fib_send_single(int nb_tx, struct lcore_conf *qconf,
>-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
>+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
> {
>-	int32_t j;
> 	struct rte_ether_hdr *eth_hdr;
>
>-	for (j = 0; j < nb_tx; j++) {
>-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
>+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
> #if defined DO_RFC_1812_CHECKS
>-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
>-				pkts_burst[j], struct rte_ether_hdr *) + 1),
>-				&hops[j], pkts_burst[j]->packet_type);
>+	rfc1812_process(
>+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
>+						pkt, struct rte_ether_hdr *) +
>+					1),
>+		hop, pkt->packet_type);
> #endif
>
>-		/* Set MAC addresses. */
>-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
>-				struct rte_ether_hdr *);
>-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
>-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
>-				&eth_hdr->src_addr);
>+	/* Set MAC addresses. */
>+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
>+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
>+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
>+}
>
>+static inline void
>+fib_send_single(int nb_tx, struct lcore_conf *qconf,
>+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
>+{
>+	int32_t j;
>+
>+	for (j = 0; j < nb_tx; j++) {
>+		process_packet(pkts_burst[j], &hops[j]);
>+		if (hops[j] == BAD_PORT) {
>+			rte_pktmbuf_free(pkts_burst[j]);
>+			continue;
>+		}
> 		/* Send single packet. */
> 		send_single_packet(qconf, pkts_burst[j], hops[j]);
> 	}
>@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
> 	uint32_t ipv4_arr[MAX_PKT_BURST];
> 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
> 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
>-	uint16_t nh;
>+	uint16_t nh, hops[MAX_PKT_BURST];
> 	uint8_t type_arr[MAX_PKT_BURST];
> 	uint32_t ipv4_cnt, ipv6_cnt;
> 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
>@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
> 			else
> 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
> 			if (nh != FIB_DEFAULT_HOP)
>-				events[i].mbuf->port = nh;
>+				hops[i] = nh != FIB_DEFAULT_HOP ?
>+						  nh :
>+						  events[i].mbuf->port;
>+			process_packet(events[i].mbuf, &hops[i]);
>+			events[i].mbuf->port = hops[i] != BAD_PORT ?
>+						       hops[i] :
>+						       events[i].mbuf->port;
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
> }
>
> static __rte_always_inline void
>-fib_process_event_vector(struct rte_event_vector *vec)
>+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
>+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
>+			 uint32_t *ipv4_arr, uint16_t *hops)
> {
>-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
>-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
> 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
> 	struct rte_mbuf **mbufs = vec->mbufs;
>-	uint32_t ipv4_arr[MAX_PKT_BURST];
>-	uint8_t type_arr[MAX_PKT_BURST];
> 	uint32_t ipv4_cnt, ipv6_cnt;
> 	struct lcore_conf *lconf;
> 	uint16_t nh;
>@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
>
> 	/* Lookup IPv6 hops if IPv6 packets are present. */
> 	if (ipv6_cnt > 0)
>-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
>-				     hopsv6, ipv6_cnt);
>-
>-	if (vec->attr_valid) {
>-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
>-		if (nh != FIB_DEFAULT_HOP)
>-			vec->port = nh;
>-		else
>-			vec->attr_valid = 0;
>-	}
>+		rte_fib6_lookup_bulk(
>+			lconf->ipv6_lookup_struct,
>+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
>+			ipv6_cnt);
>
> 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
> 	for (i = 0; i < vec->nb_elem; i++) {
>@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
> 		else
> 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
> 		if (nh != FIB_DEFAULT_HOP)
>-			mbufs[i]->port = nh;
>-		event_vector_attr_validate(vec, mbufs[i]);
>+			hops[i] = nh;
>+		else
>+			hops[i] = vec->attr_valid ? vec->port :
>+						    vec->mbufs[i]->port;
> 	}
>+
>+#if defined FIB_SEND_MULTI
>+	uint16_t k;
>+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
>+
>+	for (i = 0; i != k; i += FWDSTEP)
>+		processx4_step3(&vec->mbufs[i], &hops[i]);
>+	for (; i < vec->nb_elem; i++)
>+		process_packet(vec->mbufs[i], &hops[i]);
>+#else
>+	for (i = 0; i < vec->nb_elem; i++)
>+		process_packet(vec->mbufs[i], &hops[i]);
>+#endif
>+
>+	process_event_vector(vec, hops);
> }
>
> static __rte_always_inline void
>@@ -496,10 +521,37 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 	const uint8_t event_d_id = evt_rsrc->event_d_id;
> 	const uint16_t deq_len = evt_rsrc->deq_depth;
> 	struct rte_event events[MAX_PKT_BURST];
>+	uint8_t *type_arr, **ipv6_arr, *ptr;
> 	int nb_enq = 0, nb_deq = 0, i;
>-
>-	if (event_p_id < 0)
>+	uint64_t *hopsv4, *hopsv6;
>+	uint32_t *ipv4_arr;
>+	uint16_t *hops;
>+	uintptr_t mem;
>+
>+	mem = (uintptr_t)rte_zmalloc(
>+		"vector_fib",
>+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
>+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
>+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
>+			evt_rsrc->vector_size,
>+		RTE_CACHE_LINE_SIZE);
>+	if (mem == 0)
> 		return;
>+	ipv4_arr = (uint32_t *)mem;
>+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
>+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
>+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
>+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
>+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
>+
>+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
>+	for (i = 0; i < evt_rsrc->vector_size; i++)
>+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE + i];
>+
>+	if (event_p_id < 0) {
>+		rte_free((void *)mem);
>+		return;
>+	}
>
> 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__,
> 		rte_lcore_id());
>@@ -519,10 +571,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
> 				events[i].op = RTE_EVENT_OP_FORWARD;
> 			}
>
>-			fib_process_event_vector(events[i].vec);
>-
>-			if (flags & L3FWD_EVENT_TX_DIRECT)
>-				event_vector_txq_set(events[i].vec, 0);
>+			fib_process_event_vector(events[i].vec, type_arr,
>+						 ipv6_arr, hopsv4, hopsv6,
>+						 ipv4_arr, hops);
> 		}
>
> 		if (flags & L3FWD_EVENT_TX_ENQ) {
>@@ -546,6 +597,7 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
>
> 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
> 				   nb_deq, 1);
>+	rte_free((void *)mem);
> }
>
> int __rte_noinline
>--
>2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
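
The allocation change is the heart of the stack-overflow fix: per-burst
arrays previously sized MAX_PKT_BURST on the stack become one
rte_zmalloc() block sized by vector_size, carved into sub-arrays by
pointer arithmetic, and released with a single rte_free(). The layout
logic restated with plain calloc for readability (a sketch, only four of
the six arrays shown; note it places the widest type first to keep
natural alignment, which the patch orders differently):

#include <stdint.h>
#include <stdlib.h>

struct fib_scratch {
	uint64_t *hopsv4; /* vector_size entries */
	uint32_t *ipv4;   /* vector_size entries */
	uint16_t *hops;   /* vector_size entries */
	uint8_t *type;    /* vector_size entries */
};

static int
fib_scratch_alloc(struct fib_scratch *s, uint16_t vector_size)
{
	size_t sz = (sizeof(uint64_t) + sizeof(uint32_t) +
		     sizeof(uint16_t) + sizeof(uint8_t)) * vector_size;
	uint8_t *mem = calloc(1, sz);

	if (mem == NULL)
		return -1;
	/* Widest type first so every sub-array stays naturally aligned;
	 * one free(mem) later releases everything at once. */
	s->hopsv4 = (uint64_t *)mem;
	s->ipv4 = (uint32_t *)&s->hopsv4[vector_size];
	s->hops = (uint16_t *)&s->ipv4[vector_size];
	s->type = (uint8_t *)&s->hops[vector_size];
	return 0;
}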

* RE: [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation
  2022-10-17 12:05         ` [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation Shijith Thotton
@ 2022-10-20 16:15           ` Pavan Nikhilesh Bhagavatula
  0 siblings, 0 replies; 41+ messages in thread
From: Pavan Nikhilesh Bhagavatula @ 2022-10-20 16:15 UTC (permalink / raw)
  To: Shijith Thotton, Jerin Jacob Kollanukkaran, David Christensen; +Cc: dev, stable



> -----Original Message-----
> From: Shijith Thotton <sthotton@marvell.com>
> Sent: Monday, October 17, 2022 5:36 PM
> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; David Christensen
> <drc@linux.vnet.ibm.com>
> Cc: dev@dpdk.org; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; stable@dpdk.org
> Subject: RE: [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask
> generation
> 
> >
> >Fix port group mask generation in altivec, vec_any_eq returns
> >0 or 1 while port_groupx4 expects comparison mask result.
> >
> >Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on
> powerpc")
> >Cc: stable@dpdk.org
> >
> >Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Acked-by: Shijith Thotton <sthotton@marvell.com>
> 

Thomas, 

Will this series make it into 22.11 release?

> >---
> > v5 Changes:
> > - Fix compilation errors.
> >
> > v4 Changes:
> > - Fix missing `rte_free`.
> >
> > v3 Changes:
> > - PPC optimize port mask generation.
> > - Fix aarch32 compilation.
> >
> > v2 Changes:
> > - Fix PPC, RISC-V, aarch32 compilation.
> >
> > examples/common/altivec/port_group.h | 11 ++++++++---
> > 1 file changed, 8 insertions(+), 3 deletions(-)
> >
> >diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
> >index 5e209b02fa..1c05bc025a 100644
> >--- a/examples/common/altivec/port_group.h
> >+++ b/examples/common/altivec/port_group.h
> >@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
> > 		uint16_t u16[FWDSTEP + 1];
> > 		uint64_t u64;
> > 	} *pnum = (void *)pn;
> >-
> >+	__vector unsigned long long result;
> >+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
> >+						 0x80808080, 0x80808080};
> > 	int32_t v;
> >
> >-	v = vec_any_eq(dp1, dp2);
> >-
> >+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
> >+	dp1 = vec_mergeh(dp1, dp1);
> >+	result = (__vector unsigned long long)vec_vbpermq(
> >+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);
> >
> >+	v = result[1];
> > 	/* update last port counter. */
> > 	lp[0] += gptbl[v].lpv;
> >
> >--
> >2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v6 1/5] examples/l3fwd: fix port group mask generation
  2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
                           ` (4 preceding siblings ...)
  2022-10-17 12:05         ` [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation Shijith Thotton
@ 2022-10-25 16:05         ` pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 2/5] examples/l3fwd: split processing and send stages pbhagavatula
                             ` (4 more replies)
  5 siblings, 5 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-25 16:05 UTC (permalink / raw)
  To: jerinj, thomas, David Christensen
  Cc: dev, Pavan Nikhilesh, stable, Shijith Thotton

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix port group mask generation in altivec, vec_any_eq returns
0 or 1 while port_groupx4 expects comparison mask result.

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: stable@dpdk.org

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
 v6 Changes:
 - Minor optimization to process_dst_port NEON.

 v5 Changes:
 - Fix compilation errors.

 v4 Changes:
 - Fix missing `rte_free`.

 v3 Changes:
 - PPC optimize port mask generation.
 - Fix aarch32 compilation.

 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.

 examples/common/altivec/port_group.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
index 5e209b02fa..1c05bc025a 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,17 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
 		uint16_t u16[FWDSTEP + 1];
 		uint64_t u64;
 	} *pnum = (void *)pn;
-
+	__vector unsigned long long result;
+	const __vector unsigned int perm_mask = {0x00204060, 0x80808080,
+						 0x80808080, 0x80808080};
 	int32_t v;

-	v = vec_any_eq(dp1, dp2);
-
+	dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+	dp1 = vec_mergeh(dp1, dp1);
+	result = (__vector unsigned long long)vec_vbpermq(
+		(__vector unsigned char)dp1, (__vector unsigned char)perm_mask);

+	v = result[1];
 	/* update last port counter. */
 	lp[0] += gptbl[v].lpv;

--
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
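
For reference, the value this vector sequence must produce is the same
4-bit adjacent-port-equality mask the SSE/NEON variants compute. My
reading of the intrinsics: after vec_mergeh each 16-bit compare result
is widened to 32 bits, and the perm_mask bytes 0x00/0x20/0x40/0x60 are
vbpermq bit indices 0, 32, 64 and 96, picking one bit per result (0x80
selects a constant zero). A scalar equivalent, as a sketch rather than
patch code:

#include <stdint.h>

/* Bit i of v is set when dp1[i] == dp2[i], for FWDSTEP (4) adjacent
 * destination ports; port_groupx4() then indexes gptbl[] with v. */
static int
port_group_mask(const uint16_t dp1[4], const uint16_t dp2[4])
{
	int i, v = 0;

	for (i = 0; i < 4; i++)
		v |= (dp1[i] == dp2[i]) << i;
	return v;
}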

* [PATCH v6 2/5] examples/l3fwd: split processing and send stages
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
@ 2022-10-25 16:05           ` pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
                             ` (3 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-25 16:05 UTC (permalink / raw)
  To: jerinj, thomas, David Christensen, Ruifeng Wang,
	Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh, Shijith Thotton

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Split packet processing from packet send stage, as send stage
is not common for poll and event mode.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
 examples/l3fwd/l3fwd_em_hlm.h      | 39 +++++++++++++++++++-----------
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ++++++++++++++++---
 examples/l3fwd/l3fwd_lpm_neon.h    | 35 ++++++++++++++++++++-------
 examples/l3fwd/l3fwd_lpm_sse.h     | 25 ++++++++++++++++---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
 	return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-		uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			 uint16_t *dst_port, uint16_t portid,
+			 struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i, j, pos;
-	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				dst_port[j + i] = em_get_dst_port(qconf,
 						pkts_burst[j + i], portid);
 		}
+
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 	}
 
-	for (; j < nb_rx; j++)
+	for (; j < nb_rx; j++) {
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &pkts_burst[j]->port);
+	}
+}
 
-	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		      struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+	for (j = 0; j < nb_rx; j++)
 		pkts_burst[j] = ev[j]->mbuf;
-		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-					       struct rte_ether_hdr *) + 1);
-	}
 
 	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 			}
 			continue;
 		}
-		processx4_step3(&pkts_burst[j], &dst_port[j]);
+		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+			processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
 		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
 			pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint8_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 				ipv4_flag[j / FWDSTEP],
 				portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint8_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
index 78ee83b76c..2a68c4c15e 100644
--- a/examples/l3fwd/l3fwd_lpm_neon.h
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -80,16 +80,12 @@ processx4_step2(const struct lcore_conf *qconf,
 	}
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t i = 0, j = 0;
-	uint16_t dst_port[MAX_PKT_BURST];
 	int32x4_t dip;
 	uint32_t ipv4_flag;
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -100,7 +96,6 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
 							void *));
 		}
-
 		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
 			for (i = 0; i < FWDSTEP; i++) {
 				rte_prefetch0(rte_pktmbuf_mtod(
@@ -111,11 +106,15 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 			processx4_step2(qconf, dip, ipv4_flag, portid,
 					&pkts_burst[j], &dst_port[j]);
+			if (do_step3)
+				processx4_step3(&pkts_burst[j], &dst_port[j]);
 		}
 
 		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
 		processx4_step2(qconf, dip, ipv4_flag, portid, &pkts_burst[j],
 				&dst_port[j]);
+		if (do_step3)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
 
 		j += FWDSTEP;
 	}
@@ -138,26 +137,44 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 							void *));
 			j++;
 		}
-
 		j -= m;
 		/* Classify last up to 3 packets one by one */
 		switch (m) {
 		case 3:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 2:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 			j++;
 			/* fallthrough */
 		case 1:
 			dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
 						       portid);
+			if (do_step3)
+				process_packet(pkts_burst[j], &dst_port[j]);
 		}
 	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
diff --git a/examples/l3fwd/l3fwd_lpm_sse.h b/examples/l3fwd/l3fwd_lpm_sse.h
index 3f637a23d1..db15030320 100644
--- a/examples/l3fwd/l3fwd_lpm_sse.h
+++ b/examples/l3fwd/l3fwd_lpm_sse.h
@@ -82,11 +82,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-			uint16_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			  uint16_t portid, uint16_t *dst_port,
+			  struct lcore_conf *qconf, const uint8_t do_step3)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
 	__m128i dip[MAX_PKT_BURST / FWDSTEP];
 	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
 	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -99,21 +99,40 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		processx4_step2(qconf, dip[j / FWDSTEP],
 				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j], &dst_port[j]);
 
+	if (do_step3)
+		for (j = 0; j != k; j += FWDSTEP)
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
 	/* Classify last up to 3 packets one by one */
 	switch (nb_rx % FWDSTEP) {
 	case 3:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 2:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 		/* fall-through */
 	case 1:
 		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		if (do_step3)
+			process_packet(pkts_burst[j], &dst_port[j]);
 		j++;
 	}
+}
+
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+		       struct lcore_conf *qconf)
+{
+	uint16_t dst_port[MAX_PKT_BURST];
 
+	l3fwd_lpm_process_packets(nb_rx, pkts_burst, portid, dst_port, qconf,
+				  0);
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
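
A subtlety in the em_hlm hunk: processx4_step3() rewrites exactly
FWDSTEP (4) packets, while a hash-lookup group holds
EM_HASH_LOOKUP_COUNT packets (8 on the vector paths, as far as I can
tell), so the rewrite loop has to stride across the group rather than
fire once. A minimal sketch of the stepping, with stand-in names:

#include <stdint.h>

#define GROUP 8 /* stands in for EM_HASH_LOOKUP_COUNT */
#define STEP  4 /* stands in for FWDSTEP */

/* Each step4() call covers STEP entries; firing it once per GROUP
 * would leave entries STEP..GROUP-1 untouched whenever GROUP > STEP. */
static void
rewrite_group(uint16_t *dst_port, void (*step4)(uint16_t *))
{
	int i;

	for (i = 0; i < GROUP; i += STEP)
		step4(&dst_port[i]);
}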

* [PATCH v6 3/5] examples/l3fwd: use lpm vector path for event vector
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 2/5] examples/l3fwd: split processing and send stages pbhagavatula
@ 2022-10-25 16:05           ` pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
                             ` (2 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-25 16:05 UTC (permalink / raw)
  To: jerinj, thomas, David Christensen, Ruifeng Wang,
	Bruce Richardson, Konstantin Ananyev
  Cc: dev, Pavan Nikhilesh, Shijith Thotton

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use lpm vector path to process event vector.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
 examples/l3fwd/l3fwd_altivec.h | 29 ++++++++++++++
 examples/l3fwd/l3fwd_event.h   | 71 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/l3fwd_lpm.c     | 39 +++++++++++--------
 examples/l3fwd/l3fwd_neon.h    | 48 +++++++++++++++++++++++
 examples/l3fwd/l3fwd_sse.h     | 44 +++++++++++++++++++++
 5 files changed, 215 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__vector unsigned short dp1;
+		__vector unsigned short dp;
+
+		dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+		dp1 = *((__vector unsigned short *)&dst_ports[i]);
+		res = vec_all_eq(dp1, dp);
+		if (!res)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
 	uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	int i;
+
+	for (i = 0; i < nb_elem; i++) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+	}
+
+	return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
 	}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+		   uint16_t nb_pkts)
+{
+	uint16_t *des_pos, free = 0;
+	struct rte_mbuf **pos;
+	int i;
+
+	/* Filter out and free bad packets */
+	for (i = 0; i < nb_pkts; i++) {
+		if (dst_port[i] == BAD_PORT) {
+			rte_pktmbuf_free(mbufs[i]);
+			if (!free) {
+				pos = &mbufs[i];
+				des_pos = &dst_port[i];
+			}
+			free++;
+			continue;
+		}
+
+		if (free) {
+			*pos = mbufs[i];
+			pos++;
+			*des_pos = dst_port[i];
+			des_pos++;
+		}
+	}
 
+	return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+	uint16_t port, i;
+
+	vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+	/* Verify destination array */
+	port = process_dst_port(dst_port, vec->nb_elem);
+	if (port == BAD_PORT) {
+		vec->attr_valid = 0;
+		for (i = 0; i < vec->nb_elem; i++) {
+			vec->mbufs[i]->port = dst_port[i];
+			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+		}
+	} else {
+		vec->attr_valid = 1;
+		vec->port = port;
+		vec->queue = 0;
+	}
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index 22d7f61a42..5172979c72 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+			 uint16_t *dst_port)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int i;
 
-	/* Process first packet to init vector attributes */
-	lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
 	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
+		l3fwd_lpm_process_packets(vec->nb_elem, mbufs, vec->port,
+					  dst_port, lconf, 1);
+	} else {
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_lpm_process_packets(1, &mbufs[i], mbufs[i]->port,
+						  &dst_port[i], lconf, 1);
 	}
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		dst_port[i] = lpm_process_event_pkt(lconf, mbufs[i]);
+#endif
 
-	for (i = 1; i < vec->nb_elem; i++) {
-		lpm_process_event_pkt(lconf, mbufs[i]);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	process_event_vector(vec, dst_port);
 }
 
 /* Same eventdev loop for single and burst of vector */
@@ -458,6 +461,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	struct rte_event events[MAX_PKT_BURST];
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
+	uint16_t *dst_port_list;
 	unsigned int lcore_id;
 
 	if (event_p_id < 0)
@@ -465,7 +469,11 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
-
+	dst_port_list =
+		rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+			    RTE_CACHE_LINE_SIZE);
+	if (dst_port_list == NULL)
+		return;
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__, lcore_id);
 
 	while (!force_quit) {
@@ -483,10 +491,8 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			lpm_process_event_vector(events[i].vec, lconf);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			lpm_process_event_vector(events[i].vec, lconf,
+						 dst_port_list);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -510,6 +516,7 @@ lpm_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_port_list);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index ce515e0bc4..40807d5965 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -194,4 +194,52 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0;
+
+#if defined(RTE_ARCH_ARM64)
+	uint64_t res;
+
+	while (nb_elem > 7) {
+		uint16x8_t dp = vdupq_n_u16(dst_ports[0]);
+		uint16x8_t dp1;
+
+		dp1 = vld1q_u16(&dst_ports[i]);
+		dp1 = vceqq_u16(dp1, dp);
+		res = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(dp1, 4)),
+				    0);
+		if (res != ~0ULL)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		uint16x4_t dp = vdup_n_u16(dst_ports[0]);
+		uint16x4_t dp1;
+
+		dp1 = vld1_u16(&dst_ports[i]);
+		dp1 = vceq_u16(dp1, dp);
+		res = vget_lane_u64(vreinterpret_u64_u16(dp1), 0);
+		if (res != ~0ULL)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+#endif
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_NEON_H_ */
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index 0f0d0323a2..083729cdef 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -194,4 +194,48 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 	}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+	uint16_t i = 0, res;
+
+	while (nb_elem > 7) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		res = _mm_movemask_epi8(dp1);
+		if (res != 0xFFFF)
+			return BAD_PORT;
+
+		nb_elem -= 8;
+		i += 8;
+	}
+
+	while (nb_elem > 3) {
+		__m128i dp = _mm_set1_epi16(dst_ports[0]);
+		__m128i dp1;
+
+		dp1 = _mm_loadu_si128((__m128i *)&dst_ports[i]);
+		dp1 = _mm_cmpeq_epi16(dp1, dp);
+		dp1 = _mm_unpacklo_epi16(dp1, dp1);
+		res = _mm_movemask_ps((__m128)dp1);
+		if (res != 0xF)
+			return BAD_PORT;
+
+		nb_elem -= 4;
+		i += 4;
+	}
+
+	while (nb_elem) {
+		if (dst_ports[i] != dst_ports[0])
+			return BAD_PORT;
+		nb_elem--;
+		i++;
+	}
+
+	return dst_ports[0];
+}
+
 #endif /* _L3FWD_SSE_H_ */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
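
The v6 NEON change swaps the vminvq_u16 horizontal minimum for a
narrowing shift: after vceqq_u16 every lane is 0xFFFF or 0x0000, so
vshrn_n_u16(…, 4) packs the eight lane results into one 64-bit scalar
(0xFF or 0x00 per byte) that a single compare against ~0ULL can test. A
standalone sketch of the idiom (the function name is mine):

#include <arm_neon.h>
#include <stdint.h>

static inline int
all_lanes_equal(uint16x8_t a, uint16x8_t b)
{
	/* Per-lane 0xFFFF/0x0000 mask. */
	uint16x8_t eq = vceqq_u16(a, b);
	/* 0xFFFF >> 4 narrowed to u8 is 0xFF; 0x0000 stays 0x00, so the
	 * whole mask collapses into one 64-bit value. */
	uint64_t r = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(eq, 4)), 0);

	return r == ~0ULL;
}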

* [PATCH v6 4/5] examples/l3fwd: fix event vector processing in fib
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 2/5] examples/l3fwd: split processing and send stages pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
@ 2022-10-25 16:05           ` pbhagavatula
  2022-10-25 16:05           ` [PATCH v6 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
  2022-10-31 14:52           ` [PATCH v6 1/5] examples/l3fwd: fix port group mask generation Thomas Monjalon
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-25 16:05 UTC (permalink / raw)
  To: jerinj, thomas; +Cc: dev, Pavan Nikhilesh, Shijith Thotton

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Fix stack overflow when event vector size is greater than
MAX_BURST_SIZE.
Add missing mac swap and rfc1812 stage.

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
 examples/l3fwd/l3fwd_fib.c | 130 ++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 39 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index b82e0c0354..edc0dd69b9 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,37 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-	int32_t j;
 	struct rte_ether_hdr *eth_hdr;
 
-	for (j = 0; j < nb_tx; j++) {
-		/* Run rfc1812 if packet is ipv4 and checks enabled. */
+	/* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-		rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-				pkts_burst[j], struct rte_ether_hdr *) + 1),
-				&hops[j], pkts_burst[j]->packet_type);
+	rfc1812_process(
+		(struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+						pkt, struct rte_ether_hdr *) +
+					1),
+		hop, pkt->packet_type);
 #endif
 
-		/* Set MAC addresses. */
-		eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-				struct rte_ether_hdr *);
-		*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-		rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-				&eth_hdr->src_addr);
+	/* Set MAC addresses. */
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+	*(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+	rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
 
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+		struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+	int32_t j;
+
+	for (j = 0; j < nb_tx; j++) {
+		process_packet(pkts_burst[j], &hops[j]);
+		if (hops[j] == BAD_PORT) {
+			rte_pktmbuf_free(pkts_burst[j]);
+			continue;
+		}
 		/* Send single packet. */
 		send_single_packet(qconf, pkts_burst[j], hops[j]);
 	}
@@ -261,7 +271,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 	uint32_t ipv4_arr[MAX_PKT_BURST];
 	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
 	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-	uint16_t nh;
+	uint16_t nh, hops[MAX_PKT_BURST];
 	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +360,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
 			else
 				nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 			if (nh != FIB_DEFAULT_HOP)
-				events[i].mbuf->port = nh;
+				hops[i] = nh != FIB_DEFAULT_HOP ?
+						  nh :
+						  events[i].mbuf->port;
+			process_packet(events[i].mbuf, &hops[i]);
+			events[i].mbuf->port = hops[i] != BAD_PORT ?
+						       hops[i] :
+						       events[i].mbuf->port;
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +434,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+			 uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+			 uint32_t *ipv4_arr, uint16_t *hops)
 {
-	uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-	uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
 	uint32_t ipv4_arr_assem, ipv6_arr_assem;
 	struct rte_mbuf **mbufs = vec->mbufs;
-	uint32_t ipv4_arr[MAX_PKT_BURST];
-	uint8_t type_arr[MAX_PKT_BURST];
 	uint32_t ipv4_cnt, ipv6_cnt;
 	struct lcore_conf *lconf;
 	uint16_t nh;
@@ -463,16 +477,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
 	/* Lookup IPv6 hops if IPv6 packets are present. */
 	if (ipv6_cnt > 0)
-		rte_fib6_lookup_bulk(lconf->ipv6_lookup_struct, ipv6_arr,
-				     hopsv6, ipv6_cnt);
-
-	if (vec->attr_valid) {
-		nh = type_arr[0] ? (uint16_t)hopsv4[0] : (uint16_t)hopsv6[0];
-		if (nh != FIB_DEFAULT_HOP)
-			vec->port = nh;
-		else
-			vec->attr_valid = 0;
-	}
+		rte_fib6_lookup_bulk(
+			lconf->ipv6_lookup_struct,
+			(uint8_t(*)[RTE_FIB6_IPV6_ADDR_SIZE])ipv6_arr, hopsv6,
+			ipv6_cnt);
 
 	/* Assign ports looked up in fib depending on IPv4 or IPv6 */
 	for (i = 0; i < vec->nb_elem; i++) {
@@ -481,9 +489,26 @@ fib_process_event_vector(struct rte_event_vector *vec)
 		else
 			nh = (uint16_t)hopsv6[ipv6_arr_assem++];
 		if (nh != FIB_DEFAULT_HOP)
-			mbufs[i]->port = nh;
-		event_vector_attr_validate(vec, mbufs[i]);
+			hops[i] = nh;
+		else
+			hops[i] = vec->attr_valid ? vec->port :
+						    vec->mbufs[i]->port;
 	}
+
+#if defined FIB_SEND_MULTI
+	uint16_t k;
+	k = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != k; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &hops[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#else
+	for (i = 0; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &hops[i]);
+#endif
+
+	process_event_vector(vec, hops);
 }
 
 static __rte_always_inline void
@@ -496,10 +521,37 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	const uint8_t event_d_id = evt_rsrc->event_d_id;
 	const uint16_t deq_len = evt_rsrc->deq_depth;
 	struct rte_event events[MAX_PKT_BURST];
+	uint8_t *type_arr, **ipv6_arr, *ptr;
 	int nb_enq = 0, nb_deq = 0, i;
-
-	if (event_p_id < 0)
+	uint64_t *hopsv4, *hopsv6;
+	uint32_t *ipv4_arr;
+	uint16_t *hops;
+	uintptr_t mem;
+
+	mem = (uintptr_t)rte_zmalloc(
+		"vector_fib",
+		(sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint64_t) +
+		 sizeof(uint64_t) + sizeof(uint16_t) + sizeof(uint8_t *) +
+		 (sizeof(uint8_t) * RTE_FIB6_IPV6_ADDR_SIZE)) *
+			evt_rsrc->vector_size,
+		RTE_CACHE_LINE_SIZE);
+	if (mem == 0)
 		return;
+	ipv4_arr = (uint32_t *)mem;
+	type_arr = (uint8_t *)&ipv4_arr[evt_rsrc->vector_size];
+	hopsv4 = (uint64_t *)&type_arr[evt_rsrc->vector_size];
+	hopsv6 = (uint64_t *)&hopsv4[evt_rsrc->vector_size];
+	hops = (uint16_t *)&hopsv6[evt_rsrc->vector_size];
+	ipv6_arr = (uint8_t **)&hops[evt_rsrc->vector_size];
+
+	ptr = (uint8_t *)&ipv6_arr[evt_rsrc->vector_size];
+	for (i = 0; i < evt_rsrc->vector_size; i++)
+		ipv6_arr[i] = &ptr[RTE_FIB6_IPV6_ADDR_SIZE + i];
+
+	if (event_p_id < 0) {
+		rte_free((void *)mem);
+		return;
+	}
 
 	RTE_LOG(INFO, L3FWD, "entering %s on lcore %u\n", __func__,
 		rte_lcore_id());
@@ -519,10 +571,9 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 				events[i].op = RTE_EVENT_OP_FORWARD;
 			}
 
-			fib_process_event_vector(events[i].vec);
-
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
+			fib_process_event_vector(events[i].vec, type_arr,
+						 ipv6_arr, hopsv4, hopsv6,
+						 ipv4_arr, hops);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -546,6 +597,7 @@ fib_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free((void *)mem);
 }
 
 int __rte_noinline
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
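
The stack overflow fixed here came from fib_process_event_vector() keeping
MAX_PKT_BURST-sized arrays on the stack while vec->nb_elem is bounded by the
configured event vector size, which may be larger. The replacement pattern is
a single cache-line-aligned rte_zmalloc() sized for all of the scratch arrays
and carved into typed slices; a reduced sketch of the idea (two arrays instead
of the six in the diff above, and illustrative names):

	#include <rte_malloc.h>

	/* One allocation holds every per-element scratch array. */
	uintptr_t mem = (uintptr_t)rte_zmalloc("vector_scratch",
			(sizeof(uint32_t) + sizeof(uint16_t)) * vector_size,
			RTE_CACHE_LINE_SIZE);
	if (mem == 0)
		return;

	uint32_t *ipv4_arr = (uint32_t *)mem;		     /* vector_size entries */
	uint16_t *hops = (uint16_t *)&ipv4_arr[vector_size]; /* vector_size entries */

Sizing the buffers from evt_rsrc->vector_size rather than MAX_PKT_BURST is
what removes the overflow; allocating once before the dequeue loop keeps the
fix off the hot path, and a single rte_free((void *)mem) on the exit paths
releases everything at once.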

* [PATCH v6 5/5] examples/l3fwd: use em vector path for event vector
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
                             ` (2 preceding siblings ...)
  2022-10-25 16:05           ` [PATCH v6 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
@ 2022-10-25 16:05           ` pbhagavatula
  2022-10-31 14:52           ` [PATCH v6 1/5] examples/l3fwd: fix port group mask generation Thomas Monjalon
  4 siblings, 0 replies; 41+ messages in thread
From: pbhagavatula @ 2022-10-25 16:05 UTC (permalink / raw)
  To: jerinj, thomas; +Cc: dev, Pavan Nikhilesh, Shijith Thotton

From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use em vector path to process event vector.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Shijith Thotton <sthotton@marvell.com>
---
 examples/l3fwd/l3fwd_em.c            | 13 +++--
 examples/l3fwd/l3fwd_em.h            | 29 +++++------
 examples/l3fwd/l3fwd_em_hlm.h        | 72 +++++-----------------------
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++++++----
 examples/l3fwd/l3fwd_event.h         | 21 --------
 5 files changed, 48 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index a203dc9e46..35de31157e 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -860,10 +860,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 	int i, nb_enq = 0, nb_deq = 0;
 	struct lcore_conf *lconf;
 	unsigned int lcore_id;
+	uint16_t *dst_ports;
 
 	if (event_p_id < 0)
 		return;
 
+	dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+				RTE_CACHE_LINE_SIZE);
+	if (dst_ports == NULL)
+		return;
 	lcore_id = rte_lcore_id();
 	lconf = &lcore_conf[lcore_id];
 
@@ -885,13 +890,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 			}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-			l3fwd_em_process_event_vector(events[i].vec, lconf);
+			l3fwd_em_process_event_vector(events[i].vec, lconf,
+						      dst_ports);
 #else
 			l3fwd_em_no_opt_process_event_vector(events[i].vec,
-							     lconf);
+							     lconf, dst_ports);
 #endif
-			if (flags & L3FWD_EVENT_TX_DIRECT)
-				event_vector_txq_set(events[i].vec, 0);
 		}
 
 		if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -915,6 +919,7 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
 
 	l3fwd_event_worker_cleanup(event_d_id, event_p_id, events, nb_enq,
 				   nb_deq, 1);
+	rte_free(dst_ports);
 }
 
 int __rte_noinline
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
 	}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
 	struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 		m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
 	else
 		m->port = BAD_PORT;
+
+	return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-				     struct lcore_conf *qconf)
+				     struct lcore_conf *qconf,
+				     uint16_t *dst_ports)
 {
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
 	for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
 		rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-	/* Process first packet to init vector attributes */
-	l3fwd_em_simple_process(mbufs[0], qconf);
-	if (vec->attr_valid) {
-		if (mbufs[0]->port != BAD_PORT)
-			vec->port = mbufs[0]->port;
-		else
-			vec->attr_valid = 0;
-	}
-
 	/*
 	 * Prefetch and forward already prefetched packets.
 	 */
-	for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+	for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
 		rte_prefetch0(
 			rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
 	}
 
 	/* Forward remaining prefetched packets */
-	for (; i < vec->nb_elem; i++) {
-		l3fwd_em_simple_process(mbufs[i], qconf);
-		event_vector_attr_validate(vec, mbufs[i]);
-	}
+	for (; i < vec->nb_elem; i++)
+		dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_port)
 {
-	struct rte_mbuf **mbufs = vec->mbufs;
-	uint16_t dst_port[MAX_PKT_BURST];
-	int32_t i, j, n, pos;
-
-	for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < vec->nb_elem; j++)
-		rte_prefetch0(
-			rte_pktmbuf_mtod(mbufs[j], struct rte_ether_hdr *) + 1);
+	uint16_t i;
 
 	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
-	n = RTE_ALIGN_FLOOR(vec->nb_elem, EM_HASH_LOOKUP_COUNT);
-	for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
-		uint32_t pkt_type =
-			RTE_PTYPE_L3_MASK | RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP;
-		uint32_t l3_type, tcp_or_udp;
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
-			pkt_type &= mbufs[j + i]->packet_type;
-
-		l3_type = pkt_type & RTE_PTYPE_L3_MASK;
-		tcp_or_udp = pkt_type & (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
-
-		for (i = 0, pos = j + EM_HASH_LOOKUP_COUNT;
-		     i < EM_HASH_LOOKUP_COUNT && pos < vec->nb_elem;
-		     i++, pos++) {
-			rte_prefetch0(rte_pktmbuf_mtod(mbufs[pos],
-						       struct rte_ether_hdr *) +
-				      1);
-		}
-
-		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
-			em_get_dst_port_ipv4xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV6)) {
-			em_get_dst_port_ipv6xN_events(qconf, &mbufs[j],
-						      &dst_port[j]);
-		} else {
-			for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-				mbufs[j + i]->port =
-					em_get_dst_port(qconf, mbufs[j + i],
-							mbufs[j + i]->port);
-				process_packet(mbufs[j + i],
-					       &mbufs[j + i]->port);
-				event_vector_attr_validate(vec, mbufs[j + i]);
-			}
-			continue;
-		}
-		processx4_step3(&mbufs[j], &dst_port[j]);
-
-		for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++) {
-			mbufs[j + i]->port = dst_port[j + i];
-			event_vector_attr_validate(vec, mbufs[j + i]);
-		}
-	}
-
-	for (; j < vec->nb_elem; j++) {
-		mbufs[j]->port =
-			em_get_dst_port(qconf, mbufs[j], mbufs[j]->port);
-		process_packet(mbufs[j], &mbufs[j]->port);
-		event_vector_attr_validate(vec, mbufs[j]);
-	}
+		l3fwd_em_process_packets(vec->nb_elem, vec->mbufs, dst_port,
+					 vec->port, qconf, 1);
+	else
+		for (i = 0; i < vec->nb_elem; i++)
+			l3fwd_em_process_packets(1, &vec->mbufs[i],
+						 &dst_port[i],
+						 vec->mbufs[i]->port, qconf, 1);
+
+	process_event_vector(vec, dst_port);
 }
 
 #endif /* __L3FWD_EM_HLM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index d2f75edb8a..067f23889a 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -113,39 +113,48 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **events,
 
 	for (i = 1, j = 0; j < nb_rx; i++, j++) {
 		struct rte_mbuf *mbuf = events[j]->mbuf;
+		uint16_t port;
 
 		if (i < nb_rx) {
 			rte_prefetch0(rte_pktmbuf_mtod(
 					events[i]->mbuf,
 					struct rte_ether_hdr *) + 1);
 		}
+		port = mbuf->port;
 		mbuf->port = em_get_dst_port(qconf, mbuf, mbuf->port);
 		process_packet(mbuf, &mbuf->port);
+		if (mbuf->port == BAD_PORT)
+			mbuf->port = port;
 	}
 }
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
-			      struct lcore_conf *qconf)
+			      struct lcore_conf *qconf, uint16_t *dst_ports)
 {
+	const uint8_t attr_valid = vec->attr_valid;
 	struct rte_mbuf **mbufs = vec->mbufs;
 	int32_t i, j;
 
 	rte_prefetch0(rte_pktmbuf_mtod(mbufs[0], struct rte_ether_hdr *) + 1);
 
-	if (vec->attr_valid)
-		vec->port = em_get_dst_port(qconf, mbufs[0], mbufs[0]->port);
-
 	for (i = 0, j = 1; i < vec->nb_elem; i++, j++) {
 		if (j < vec->nb_elem)
 			rte_prefetch0(rte_pktmbuf_mtod(mbufs[j],
 						       struct rte_ether_hdr *) +
 				      1);
-		mbufs[i]->port =
-			em_get_dst_port(qconf, mbufs[i], mbufs[i]->port);
-		process_packet(mbufs[i], &mbufs[i]->port);
-		event_vector_attr_validate(vec, mbufs[i]);
+		dst_ports[i] = em_get_dst_port(qconf, mbufs[i],
+					       attr_valid ? vec->port :
+							    mbufs[i]->port);
 	}
+	j = RTE_ALIGN_FLOOR(vec->nb_elem, FWDSTEP);
+
+	for (i = 0; i != j; i += FWDSTEP)
+		processx4_step3(&vec->mbufs[i], &dst_ports[i]);
+	for (; i < vec->nb_elem; i++)
+		process_packet(vec->mbufs[i], &dst_ports[i]);
+
+	process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_SEQUENTIAL_H__ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index 3fe38aada0..e21817c36b 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -103,27 +103,6 @@ process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
 }
 #endif
 
-static inline void
-event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
-{
-	/* l3fwd application only changes mbuf port while processing */
-	if (vec->attr_valid && (vec->port != mbuf->port))
-		vec->attr_valid = 0;
-}
-
-static inline void
-event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
-{
-	if (vec->attr_valid) {
-		vec->queue = txq;
-	} else {
-		int i;
-
-		for (i = 0; i < vec->nb_elem; i++)
-			rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], txq);
-	}
-}
-
 static inline uint16_t
 filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
 		   uint16_t nb_pkts)
-- 
2.25.1


^ permalink raw reply	[flat|nested] 41+ messages in thread
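
With event_vector_attr_validate() and event_vector_txq_set() removed, every
event-mode path now funnels through process_event_vector(), consuming the
dst_port array that the processing stage filled in. Its body is introduced
earlier in the series and is not part of this excerpt; the following is a
plausible model of what the call does, inferred from the helpers visible
above (filter_bad_packets(), process_dst_port()). It is an assumption-laden
sketch, not the series' actual code:

	static inline void
	process_event_vector_model(struct rte_event_vector *vec,
				   uint16_t *dst_port)
	{
		uint16_t port, i;

		/* Drop BAD_PORT entries before looking for a common port. */
		vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port,
						  vec->nb_elem);

		port = process_dst_port(dst_port, vec->nb_elem);
		if (port == BAD_PORT) {
			/* Mixed destinations: route each mbuf individually. */
			vec->attr_valid = 0;
			for (i = 0; i < vec->nb_elem; i++) {
				vec->mbufs[i]->port = dst_port[i];
				rte_event_eth_tx_adapter_txq_set(vec->mbufs[i],
								 0);
			}
		} else {
			/* Uniform destination: one attribute covers all. */
			vec->attr_valid = 1;
			vec->port = port;
			vec->queue = 0;
		}
	}

A model along these lines would also explain why the explicit
event_vector_txq_set(events[i].vec, 0) calls disappear from the
L3FWD_EVENT_TX_DIRECT branches: queue selection moves into the shared helper.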

* Re: [PATCH v6 1/5] examples/l3fwd: fix port group mask generation
  2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
                             ` (3 preceding siblings ...)
  2022-10-25 16:05           ` [PATCH v6 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
@ 2022-10-31 14:52           ` Thomas Monjalon
  4 siblings, 0 replies; 41+ messages in thread
From: Thomas Monjalon @ 2022-10-31 14:52 UTC (permalink / raw)
  To: Pavan Nikhilesh
  Cc: jerinj, David Christensen, stable, dev, stable, Shijith Thotton

25/10/2022 18:05, pbhagavatula@marvell.com:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Fix port group mask generation in altivec, vec_any_eq returns
> 0 or 1 while port_groupx4 expects comparison mask result.
> 
> Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Shijith Thotton <sthotton@marvell.com>

Series applied, thanks.

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2022-10-31 14:53 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-29  9:44 [PATCH 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
2022-08-29  9:44 ` [PATCH 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-08-29  9:44 ` [PATCH 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-08-29  9:44 ` [PATCH 4/5] examples/l3fwd: use em " pbhagavatula
2022-08-29  9:44 ` [PATCH 5/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-09-02  9:18 ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
2022-09-02  9:18   ` [PATCH v2 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-09-02  9:18   ` [PATCH v2 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-09-02  9:18   ` [PATCH v2 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-09-02  9:18   ` [PATCH v2 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
2022-09-08 18:33   ` [PATCH v2 1/5] examples/l3fwd: fix port group mask generation David Christensen
2022-09-09  5:56     ` [EXT] " Pavan Nikhilesh Bhagavatula
2022-09-11 18:12   ` [PATCH v3 " pbhagavatula
2022-09-11 18:12     ` [PATCH v3 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-09-11 18:12     ` [PATCH v3 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-09-11 18:12     ` [PATCH v3 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-10-07 20:03       ` [EXT] " Shijith Thotton
2022-09-11 18:12     ` [PATCH v3 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
2022-10-07 20:01       ` [EXT] " Shijith Thotton
2022-10-11  9:08     ` [PATCH v4 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
2022-10-11  9:08       ` [PATCH v4 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-10-11  9:08       ` [PATCH v4 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-10-11  9:08       ` [PATCH v4 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-10-11  9:08       ` [PATCH v4 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
2022-10-11 10:12       ` [PATCH v5 1/5] examples/l3fwd: fix port group mask generation pbhagavatula
2022-10-11 10:12         ` [PATCH v5 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-10-17 12:06           ` [EXT] " Shijith Thotton
2022-10-11 10:12         ` [PATCH v5 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-10-17 12:06           ` [EXT] " Shijith Thotton
2022-10-11 10:12         ` [PATCH v5 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-10-17 12:06           ` [EXT] " Shijith Thotton
2022-10-11 10:12         ` [PATCH v5 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
2022-10-12  8:57           ` [EXT] " Shijith Thotton
2022-10-17 12:05         ` [EXT] [PATCH v5 1/5] examples/l3fwd: fix port group mask generation Shijith Thotton
2022-10-20 16:15           ` Pavan Nikhilesh Bhagavatula
2022-10-25 16:05         ` [PATCH v6 " pbhagavatula
2022-10-25 16:05           ` [PATCH v6 2/5] examples/l3fwd: split processing and send stages pbhagavatula
2022-10-25 16:05           ` [PATCH v6 3/5] examples/l3fwd: use lpm vector path for event vector pbhagavatula
2022-10-25 16:05           ` [PATCH v6 4/5] examples/l3fwd: fix event vector processing in fib pbhagavatula
2022-10-25 16:05           ` [PATCH v6 5/5] examples/l3fwd: use em vector path for event vector pbhagavatula
2022-10-31 14:52           ` [PATCH v6 1/5] examples/l3fwd: fix port group mask generation Thomas Monjalon
