DPDK patches and discussions
* [PATCH] examples/ipsec-secgw: add support of NEON with poll mode
@ 2022-05-24  9:57 Rahul Bhansali
  2022-05-24 23:00 ` Konstantin Ananyev
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-05-24  9:57 UTC (permalink / raw)
  To: dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang; +Cc: jerinj, Rahul Bhansali

This adds support for NEON-based LPM lookup along with
multi-packet processing for burst send in packet routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by up to ~8% and inbound performance increased by
up to ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 4 files changed, 734 insertions(+)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h

diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 25255e053c..038c4669f5 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@
 #include "parser.h"
 #include "sad.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;
 
 #define MAX_JUMBO_PKT_LEN  9600
@@ -96,6 +100,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };
 
+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
 
 #define CMD_LINE_OPT_CONFIG		"config"
@@ -561,9 +571,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}
 
+#if defined __ARM_NEON
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }
 
 static inline void
@@ -1390,6 +1407,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;
 
 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }
 
@@ -1852,6 +1871,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));
 
 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");
 
diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append Ethernet header and read destination IPv4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process a single packet for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	int32_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets will
+	 * have port ID in the SA
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			dst_port[i] = hop[lpm_pkts++];
+		}
+		if (dst_port[i] == -1)
+			dst_port[i] = BAD_PORT;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..39dddcd1e3
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+
+#define FWDSTEP		4
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we have made a final decision about the packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {         \
+		(lp)[0]++;                           \
+	} else {                                     \
+		(dlp) = (dcp)[idx];                  \
+		(lp) = (pn) + (idx);                 \
+		(lp)[0] = 1;                         \
+	}                                            \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuilt 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated element. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				(p[i] + RTE_ETHER_HDR_LEN + 1);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain <a, b, c, d>, dp2 <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into the prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+	int32_t v;
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straight away.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* Large packet is present, need to send
+				 * individual packets with fragment
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}
 
+#if defined __ARM_NEON
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }
-- 
2.25.1


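A non-obvious detail in ipsec_neon.h above: val_eth[port] is a 16-byte vector whose
bytes 0-5 hold the destination MAC and bytes 6-11 the source MAC for that port.
processx4_step3()/process_packet() load the first 16 bytes of the frame, take val_eth[]
for the new header but copy lane 3 (bytes 12-15, i.e. the EtherType and the two bytes
that follow it) back from the original frame, then store the result, so both MAC
addresses are rewritten with a single vector store. A scalar sketch of the same effect
(illustrative only, not part of the patch; the helper name is made up):

#include <string.h>

/*
 * Scalar equivalent of the NEON lane-3 trick: overwrite dst MAC (6 B)
 * and src MAC (6 B) in place; the EtherType and the two bytes after it
 * stay untouched, which is what preserving lane 3 achieves above.
 */
static inline void
rewrite_macs(void *eth_hdr, const void *val_eth_port)
{
	memcpy(eth_hdr, val_eth_port, 12);
}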

* Re: [PATCH] examples/ipsec-secgw: add support of NEON with poll mode
  2022-05-24  9:57 [PATCH] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
@ 2022-05-24 23:00 ` Konstantin Ananyev
  2022-05-25 11:03   ` [EXT] " Rahul Bhansali
  2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
  2 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2022-05-24 23:00 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang; +Cc: jerinj

24/05/2022 10:57, Rahul Bhansali wrote:
> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by upto ~8% and inbound performance increased by
> upto ~6%.


Interesting. A good bunch of this code looks like a dup from l3fwd:
grouping, processx4_step?, etc.
Would it be possible to move the duplicated code into some common place,
so it can be used by both examples?


* RE: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with poll mode
  2022-05-24 23:00 ` Konstantin Ananyev
@ 2022-05-25 11:03   ` Rahul Bhansali
  2022-05-27 11:44     ` Konstantin Ananyev
  0 siblings, 1 reply; 26+ messages in thread
From: Rahul Bhansali @ 2022-05-25 11:03 UTC (permalink / raw)
  To: Konstantin Ananyev, dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang
  Cc: Jerin Jacob Kollanukkaran



> -----Original Message-----
> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Sent: Wednesday, May 25, 2022 4:30 AM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Radu Nicolau
> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with
> poll mode
> 
> 24/05/2022 10:57, Rahul Bhansali wrote:
> > This adds the support of NEON based lpm lookup along with multi packet
> > processing for burst send in packets routing.
> >
> > Performance impact:
> > On cn10k, with poll mode inline protocol, outbound performance
> > increased by upto ~8% and inbound performance increased by upto ~6%.
> 
> 
> Interesting, good bunch of code looks like a dup from l3fwd:
> grouping, precessx4_step?, etc.

Yes, the NEON logic is taken from l3fwd as a reference, with some modifications as per
the requirements of the ipsec-secgw example.

> Would it be possible to move dup code into some common place,
> so it can be used by both examples?
processx4_step... has additional processing for the Ethernet header, inline vs non-inline
packet LPM lookup, IP checksum, etc. Even if we separate that out into code common with
l3fwd, we get lower performance, as those additional things would have to be done
separately again, under certain conditions, for individual packets.

For the grouping-specific port_groupx4() only, we can have it in a common place. If it is
worthwhile, I can make changes accordingly. Do let me know.
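For illustration only (the header name and exact contents below are assumptions, not part
of this patch), such a shared helper could even be an architecture-neutral scalar version
of the grouping, usable by both examples:

/* hypothetical examples/common/pkt_group.h */
#include <stdint.h>

/*
 * Count runs of equal destination ports: pnum[i] is the number of
 * consecutive packets, starting at index i, that go to dst_port[i].
 * Only run-start entries are filled; the send loop only reads those,
 * stepping as: k = pnum[i]; send burst of k; i += k.
 */
static inline void
port_group_scalar(const uint16_t *dst_port, uint16_t *pnum, int nb)
{
	int i = 0;

	while (i < nb) {
		int j = i + 1;

		while (j < nb && dst_port[j] == dst_port[i])
			j++;
		pnum[i] = (uint16_t)(j - i);
		i = j;
	}
}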

> 
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> >   examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
> >   examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
> >   examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
> >   examples/ipsec-secgw/ipsec_worker.c   |   9 +
> >   4 files changed, 734 insertions(+)
> >   create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
> >   create mode 100644 examples/ipsec-secgw/ipsec_neon.h
> >
> > diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-
> secgw/ipsec-secgw.c
> > index 25255e053c..038c4669f5 100644
> > --- a/examples/ipsec-secgw/ipsec-secgw.c
> > +++ b/examples/ipsec-secgw/ipsec-secgw.c
> > @@ -56,6 +56,10 @@
> >   #include "parser.h"
> >   #include "sad.h"
> >
> > +#if defined(__ARM_NEON)
> > +#include "ipsec_lpm_neon.h"
> > +#endif
> > +
> >   volatile bool force_quit;
> >
> >   #define MAX_JUMBO_PKT_LEN  9600
> > @@ -96,6 +100,12 @@ struct ethaddr_info
> ethaddr_tbl[RTE_MAX_ETHPORTS] = {
> >   	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
> >   };
> >
> > +/*
> > + * To hold ethernet header per port, which will be applied
> > + * to outgoing packets.
> > + */
> > +xmm_t val_eth[RTE_MAX_ETHPORTS];
> > +
> >   struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
> >
> >   #define CMD_LINE_OPT_CONFIG		"config"
> > @@ -561,9 +571,16 @@ process_pkts(struct lcore_conf *qconf, struct
> rte_mbuf **pkts,
> >   			process_pkts_outbound(&qconf->outbound, &traffic);
> >   	}
> >
> > +#if defined __ARM_NEON
> > +	/* Neon optimized packet routing */
> > +	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> > +			 qconf->outbound.ipv4_offloads, true);
> > +	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> > +#else
> >   	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> >   		    qconf->outbound.ipv4_offloads, true);
> >   	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> > +#endif
> >   }
> >
> >   static inline void
> > @@ -1390,6 +1407,8 @@ add_dst_ethaddr(uint16_t port, const struct
> rte_ether_addr *addr)
> >   		return -EINVAL;
> >
> >   	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> > +			    (struct rte_ether_addr *)(val_eth + port));
> >   	return 0;
> >   }
> >
> > @@ -1852,6 +1871,12 @@ port_init(uint16_t portid, uint64_t
> req_rx_offloads, uint64_t req_tx_offloads)
> >   			portid, rte_strerror(-ret));
> >
> >   	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> > +
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> > +			    (struct rte_ether_addr *)(val_eth + portid));
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> > +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> > +
> >   	print_ethaddr("Address: ", &ethaddr);
> >   	printf("\n");
> >
> > diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-
> secgw/ipsec_lpm_neon.h
> > new file mode 100644
> > index 0000000000..959a5a8666
> > --- /dev/null
> > +++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
> > @@ -0,0 +1,213 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell.
> > + */
> > +
> > +#ifndef __IPSEC_LPM_NEON_H__
> > +#define __IPSEC_LPM_NEON_H__
> > +
> > +#include <arm_neon.h>
> > +#include "ipsec_neon.h"
> > +
> > +/*
> > + * Append ethernet header and read destination IPV4 addresses from 4
> mbufs.
> > + */
> > +static inline void
> > +processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
> > +		uint64_t *inline_flag)
> > +{
> > +	struct rte_ipv4_hdr *ipv4_hdr;
> > +	struct rte_ether_hdr *eth_hdr;
> > +	int32_t dst[FWDSTEP];
> > +	int i;
> > +
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
> > +
> 	RTE_ETHER_HDR_LEN);
> > +		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
> > +		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
> > +
> > +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +
> > +		/* Fetch destination IPv4 address */
> > +		dst[i] = ipv4_hdr->dst_addr;
> > +		*inline_flag |= pkt[i]->ol_flags &
> RTE_MBUF_F_TX_SEC_OFFLOAD;
> > +	}
> > +
> > +	dip[0] = vld1q_s32(dst);
> > +}
> > +
> > +/*
> > + * Lookup into LPM for destination port.
> > + */
> > +static inline void
> > +processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
> > +		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
> > +{
> > +	uint32_t next_hop;
> > +	rte_xmm_t dst;
> > +	uint8_t i;
> > +
> > +	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
> > +
> > +	/* If all 4 packets are non-inline */
> > +	if (!inline_flag) {
> > +		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
> > +				 BAD_PORT);
> > +		/* get rid of unused upper 16 bit for each dport. */
> > +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> > +		return;
> > +	}
> > +
> > +	/* Inline and non-inline packets */
> > +	dst.x = dip;
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
> > +			dprt[i] = (uint16_t) (((next_hop &
> > +						RTE_LPM_LOOKUP_SUCCESS)
> != 0)
> > +						? next_hop : BAD_PORT);
> > +
> > +		} else {
> > +			dprt[i] = (uint16_t) ((rte_lpm_lookup(
> > +						(struct rte_lpm *)rt_ctx,
> > +						 dst.u32[i], &next_hop) == 0)
> > +						? next_hop : BAD_PORT);
> > +		}
> > +	}
> > +}
> > +
> > +/*
> > + * Process single packets for destination port.
> > + */
> > +static inline void
> > +process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
> > +		   uint16_t *dst_port)
> > +{
> > +	struct rte_ether_hdr *eth_hdr;
> > +	struct rte_ipv4_hdr *ipv4_hdr;
> > +	uint32_t next_hop;
> > +	uint32_t dst_ip;
> > +
> > +	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> > +
> 	RTE_ETHER_HDR_LEN);
> > +	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
> > +	pkt->l2_len = RTE_ETHER_HDR_LEN;
> > +
> > +	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +		next_hop = get_hop_for_offload_pkt(pkt, 0);
> > +		*dst_port = (uint16_t) (((next_hop &
> > +					  RTE_LPM_LOOKUP_SUCCESS) != 0)
> > +					  ? next_hop : BAD_PORT);
> > +	} else {
> > +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> > +		*dst_port = (uint16_t) ((rte_lpm_lookup(
> > +						(struct rte_lpm *)rt_ctx,
> > +						dst_ip, &next_hop) == 0)
> > +						? next_hop : BAD_PORT);
> > +	}
> > +}
> > +
> > +/*
> > + * Buffer optimized handling of IPv6 packets.
> > + */
> > +static inline void
> > +route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
> > +{
> > +	uint8_t dst_ip6[MAX_PKT_BURST][16];
> > +	int32_t dst_port[MAX_PKT_BURST];
> > +	struct rte_ether_hdr *eth_hdr;
> > +	struct rte_ipv6_hdr *ipv6_hdr;
> > +	int32_t hop[MAX_PKT_BURST];
> > +	struct rte_mbuf *pkt;
> > +	uint8_t lpm_pkts = 0;
> > +	int32_t i;
> > +
> > +	if (nb_rx == 0)
> > +		return;
> > +
> > +	/* Need to do an LPM lookup for non-inline packets. Inline packets will
> > +	 * have port ID in the SA
> > +	 */
> > +
> > +	for (i = 0; i < nb_rx; i++) {
> > +		pkt = pkts[i];
> > +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> > +
> 	RTE_ETHER_HDR_LEN);
> > +		pkt->l2_len = RTE_ETHER_HDR_LEN;
> > +		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
> > +
> > +		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
> > +			/* Security offload not enabled. So an LPM lookup is
> > +			 * required to get the hop
> > +			 */
> > +			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> > +			memcpy(&dst_ip6[lpm_pkts][0],
> > +					ipv6_hdr->dst_addr, 16);
> > +			lpm_pkts++;
> > +		}
> > +	}
> > +
> > +	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
> > +				  hop, lpm_pkts);
> > +
> > +	lpm_pkts = 0;
> > +
> > +	for (i = 0; i < nb_rx; i++) {
> > +		pkt = pkts[i];
> > +		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +			/* Read hop from the SA */
> > +			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
> > +		} else {
> > +			/* Need to use hop returned by lookup */
> > +			dst_port[i] = hop[lpm_pkts++];
> > +		}
> > +		if (dst_port[i] == -1)
> > +			dst_port[i] = BAD_PORT;
> > +	}
> > +
> > +	/* Send packets */
> > +	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
> > +}
> > +
> > +/*
> > + * Buffer optimized handling of IPv4 packets.
> > + */
> > +static inline void
> > +route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
> > +		 uint64_t tx_offloads, bool ip_cksum)
> > +{
> > +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> > +	const int32_t m = nb_rx % FWDSTEP;
> > +	uint16_t dst_port[MAX_PKT_BURST];
> > +	uint64_t inline_flag = 0;
> > +	int32x4_t dip;
> > +	int32_t i;
> > +
> > +	if (nb_rx == 0)
> > +		return;
> > +
> > +	for (i = 0; i != k; i += FWDSTEP) {
> > +		processx4_step1(&pkts[i], &dip, &inline_flag);
> > +		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
> > +				&dst_port[i]);
> > +	}
> > +
> > +	/* Classify last up to 3 packets one by one */
> > +	switch (m) {
> > +	case 3:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +		i++;
> > +		/* fallthrough */
> > +	case 2:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +		i++;
> > +		/* fallthrough */
> > +	case 1:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +	}
> > +
> > +	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
> > +}
> > +
> > +#endif /* __IPSEC_LPM_NEON_H__ */
> > diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
> > new file mode 100644
> > index 0000000000..39dddcd1e3
> > --- /dev/null
> > +++ b/examples/ipsec-secgw/ipsec_neon.h
> > @@ -0,0 +1,487 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell.
> > + */
> > +
> > +#ifndef _IPSEC_NEON_H_
> > +#define _IPSEC_NEON_H_
> > +
> > +#include "ipsec.h"
> > +
> > +#define FWDSTEP		4
> > +#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
> > +#define BAD_PORT	((uint16_t)-1)
> > +
> > +extern xmm_t val_eth[RTE_MAX_ETHPORTS];
> > +
> > +/*
> > + * Group consecutive packets with the same destination port into one burst.
> > + * To avoid extra latency this is done together with some other packet
> > + * processing, but after we made a final decision about packet's destination.
> > + * To do this we maintain:
> > + * pnum - array of number of consecutive packets with the same dest port for
> > + * each packet in the input burst.
> > + * lp - pointer to the last updated element in the pnum.
> > + * dlp - dest port value lp corresponds to.
> > + */
> > +
> > +#define	GRPSZ	(1 << FWDSTEP)
> > +#define	GRPMSK	(GRPSZ - 1)
> > +
> > +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> > +	if (likely((dlp) == (dcp)[(idx)])) {         \
> > +		(lp)[0]++;                           \
> > +	} else {                                     \
> > +		(dlp) = (dcp)[idx];                  \
> > +		(lp) = (pn) + (idx);                 \
> > +		(lp)[0] = 1;                         \
> > +	}                                            \
> > +} while (0)
> > +
> > +static const struct {
> > +	uint64_t pnum; /* prebuilt 4 values for pnum[]. */
> > +	int32_t  idx;  /* index for new last updated element. */
> > +	uint16_t lpv;  /* add value to the last updated element. */
> > +} gptbl[GRPSZ] = {
> > +	{
> > +		/* 0: a != b, b != c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100010001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 1: a == b, b != c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100010002),
> > +		.idx = 4,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 2: a != b, b == c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100020001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 3: a == b, b == c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100020003),
> > +		.idx = 4,
> > +		.lpv = 2,
> > +	},
> > +	{
> > +		/* 4: a != b, b != c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200010001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 5: a == b, b != c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200010002),
> > +		.idx = 4,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 6: a != b, b == c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200030001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 7: a == b, b == c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200030004),
> > +		.idx = 4,
> > +		.lpv = 3,
> > +	},
> > +	{
> > +		/* 8: a != b, b != c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100010001),
> > +		.idx = 3,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 9: a == b, b != c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100010002),
> > +		.idx = 3,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 0xa: a != b, b == c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100020001),
> > +		.idx = 3,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xb: a == b, b == c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100020003),
> > +		.idx = 3,
> > +		.lpv = 2,
> > +	},
> > +	{
> > +		/* 0xc: a != b, b != c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300010001),
> > +		.idx = 2,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xd: a == b, b != c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300010002),
> > +		.idx = 2,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 0xe: a != b, b == c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300040001),
> > +		.idx = 1,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xf: a == b, b == c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300040005),
> > +		.idx = 0,
> > +		.lpv = 4,
> > +	},
> > +};
> > +
> > +
> > +/*
> > + * Update source and destination MAC addresses in the ethernet header.
> > + */
> > +static inline void
> > +processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
> > +		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
> > +{
> > +	uint32x4_t te[FWDSTEP];
> > +	uint32x4_t ve[FWDSTEP];
> > +	uint32_t *p[FWDSTEP];
> > +	struct rte_mbuf *pkt;
> > +	uint8_t i;
> > +
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		pkt = pkts[i];
> > +
> > +		/* Check if it is a large packet */
> > +		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> > +			*l_pkt |= 1;
> > +
> > +		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
> > +		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
> > +		te[i] = vld1q_u32(p[i]);
> > +
> > +		/* Update last 4 bytes */
> > +		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
> > +		vst1q_u32(p[i], ve[i]);
> > +
> > +		if (ip_cksum) {
> > +			struct rte_ipv4_hdr *ip;
> > +
> > +			pkt->ol_flags |= tx_offloads;
> > +
> > +			ip = (struct rte_ipv4_hdr *)
> > +				(p[i] + RTE_ETHER_HDR_LEN + 1);
> > +			ip->hdr_checksum = 0;
> > +
> > +			/* calculate IPv4 cksum in SW */
> > +			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> > +				ip->hdr_checksum = rte_ipv4_cksum(ip);
> > +		}
> > +
> > +	}
> > +}
> > +
> > +/*
> > + * Group consecutive packets with the same destination port in bursts of 4.
> > + * Suppose we have array of destination ports:
> > + * dst_port[] = {a, b, c, d, e, ... }
> > + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> > + * We do 4 comparisons at once and the result is a 4-bit mask.
> > + * This mask is used as an index into the prebuilt array of pnum values.
> > + */
> > +static inline uint16_t *
> > +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> > +	     uint16x8_t dp2)
> > +{
> > +	union {
> > +		uint16_t u16[FWDSTEP + 1];
> > +		uint64_t u64;
> > +	} *pnum = (void *)pn;
> > +
> > +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> > +	int32_t v;
> > +
> > +	dp1 = vceqq_u16(dp1, dp2);
> > +	dp1 = vandq_u16(dp1, mask);
> > +	v = vaddvq_u16(dp1);
> > +
> > +	/* update last port counter. */
> > +	lp[0] += gptbl[v].lpv;
> > +	rte_compiler_barrier();
> > +
> > +	/* if dest port value has changed. */
> > +	if (v != GRPMSK) {
> > +		pnum->u64 = gptbl[v].pnum;
> > +		pnum->u16[FWDSTEP] = 1;
> > +		lp = pnum->u16 + gptbl[v].idx;
> > +	}
> > +
> > +	return lp;
> > +}
> > +
> > +/**
> > + * Process single packet:
> > + * Update source and destination MAC addresses in the ethernet header.
> > + */
> > +static inline void
> > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
> > +	       bool ip_cksum, uint8_t *l_pkt)
> > +{
> > +	struct rte_ether_hdr *eth_hdr;
> > +	uint32x4_t te, ve;
> > +
> > +	/* Check if it is a large packet */
> > +	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> > +		*l_pkt |= 1;
> > +
> > +	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
> > +
> > +	te = vld1q_u32((uint32_t *)eth_hdr);
> > +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> > +
> > +	ve = vcopyq_laneq_u32(ve, 3, te, 3);
> > +	vst1q_u32((uint32_t *)eth_hdr, ve);
> > +
> > +	if (ip_cksum) {
> > +		struct rte_ipv4_hdr *ip;
> > +
> > +		pkt->ol_flags |= tx_offloads;
> > +
> > +		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +		ip->hdr_checksum = 0;
> > +
> > +		/* calculate IPv4 cksum in SW */
> > +		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> > +			ip->hdr_checksum = rte_ipv4_cksum(ip);
> > +	}
> > +}
> > +
> > +static inline void
> > +send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
> > +{
> > +	uint8_t proto;
> > +	uint32_t i;
> > +
> > +	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
> > +	for (i = 0; i < num; i++)
> > +		send_single_packet(m[i], port, proto);
> > +}
> > +
> > +static inline void
> > +send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
> > +{
> > +	unsigned int lcoreid = rte_lcore_id();
> > +	struct lcore_conf *qconf;
> > +	uint32_t len, j, n;
> > +
> > +	qconf = &lcore_conf[lcoreid];
> > +
> > +	len = qconf->tx_mbufs[port].len;
> > +
> > +	/*
> > +	 * If TX buffer for that queue is empty, and we have enough packets,
> > +	 * then send them straightway.
> > +	 */
> > +	if (num >= MAX_TX_BURST && len == 0) {
> > +		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
> > +		core_stats_update_tx(n);
> > +		if (unlikely(n < num)) {
> > +			do {
> > +				rte_pktmbuf_free(m[n]);
> > +			} while (++n < num);
> > +		}
> > +		return;
> > +	}
> > +
> > +	/*
> > +	 * Put packets into TX buffer for that queue.
> > +	 */
> > +
> > +	n = len + num;
> > +	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
> > +
> > +	j = 0;
> > +	switch (n % FWDSTEP) {
> > +	while (j < n) {
> > +		case 0:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 3:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 2:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 1:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +		}
> > +	}
> > +
> > +	len += n;
> > +
> > +	/* enough pkts to be sent */
> > +	if (unlikely(len == MAX_PKT_BURST)) {
> > +
> > +		send_burst(qconf, MAX_PKT_BURST, port);
> > +
> > +		/* copy rest of the packets into the TX buffer. */
> > +		len = num - n;
> > +		if (len == 0)
> > +			goto exit;
> > +
> > +		j = 0;
> > +		switch (len % FWDSTEP) {
> > +		while (j < len) {
> > +			case 0:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 3:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 2:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 1:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +		}
> > +		}
> > +	}
> > +
> > +exit:
> > +	qconf->tx_mbufs[port].len = len;
> > +}
> > +
> > +/**
> > + * Send packets burst to the ports in dst_port array
> > + */
> > +static __rte_always_inline void
> > +send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
> > +		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
> > +{
> > +	unsigned int lcoreid = rte_lcore_id();
> > +	uint16_t pnum[MAX_PKT_BURST + 1];
> > +	uint8_t l_pkt = 0;
> > +	uint16_t dlp, *lp;
> > +	int i = 0, k;
> > +
> > +	/*
> > +	 * Finish packet processing and group consecutive
> > +	 * packets with the same destination port.
> > +	 */
> > +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> > +
> > +	if (k != 0) {
> > +		uint16x8_t dp1, dp2;
> > +
> > +		lp = pnum;
> > +		lp[0] = 1;
> > +
> > +		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
> > +
> > +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> > +		dp1 = vld1q_u16(dst_port);
> > +
> > +		for (i = FWDSTEP; i != k; i += FWDSTEP) {
> > +			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
> > +					ip_cksum, &l_pkt);
> > +
> > +			/*
> > +			 * dp2:
> > +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> > +			 */
> > +			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
> > +			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> > +
> > +			/*
> > +			 * dp1:
> > +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> > +			 */
> > +			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
> > +		}
> > +
> > +		/*
> > +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> > +		 */
> > +		dp2 = vextq_u16(dp1, dp1, 1);
> > +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> > +		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> > +
> > +		/*
> > +		 * remove values added by the last repeated
> > +		 * dst port.
> > +		 */
> > +		lp[0]--;
> > +		dlp = dst_port[i - 1];
> > +	} else {
> > +		/* set dlp and lp to the never used values. */
> > +		dlp = BAD_PORT - 1;
> > +		lp = pnum + MAX_PKT_BURST;
> > +	}
> > +
> > +	/* Process up to last 3 packets one by one. */
> > +	switch (nb_rx % FWDSTEP) {
> > +	case 3:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +		i++;
> > +		/* fallthrough */
> > +	case 2:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +		i++;
> > +		/* fallthrough */
> > +	case 1:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +	}
> > +
> > +	/*
> > +	 * Send packets out, through destination port.
> > +	 * Consecutive packets with the same destination port
> > +	 * are already grouped together.
> > +	 * If destination port for the packet equals BAD_PORT,
> > +	 * then free the packet without sending it out.
> > +	 */
> > +	for (i = 0; i < nb_rx; i += k) {
> > +
> > +		uint16_t pn;
> > +
> > +		pn = dst_port[i];
> > +		k = pnum[i];
> > +
> > +		if (likely(pn != BAD_PORT)) {
> > +			if (l_pkt)
> > +				/* Large packet is present, need to send
> > +				 * individual packets with fragment
> > +				 */
> > +				send_packets(pkts + i, pn, k, is_ipv4);
> > +			else
> > +				send_packetsx4(pkts + i, pn, k);
> > +
> > +		} else {
> > +			free_pkts(&pkts[i], k);
> > +			if (is_ipv4)
> > +				core_statistics[lcoreid].lpm4.miss++;
> > +			else
> > +				core_statistics[lcoreid].lpm6.miss++;
> > +		}
> > +	}
> > +}
> > +
> > +#endif /* _IPSEC_NEON_H_ */
> > diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
> > index e1d4e3d864..803157d8ee 100644
> > --- a/examples/ipsec-secgw/ipsec_worker.c
> > +++ b/examples/ipsec-secgw/ipsec_worker.c
> > @@ -12,6 +12,10 @@
> >   #include "ipsec-secgw.h"
> >   #include "ipsec_worker.h"
> >
> > +#if defined(__ARM_NEON)
> > +#include "ipsec_lpm_neon.h"
> > +#endif
> > +
> >   struct port_drv_mode_data {
> >   	struct rte_security_session *sess;
> >   	struct rte_security_ctx *ctx;
> > @@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
> >   				v6_num = ip6.num;
> >   			}
> >
> > +#if defined __ARM_NEON
> > +			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
> > +			route6_pkts_neon(rt6_ctx, v6, v6_num);
> > +#else
> >   			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
> >   			route6_pkts(rt6_ctx, v6, v6_num);
> > +#endif
> >   		}
> >   	}
> >   }


^ permalink raw reply	[flat|nested] 26+ messages in thread
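
A note on the Ethernet-header rewrite used by processx4_step3() and process_packet() in the patch above: val_eth[] holds a 16-byte per-port template (destination MAC in bytes 0-5, source MAC in bytes 6-11), so both MAC addresses can be written with a single 128-bit store while lane 3 of the original header (the EtherType plus the first two bytes of the IP header) is preserved. Below is a minimal scalar sketch of the same idea, not the patch's implementation: port_eth_tmpl[] is a hypothetical stand-in for val_eth[], and plain memcpy() replaces the NEON load/merge/store.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical per-port header template, laid out like val_eth[]:
 * bytes 0..5 = destination MAC, bytes 6..11 = source MAC, 12..15 unused.
 */
static uint8_t port_eth_tmpl[2][16];

static void
rewrite_eth_header(uint8_t *pkt, uint16_t dst_port)
{
	uint8_t tmp[16];

	/* Start from the 16-byte template of the destination port ... */
	memcpy(tmp, port_eth_tmpl[dst_port], 16);
	/* ... keep the packet's own last 4 bytes (EtherType plus the first
	 * 2 bytes of the IP header), which is what the NEON code preserves
	 * as lane 3 of the 128-bit register ...
	 */
	memcpy(tmp + 12, pkt + 12, 4);
	/* ... and write all 16 bytes back in one go. */
	memcpy(pkt, tmp, 16);
}

int main(void)
{
	uint8_t pkt[64] = {0};

	pkt[12] = 0x08;				/* EtherType 0x0800 (IPv4) */
	memset(port_eth_tmpl[1], 0xaa, 6);	/* fake dst MAC for port 1 */
	memset(port_eth_tmpl[1] + 6, 0xbb, 6);	/* fake src MAC for port 1 */

	rewrite_eth_header(pkt, 1);
	printf("dst %02x.. src %02x.. type %02x%02x\n",
	       pkt[0], pkt[6], pkt[12], pkt[13]);
	return 0;
}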

* Re: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with poll mode
  2022-05-25 11:03   ` [EXT] " Rahul Bhansali
@ 2022-05-27 11:44     ` Konstantin Ananyev
  0 siblings, 0 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2022-05-27 11:44 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang
  Cc: Jerin Jacob Kollanukkaran


> 
> 
>> -----Original Message-----
>> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>> Sent: Wednesday, May 25, 2022 4:30 AM
>> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Radu Nicolau
>> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
>> <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
>> Subject: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with
>> poll mode
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> 24/05/2022 10:57, Rahul Bhansali writes:
>>> This adds the support of NEON based lpm lookup along with multi packet
>>> processing for burst send in packets routing.
>>>
>>> Performance impact:
>>> On cn10k, with poll mode inline protocol, outbound performance
>>> increased by upto ~8% and inbound performance increased by upto ~6%.
>>
>>
>> Interesting, a good bunch of this code looks like a dup from l3fwd:
>> grouping, processx4_step?, etc.
> 
> Yes, the NEON logic is taken from l3fwd as a reference, with some modifications as per
> the requirements of the ipsec example.
> 
>> Would it be possible to move dup code into some common place,
>> so it can be used by both examples?
> processx4_step... has some additional processing: Ethernet header handling, LPM lookup for
> inline vs non-inline packets, IP checksum, etc. Even if we separate it out into common code
> with l3fwd, we get less performance, as the additional things have to be done separately
> again under certain conditions for individual packets.

Ok.

> 
> For the grouping-specific port_groupx4() only, we can have it in a common place. If it is
> worthwhile, I can make the changes accordingly. Do let me know.


I think that would be really good.
Probably some other apps (or even libs) can benefit from it too -
it seems generic enough to me.
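
For reference, the grouping being discussed computes a run length per group start: pnum[i] is the number of consecutive packets from position i onwards that go to the same destination port, and the send loop then walks dst_port[]/pnum[] group by group. Below is a minimal scalar sketch of that result, assuming the pnum[] semantics used by send_multi_pkts() in the patch; the NEON/gptbl code derives the same run lengths four entries at a time from a 4-bit comparison mask.

#include <stdint.h>
#include <stdio.h>

static void
group_ports(const uint16_t *dst_port, uint16_t *pnum, int nb)
{
	int i = 0;

	while (i < nb) {
		int j = i + 1;

		/* count consecutive packets destined to the same port */
		while (j < nb && dst_port[j] == dst_port[i])
			j++;
		pnum[i] = (uint16_t)(j - i);
		i = j;
	}
}

int main(void)
{
	const uint16_t dst_port[8] = {1, 1, 1, 2, 2, 7, 7, 7};
	uint16_t pnum[8] = {0};
	int i, k;

	group_ports(dst_port, pnum, 8);

	/* consume the groups the same way the send loop does */
	for (i = 0; i < 8; i += k) {
		k = pnum[i];
		printf("send %d packet(s) to port %u\n", k,
		       (unsigned int)dst_port[i]);
	}
	return 0;
}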



^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-05-24  9:57 [PATCH] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-05-24 23:00 ` Konstantin Ananyev
@ 2022-06-17  7:42 ` Rahul Bhansali
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
                     ` (2 more replies)
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
  2 siblings, 3 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-17  7:42 UTC (permalink / raw)
  To: dev, Ruifeng Wang; +Cc: jerinj, Rahul Bhansali

This makes the packet grouping function common, so
that other examples can use it as needed.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: New patch to address review comment.

 examples/common/neon_common.h |  50 ++++++++++++
 examples/common/pkt_group.h   | 139 ++++++++++++++++++++++++++++++++++
 examples/l3fwd/Makefile       |   5 +-
 examples/l3fwd/l3fwd.h        |   2 -
 examples/l3fwd/l3fwd_common.h | 129 +------------------------------
 examples/l3fwd/l3fwd_neon.h   |  43 +----------
 examples/meson.build          |   2 +-
 7 files changed, 198 insertions(+), 172 deletions(-)
 create mode 100644 examples/common/neon_common.h
 create mode 100644 examples/common/pkt_group.h

diff --git a/examples/common/neon_common.h b/examples/common/neon_common.h
new file mode 100644
index 0000000000..f01b5ab6bc
--- /dev/null
+++ b/examples/common/neon_common.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _NEON_COMMON_H_
+#define _NEON_COMMON_H_
+
+#include "pkt_group.h"
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into the prebuilt array of pnum values.
+ */
+static inline uint16_t *
+neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+		  uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+	int32_t v;
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+#endif /* _NEON_COMMON_H_ */
diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
new file mode 100644
index 0000000000..8b26d9380f
--- /dev/null
+++ b/examples/common/pkt_group.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PKT_GROUP_H_
+#define _PKT_GROUP_H_
+
+#define FWDSTEP	4
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {         \
+		(lp)[0]++;                           \
+	} else {                                     \
+		(dlp) = (dcp)[idx];                  \
+		(lp) = (pn) + (idx);                 \
+		(lp)[0] = 1;                         \
+	}                                            \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuilt 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated element. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+#endif /* _PKT_GROUP_H_ */
diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile
index 8efe6378e2..8dbe85c2e6 100644
--- a/examples/l3fwd/Makefile
+++ b/examples/l3fwd/Makefile
@@ -22,6 +22,7 @@ shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 # Added for 'rte_eth_link_to_str()'
@@ -38,10 +39,10 @@ endif
 endif

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 8a52c90755..40b5f32a9e 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -44,8 +44,6 @@
 /* Used to mark destination port as 'invalid'. */
 #define	BAD_PORT ((uint16_t)-1)

-#define FWDSTEP	4
-
 /* replace first 12B of the ethernet header. */
 #define	MASK_ETH 0x3f

diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
index 8e4c27218f..224b1c08e8 100644
--- a/examples/l3fwd/l3fwd_common.h
+++ b/examples/l3fwd/l3fwd_common.h
@@ -7,6 +7,8 @@
 #ifndef _L3FWD_COMMON_H_
 #define _L3FWD_COMMON_H_

+#include "pkt_group.h"
+
 #ifdef DO_RFC_1812_CHECKS

 #define	IPV4_MIN_VER_IHL	0x45
@@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
 #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
 #endif /* DO_RFC_1812_CHECKS */

-/*
- * We group consecutive packets with the same destination port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-static const struct {
-	uint64_t pnum; /* prebuild 4 values for pnum[]. */
-	int32_t  idx;  /* index for new last updated element. */
-	uint16_t lpv;  /* add value to the last updated element. */
-} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-};
-
 static __rte_always_inline void
 send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
 		uint32_t num)
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index e3d33a5229..5fa765b640 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -7,6 +7,7 @@
 #define _L3FWD_NEON_H_

 #include "l3fwd.h"
+#include "neon_common.h"
 #include "l3fwd_common.h"

 /*
@@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
 			&dst_port[3], pkt[3]->packet_type);
 }

-/*
- * Group consecutive packets with the same destination port in bursts of 4.
- * Suppose we have array of destination ports:
- * dst_port[] = {a, b, c, d,, e, ... }
- * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
- * We doing 4 comparisons at once and the result is 4 bit mask.
- * This mask is used as an index into prebuild array of pnum values.
- */
-static inline uint16_t *
-port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
-	     uint16x8_t dp2)
-{
-	union {
-		uint16_t u16[FWDSTEP + 1];
-		uint64_t u64;
-	} *pnum = (void *)pn;
-
-	int32_t v;
-	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
-
-	dp1 = vceqq_u16(dp1, dp2);
-	dp1 = vandq_u16(dp1, mask);
-	v = vaddvq_u16(dp1);
-
-	/* update last port counter. */
-	lp[0] += gptbl[v].lpv;
-	rte_compiler_barrier();
-
-	/* if dest port value has changed. */
-	if (v != GRPMSK) {
-		pnum->u64 = gptbl[v].pnum;
-		pnum->u16[FWDSTEP] = 1;
-		lp = pnum->u16 + gptbl[v].idx;
-	}
-
-	return lp;
-}
-
 /**
  * Process one packet:
  * Update source and destination MAC addresses in the ethernet header.
@@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
 			 */
 			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
-			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

 			/*
 			 * dp1:
@@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
 		 */
 		dp2 = vextq_u16(dp1, dp1, 1);
 		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
-		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);

 		/*
 		 * remove values added by the last repeated
diff --git a/examples/meson.build b/examples/meson.build
index 78de0e1f37..81e93799f2 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -97,7 +97,7 @@ foreach example: examples
     ldflags = default_ldflags

     ext_deps = []
-    includes = [include_directories(example)]
+    includes = [include_directories(example, 'common')]
     deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
     subdir(example)

--
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread
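
To see how the 4-bit comparison mask indexes the gptbl[] table that this patch moves into examples/common/pkt_group.h, here is a small standalone demo. It is a sketch, not part of the patch, and it assumes an aarch64 build with NEON and -I examples/common on the include path; the dst_port values are made up. For {7, 7, 7, 9, 9} the mask is 0xb (a == b, b == c, c != d, d == e), selecting pnum 0x0002000100020003, idx 3, lpv 2.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <arm_neon.h>

#include "pkt_group.h"	/* the new common header from this patch */

int main(void)
{
	/* dst_port[] = {7, 7, 7, 9, 9}: a == b, b == c, c != d, d == e */
	uint16_t dst_port[FWDSTEP * 2 + 1] = {7, 7, 7, 9, 9};
	uint16x8_t dp1 = vld1q_u16(&dst_port[0]);	/* <a, b, c, d, ...> */
	uint16x8_t dp2 = vld1q_u16(&dst_port[1]);	/* <b, c, d, e, ...> */
	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
	int32_t v;

	/* same mask computation as neon_port_groupx4() */
	dp1 = vceqq_u16(dp1, dp2);
	dp1 = vandq_u16(dp1, mask);
	v = vaddvq_u16(dp1);

	printf("mask v = 0x%x -> pnum = 0x%016" PRIx64 ", idx = %" PRId32 ", lpv = %u\n",
	       v, gptbl[v].pnum, gptbl[v].idx, (unsigned int)gptbl[v].lpv);
	return 0;
}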

* [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
@ 2022-06-17  7:42   ` Rahul Bhansali
  2022-06-17  7:51     ` Rahul Bhansali
                       ` (2 more replies)
  2022-06-17  7:50   ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
  2022-06-20  7:49   ` [EXT] " Akhil Goyal
  2 siblings, 3 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-17  7:42 UTC (permalink / raw)
  To: dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang; +Cc: jerinj, Rahul Bhansali

This adds support for NEON-based LPM lookup along with
multi-packet processing for burst send in packet routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by up to ~8% and inbound performance increased by
up to ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: Removed the NEON packet grouping function and used
the common one.

 examples/ipsec-secgw/Makefile         |   5 +-
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 5 files changed, 571 insertions(+), 2 deletions(-)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h

diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
index 89af54bd37..ffe232774d 100644
--- a/examples/ipsec-secgw/Makefile
+++ b/examples/ipsec-secgw/Makefile
@@ -36,6 +36,7 @@ shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
@@ -53,10 +54,10 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -Wno-address-of-packed-member

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 4d8a4a71b8..b650668305 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@
 #include "parser.h"
 #include "sad.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;

 #define MAX_JUMBO_PKT_LEN  9600
@@ -100,6 +104,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };

+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];

 #define CMD_LINE_OPT_CONFIG		"config"
@@ -568,9 +578,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}

+#if defined __ARM_NEON
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }

 static inline void
@@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;

 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }

@@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));

 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");

diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process single packets for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	int32_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets will
+	 * have port ID in the SA
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			dst_port[i] = hop[lpm_pkts++];
+		}
+		if (dst_port[i] == -1)
+			dst_port[i] = BAD_PORT;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..0f72219ed0
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+#include "neon_common.h"
+
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				(p[i] + RTE_ETHER_HDR_LEN + 1);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* Large packet is present, need to send
+				 * individual packets with fragment
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}

+#if defined __ARM_NEON
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }
--
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
@ 2022-06-17  7:50   ` Rahul Bhansali
  2022-06-20 23:13     ` Konstantin Ananyev
  2022-06-20  7:49   ` [EXT] " Akhil Goyal
  2 siblings, 1 reply; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-17  7:50 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Ruifeng Wang
  Cc: Jerin Jacob Kollanukkaran, Konstantin Ananyev

CC: Konstantin Ananyev

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 1:13 PM
> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> <rbhansali@marvell.com>
> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
> 
> This will make the packet grouping function common, so that other examples
> can utilize as per need.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v2: New patch to address review comment.
> 
>  examples/common/neon_common.h |  50 ++++++++++++
>  examples/common/pkt_group.h   | 139
> ++++++++++++++++++++++++++++++++++
>  examples/l3fwd/Makefile       |   5 +-
>  examples/l3fwd/l3fwd.h        |   2 -
>  examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>  examples/l3fwd/l3fwd_neon.h   |  43 +----------
>  examples/meson.build          |   2 +-
>  7 files changed, 198 insertions(+), 172 deletions(-)  create mode 100644
> examples/common/neon_common.h  create mode 100644
> examples/common/pkt_group.h
> 
> diff --git a/examples/common/neon_common.h
> b/examples/common/neon_common.h new file mode 100644 index
> 0000000000..f01b5ab6bc
> --- /dev/null
> +++ b/examples/common/neon_common.h
> @@ -0,0 +1,50 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2018 Intel Corporation.
> + * Copyright(c) 2017-2018 Linaro Limited.
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _NEON_COMMON_H_
> +#define _NEON_COMMON_H_
> +
> +#include "pkt_group.h"
> +
> +/*
> + * Group consecutive packets with the same destination port in bursts of 4.
> + * Suppose we have array of destination ports:
> + * dst_port[] = {a, b, c, d,, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We doing 4 comparisons at once and the result is 4 bit mask.
> + * This mask is used as an index into prebuild array of pnum values.
> + */
> +static inline uint16_t *
> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +		  uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +	int32_t v;
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +	rte_compiler_barrier();
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +#endif /* _NEON_COMMON_H_ */
> diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
> new file mode 100644 index 0000000000..8b26d9380f
> --- /dev/null
> +++ b/examples/common/pkt_group.h
> @@ -0,0 +1,139 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2016-2018 Intel Corporation.
> + * Copyright(c) 2017-2018 Linaro Limited.
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _PKT_GROUP_H_
> +#define _PKT_GROUP_H_
> +
> +#define FWDSTEP	4
> +
> +/*
> + * Group consecutive packets with the same destination port into one burst.
> + * To avoid extra latency this is done together with some other packet
> + * processing, but after we made a final decision about packet's destination.
> + * To do this we maintain:
> + * pnum - array of number of consecutive packets with the same dest
> +port for
> + * each packet in the input burst.
> + * lp - pointer to the last updated element in the pnum.
> + * dlp - dest port value lp corresponds to.
> + */
> +
> +#define	GRPSZ	(1 << FWDSTEP)
> +#define	GRPMSK	(GRPSZ - 1)
> +
> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> +	if (likely((dlp) == (dcp)[(idx)])) {         \
> +		(lp)[0]++;                           \
> +	} else {                                     \
> +		(dlp) = (dcp)[idx];                  \
> +		(lp) = (pn) + (idx);                 \
> +		(lp)[0] = 1;                         \
> +	}                                            \
> +} while (0)
> +
> +static const struct {
> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> +	int32_t  idx;  /* index for new last updated element. */
> +	uint16_t lpv;  /* add value to the last updated element. */ }
> +gptbl[GRPSZ] = {
> +	{
> +		/* 0: a != b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 1: a == b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 2: a != b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 3: a == b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020003),
> +		.idx = 4,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 4: a != b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 5: a == b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 6: a != b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 7: a == b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030004),
> +		.idx = 4,
> +		.lpv = 3,
> +	},
> +	{
> +		/* 8: a != b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 9: a == b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010002),
> +		.idx = 3,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xa: a != b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xb: a == b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020003),
> +		.idx = 3,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 0xc: a != b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010001),
> +		.idx = 2,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xd: a == b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010002),
> +		.idx = 2,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xe: a != b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040001),
> +		.idx = 1,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xf: a == b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040005),
> +		.idx = 0,
> +		.lpv = 4,
> +	},
> +};
> +
> +#endif /* _PKT_GROUP_H_ */
> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
> 8efe6378e2..8dbe85c2e6 100644
> --- a/examples/l3fwd/Makefile
> +++ b/examples/l3fwd/Makefile
> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>  static: build/$(APP)-static
>  	ln -sf $(APP)-static build/$(APP)
> 
> +INCLUDES =-I../common
>  PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS += -O3
> $(shell $(PKGCONF) --cflags libdpdk)  # Added for 'rte_eth_link_to_str()'
> @@ -38,10 +39,10 @@ endif
>  endif
> 
>  build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_SHARED)
> 
>  build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_STATIC)
> 
>  build:
>  	@mkdir -p $@
> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
> 8a52c90755..40b5f32a9e 100644
> --- a/examples/l3fwd/l3fwd.h
> +++ b/examples/l3fwd/l3fwd.h
> @@ -44,8 +44,6 @@
>  /* Used to mark destination port as 'invalid'. */
>  #define	BAD_PORT ((uint16_t)-1)
> 
> -#define FWDSTEP	4
> -
>  /* replace first 12B of the ethernet header. */
>  #define	MASK_ETH 0x3f
> 
> diff --git a/examples/l3fwd/l3fwd_common.h
> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
> --- a/examples/l3fwd/l3fwd_common.h
> +++ b/examples/l3fwd/l3fwd_common.h
> @@ -7,6 +7,8 @@
>  #ifndef _L3FWD_COMMON_H_
>  #define _L3FWD_COMMON_H_
> 
> +#include "pkt_group.h"
> +
>  #ifdef DO_RFC_1812_CHECKS
> 
>  #define	IPV4_MIN_VER_IHL	0x45
> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t
> *dp, uint32_t ptype)
>  #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>  #endif /* DO_RFC_1812_CHECKS */
> 
> -/*
> - * We group consecutive packets with the same destination port into one burst.
> - * To avoid extra latency this is done together with some other packet
> - * processing, but after we made a final decision about packet's destination.
> - * To do this we maintain:
> - * pnum - array of number of consecutive packets with the same dest port for
> - * each packet in the input burst.
> - * lp - pointer to the last updated element in the pnum.
> - * dlp - dest port value lp corresponds to.
> - */
> -
> -#define	GRPSZ	(1 << FWDSTEP)
> -#define	GRPMSK	(GRPSZ - 1)
> -
> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> -	if (likely((dlp) == (dcp)[(idx)])) {             \
> -		(lp)[0]++;                                   \
> -	} else {                                         \
> -		(dlp) = (dcp)[idx];                          \
> -		(lp) = (pn) + (idx);                         \
> -		(lp)[0] = 1;                                 \
> -	}                                                \
> -} while (0)
> -
> -static const struct {
> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> -	int32_t  idx;  /* index for new last updated element. */
> -	uint16_t lpv;  /* add value to the last updated element. */
> -} gptbl[GRPSZ] = {
> -	{
> -		/* 0: a != b, b != c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100010001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 1: a == b, b != c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100010002),
> -		.idx = 4,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 2: a != b, b == c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100020001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 3: a == b, b == c, c != d, d != e */
> -		.pnum = UINT64_C(0x0001000100020003),
> -		.idx = 4,
> -		.lpv = 2,
> -	},
> -	{
> -		/* 4: a != b, b != c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200010001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 5: a == b, b != c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200010002),
> -		.idx = 4,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 6: a != b, b == c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200030001),
> -		.idx = 4,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 7: a == b, b == c, c == d, d != e */
> -		.pnum = UINT64_C(0x0001000200030004),
> -		.idx = 4,
> -		.lpv = 3,
> -	},
> -	{
> -		/* 8: a != b, b != c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100010001),
> -		.idx = 3,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 9: a == b, b != c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100010002),
> -		.idx = 3,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 0xa: a != b, b == c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100020001),
> -		.idx = 3,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xb: a == b, b == c, c != d, d == e */
> -		.pnum = UINT64_C(0x0002000100020003),
> -		.idx = 3,
> -		.lpv = 2,
> -	},
> -	{
> -		/* 0xc: a != b, b != c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300010001),
> -		.idx = 2,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xd: a == b, b != c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300010002),
> -		.idx = 2,
> -		.lpv = 1,
> -	},
> -	{
> -		/* 0xe: a != b, b == c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300040001),
> -		.idx = 1,
> -		.lpv = 0,
> -	},
> -	{
> -		/* 0xf: a == b, b == c, c == d, d == e */
> -		.pnum = UINT64_C(0x0002000300040005),
> -		.idx = 0,
> -		.lpv = 4,
> -	},
> -};
> -
>  static __rte_always_inline void
>  send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>  		uint32_t num)
> diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
> index e3d33a5229..5fa765b640 100644
> --- a/examples/l3fwd/l3fwd_neon.h
> +++ b/examples/l3fwd/l3fwd_neon.h
> @@ -7,6 +7,7 @@
>  #define _L3FWD_NEON_H_
> 
>  #include "l3fwd.h"
> +#include "neon_common.h"
>  #include "l3fwd_common.h"
> 
>  /*
> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
> uint16_t dst_port[FWDSTEP])
>  			&dst_port[3], pkt[3]->packet_type);
>  }
> 
> -/*
> - * Group consecutive packets with the same destination port in bursts of 4.
> - * Suppose we have array of destination ports:
> - * dst_port[] = {a, b, c, d,, e, ... }
> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> - * We doing 4 comparisons at once and the result is 4 bit mask.
> - * This mask is used as an index into prebuild array of pnum values.
> - */
> -static inline uint16_t *
> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> -	     uint16x8_t dp2)
> -{
> -	union {
> -		uint16_t u16[FWDSTEP + 1];
> -		uint64_t u64;
> -	} *pnum = (void *)pn;
> -
> -	int32_t v;
> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> -
> -	dp1 = vceqq_u16(dp1, dp2);
> -	dp1 = vandq_u16(dp1, mask);
> -	v = vaddvq_u16(dp1);
> -
> -	/* update last port counter. */
> -	lp[0] += gptbl[v].lpv;
> -	rte_compiler_barrier();
> -
> -	/* if dest port value has changed. */
> -	if (v != GRPMSK) {
> -		pnum->u64 = gptbl[v].pnum;
> -		pnum->u16[FWDSTEP] = 1;
> -		lp = pnum->u16 + gptbl[v].idx;
> -	}
> -
> -	return lp;
> -}
> -
>  /**
>   * Process one packet:
>   * Update source and destination MAC addresses in the ethernet header.
> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
> rte_mbuf **pkts_burst,
>  			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>  			 */
>  			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> dp2);
> 
>  			/*
>  			 * dp1:
> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
> rte_mbuf **pkts_burst,
>  		 */
>  		dp2 = vextq_u16(dp1, dp1, 1);
>  		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> 
>  		/*
>  		 * remove values added by the last repeated diff --git
> a/examples/meson.build b/examples/meson.build index
> 78de0e1f37..81e93799f2 100644
> --- a/examples/meson.build
> +++ b/examples/meson.build
> @@ -97,7 +97,7 @@ foreach example: examples
>      ldflags = default_ldflags
> 
>      ext_deps = []
> -    includes = [include_directories(example)]
> +    includes = [include_directories(example, 'common')]
>      deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>      subdir(example)
> 
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
@ 2022-06-17  7:51     ` Rahul Bhansali
  2022-06-21 12:55     ` Akhil Goyal
  2022-06-23  8:46     ` Zhang, Roy Fan
  2 siblings, 0 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-17  7:51 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang
  Cc: Jerin Jacob Kollanukkaran, Konstantin Ananyev

CC: Konstantin Ananyev

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 1:13 PM
> To: dev@dpdk.org; Radu Nicolau <radu.nicolau@intel.com>; Akhil Goyal
> <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> <rbhansali@marvell.com>
> Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll
> mode
> 
> This adds the support of NEON based lpm lookup along with multi packet
> processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance increased by
> up to ~8% and inbound performance increased by up to ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v2: Removed Neon packet grouping function and used the common
> one.
> 
>  examples/ipsec-secgw/Makefile         |   5 +-
>  examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
>  examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
>  examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
>  examples/ipsec-secgw/ipsec_worker.c   |   9 +
>  5 files changed, 571 insertions(+), 2 deletions(-)  create mode 100644
> examples/ipsec-secgw/ipsec_lpm_neon.h
>  create mode 100644 examples/ipsec-secgw/ipsec_neon.h
> 
> diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
> index 89af54bd37..ffe232774d 100644
> --- a/examples/ipsec-secgw/Makefile
> +++ b/examples/ipsec-secgw/Makefile
> @@ -36,6 +36,7 @@ shared: build/$(APP)-shared
>  static: build/$(APP)-static
>  	ln -sf $(APP)-static build/$(APP)
> 
> +INCLUDES =-I../common
>  PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS += -O3
> $(shell $(PKGCONF) --cflags libdpdk)  LDFLAGS_SHARED = $(shell $(PKGCONF) --
> libs libdpdk) @@ -53,10 +54,10 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
> CFLAGS += -Wno-address-of-packed-member
> 
>  build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_SHARED)
> 
>  build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> +$(LDFLAGS_STATIC)
> 
>  build:
>  	@mkdir -p $@
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-
> secgw/ipsec-secgw.c
> index 4d8a4a71b8..b650668305 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -56,6 +56,10 @@
>  #include "parser.h"
>  #include "sad.h"
> 
> +#if defined(__ARM_NEON)
> +#include "ipsec_lpm_neon.h"
> +#endif
> +
>  volatile bool force_quit;
> 
>  #define MAX_JUMBO_PKT_LEN  9600
> @@ -100,6 +104,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS]
> = {
>  	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }  };
> 
> +/*
> + * To hold ethernet header per port, which will be applied
> + * to outgoing packets.
> + */
> +xmm_t val_eth[RTE_MAX_ETHPORTS];
> +
>  struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
> 
>  #define CMD_LINE_OPT_CONFIG		"config"
> @@ -568,9 +578,16 @@ process_pkts(struct lcore_conf *qconf, struct
> rte_mbuf **pkts,
>  			process_pkts_outbound(&qconf->outbound, &traffic);
>  	}
> 
> +#if defined __ARM_NEON
> +	/* Neon optimized packet routing */
> +	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> +			 qconf->outbound.ipv4_offloads, true);
> +	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> +#else
>  	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
>  		    qconf->outbound.ipv4_offloads, true);
>  	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> +#endif
>  }
> 
>  static inline void
> @@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct
> rte_ether_addr *addr)
>  		return -EINVAL;
> 
>  	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> +			    (struct rte_ether_addr *)(val_eth + port));
>  	return 0;
>  }
> 
> @@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads,
> uint64_t req_tx_offloads)
>  			portid, rte_strerror(-ret));
> 
>  	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> +
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> +			    (struct rte_ether_addr *)(val_eth + portid));
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> +
>  	print_ethaddr("Address: ", &ethaddr);
>  	printf("\n");
> 
> diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-
> secgw/ipsec_lpm_neon.h
> new file mode 100644
> index 0000000000..959a5a8666
> --- /dev/null
> +++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
> @@ -0,0 +1,213 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef __IPSEC_LPM_NEON_H__
> +#define __IPSEC_LPM_NEON_H__
> +
> +#include <arm_neon.h>
> +#include "ipsec_neon.h"
> +
> +/*
> + * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
> + */
> +static inline void
> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
> +		uint64_t *inline_flag)
> +{
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	struct rte_ether_hdr *eth_hdr;
> +	int32_t dst[FWDSTEP];
> +	int i;
> +
> +	for (i = 0; i < FWDSTEP; i++) {
> +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
> +
> 	RTE_ETHER_HDR_LEN);
> +		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
> +		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
> +
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +
> +		/* Fetch destination IPv4 address */
> +		dst[i] = ipv4_hdr->dst_addr;
> +		*inline_flag |= pkt[i]->ol_flags &
> RTE_MBUF_F_TX_SEC_OFFLOAD;
> +	}
> +
> +	dip[0] = vld1q_s32(dst);
> +}
> +
> +/*
> + * Lookup into LPM for destination port.
> + */
> +static inline void
> +processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
> +		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP]) {
> +	uint32_t next_hop;
> +	rte_xmm_t dst;
> +	uint8_t i;
> +
> +	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
> +
> +	/* If all 4 packets are non-inline */
> +	if (!inline_flag) {
> +		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
> +				 BAD_PORT);
> +		/* get rid of unused upper 16 bit for each dport. */
> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> +		return;
> +	}
> +
> +	/* Inline and non-inline packets */
> +	dst.x = dip;
> +	for (i = 0; i < FWDSTEP; i++) {
> +		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
> +			dprt[i] = (uint16_t) (((next_hop &
> +						RTE_LPM_LOOKUP_SUCCESS)
> != 0)
> +						? next_hop : BAD_PORT);
> +
> +		} else {
> +			dprt[i] = (uint16_t) ((rte_lpm_lookup(
> +						(struct rte_lpm *)rt_ctx,
> +						 dst.u32[i], &next_hop) == 0)
> +						? next_hop : BAD_PORT);
> +		}
> +	}
> +}
> +
> +/*
> + * Process single packets for destination port.
> + */
> +static inline void
> +process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
> +		   uint16_t *dst_port)
> +{
> +	struct rte_ether_hdr *eth_hdr;
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	uint32_t next_hop;
> +	uint32_t dst_ip;
> +
> +	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> +
> 	RTE_ETHER_HDR_LEN);
> +	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
> +	pkt->l2_len = RTE_ETHER_HDR_LEN;
> +
> +	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +		next_hop = get_hop_for_offload_pkt(pkt, 0);
> +		*dst_port = (uint16_t) (((next_hop &
> +					  RTE_LPM_LOOKUP_SUCCESS) != 0)
> +					  ? next_hop : BAD_PORT);
> +	} else {
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +		*dst_port = (uint16_t) ((rte_lpm_lookup(
> +						(struct rte_lpm *)rt_ctx,
> +						dst_ip, &next_hop) == 0)
> +						? next_hop : BAD_PORT);
> +	}
> +}
> +
> +/*
> + * Buffer optimized handling of IPv6 packets.
> + */
> +static inline void
> +route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int
> +nb_rx) {
> +	uint8_t dst_ip6[MAX_PKT_BURST][16];
> +	int32_t dst_port[MAX_PKT_BURST];
> +	struct rte_ether_hdr *eth_hdr;
> +	struct rte_ipv6_hdr *ipv6_hdr;
> +	int32_t hop[MAX_PKT_BURST];
> +	struct rte_mbuf *pkt;
> +	uint8_t lpm_pkts = 0;
> +	int32_t i;
> +
> +	if (nb_rx == 0)
> +		return;
> +
> +	/* Need to do an LPM lookup for non-inline packets. Inline packets will
> +	 * have port ID in the SA
> +	 */
> +
> +	for (i = 0; i < nb_rx; i++) {
> +		pkt = pkts[i];
> +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> +
> 	RTE_ETHER_HDR_LEN);
> +		pkt->l2_len = RTE_ETHER_HDR_LEN;
> +		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
> +
> +		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
> +			/* Security offload not enabled. So an LPM lookup is
> +			 * required to get the hop
> +			 */
> +			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> +			memcpy(&dst_ip6[lpm_pkts][0],
> +					ipv6_hdr->dst_addr, 16);
> +			lpm_pkts++;
> +		}
> +	}
> +
> +	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
> +				  hop, lpm_pkts);
> +
> +	lpm_pkts = 0;
> +
> +	for (i = 0; i < nb_rx; i++) {
> +		pkt = pkts[i];
> +		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +			/* Read hop from the SA */
> +			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
> +		} else {
> +			/* Need to use hop returned by lookup */
> +			dst_port[i] = hop[lpm_pkts++];
> +		}
> +		if (dst_port[i] == -1)
> +			dst_port[i] = BAD_PORT;
> +	}
> +
> +	/* Send packets */
> +	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false); }
> +
> +/*
> + * Buffer optimized handling of IPv4 packets.
> + */
> +static inline void
> +route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
> +		 uint64_t tx_offloads, bool ip_cksum) {
> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	const int32_t m = nb_rx % FWDSTEP;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	uint64_t inline_flag = 0;
> +	int32x4_t dip;
> +	int32_t i;
> +
> +	if (nb_rx == 0)
> +		return;
> +
> +	for (i = 0; i != k; i += FWDSTEP) {
> +		processx4_step1(&pkts[i], &dip, &inline_flag);
> +		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
> +				&dst_port[i]);
> +	}
> +
> +	/* Classify last up to 3 packets one by one */
> +	switch (m) {
> +	case 3:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +		i++;
> +		/* fallthrough */
> +	case 2:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +		i++;
> +		/* fallthrough */
> +	case 1:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +	}
> +
> +	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true); }
> +
> +#endif /* __IPSEC_LPM_NEON_H__ */
> diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-
> secgw/ipsec_neon.h
> new file mode 100644
> index 0000000000..0f72219ed0
> --- /dev/null
> +++ b/examples/ipsec-secgw/ipsec_neon.h
> @@ -0,0 +1,321 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _IPSEC_NEON_H_
> +#define _IPSEC_NEON_H_
> +
> +#include "ipsec.h"
> +#include "neon_common.h"
> +
> +#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
> +#define BAD_PORT	((uint16_t)-1)
> +
> +extern xmm_t val_eth[RTE_MAX_ETHPORTS];
> +
> +/*
> + * Update source and destination MAC addresses in the ethernet header.
> + */
> +static inline void
> +processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t
> dst_port[FWDSTEP],
> +		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt) {
> +	uint32x4_t te[FWDSTEP];
> +	uint32x4_t ve[FWDSTEP];
> +	uint32_t *p[FWDSTEP];
> +	struct rte_mbuf *pkt;
> +	uint8_t i;
> +
> +	for (i = 0; i < FWDSTEP; i++) {
> +		pkt = pkts[i];
> +
> +		/* Check if it is a large packet */
> +		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> +			*l_pkt |= 1;
> +
> +		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
> +		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
> +		te[i] = vld1q_u32(p[i]);
> +
> +		/* Update last 4 bytes */
> +		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
> +		vst1q_u32(p[i], ve[i]);
> +
> +		if (ip_cksum) {
> +			struct rte_ipv4_hdr *ip;
> +
> +			pkt->ol_flags |= tx_offloads;
> +
> +			ip = (struct rte_ipv4_hdr *)
> +				(p[i] + RTE_ETHER_HDR_LEN + 1);
> +			ip->hdr_checksum = 0;
> +
> +			/* calculate IPv4 cksum in SW */
> +			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> +				ip->hdr_checksum = rte_ipv4_cksum(ip);
> +		}
> +
> +	}
> +}
> +
> +/**
> + * Process single packet:
> + * Update source and destination MAC addresses in the ethernet header.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
> +	       bool ip_cksum, uint8_t *l_pkt)
> +{
> +	struct rte_ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	/* Check if it is a large packet */
> +	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> +		*l_pkt |= 1;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +	ve = vcopyq_laneq_u32(ve, 3, te, 3);
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +
> +	if (ip_cksum) {
> +		struct rte_ipv4_hdr *ip;
> +
> +		pkt->ol_flags |= tx_offloads;
> +
> +		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip->hdr_checksum = 0;
> +
> +		/* calculate IPv4 cksum in SW */
> +		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> +			ip->hdr_checksum = rte_ipv4_cksum(ip);
> +	}
> +}
> +
> +static inline void
> +send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool
> +is_ipv4) {
> +	uint8_t proto;
> +	uint32_t i;
> +
> +	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
> +	for (i = 0; i < num; i++)
> +		send_single_packet(m[i], port, proto); }
> +
> +static inline void
> +send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num) {
> +	unsigned int lcoreid = rte_lcore_id();
> +	struct lcore_conf *qconf;
> +	uint32_t len, j, n;
> +
> +	qconf = &lcore_conf[lcoreid];
> +
> +	len = qconf->tx_mbufs[port].len;
> +
> +	/*
> +	 * If TX buffer for that queue is empty, and we have enough packets,
> +	 * then send them straightway.
> +	 */
> +	if (num >= MAX_TX_BURST && len == 0) {
> +		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
> +		core_stats_update_tx(n);
> +		if (unlikely(n < num)) {
> +			do {
> +				rte_pktmbuf_free(m[n]);
> +			} while (++n < num);
> +		}
> +		return;
> +	}
> +
> +	/*
> +	 * Put packets into TX buffer for that queue.
> +	 */
> +
> +	n = len + num;
> +	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
> +
> +	j = 0;
> +	switch (n % FWDSTEP) {
> +	while (j < n) {
> +		case 0:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 3:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 2:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 1:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +		}
> +	}
> +
> +	len += n;
> +
> +	/* enough pkts to be sent */
> +	if (unlikely(len == MAX_PKT_BURST)) {
> +
> +		send_burst(qconf, MAX_PKT_BURST, port);
> +
> +		/* copy rest of the packets into the TX buffer. */
> +		len = num - n;
> +		if (len == 0)
> +			goto exit;
> +
> +		j = 0;
> +		switch (len % FWDSTEP) {
> +		while (j < len) {
> +			case 0:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 3:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 2:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 1:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +		}
> +		}
> +	}
> +
> +exit:
> +	qconf->tx_mbufs[port].len = len;
> +}
> +
> +/**
> + * Send packets burst to the ports in dst_port array  */ static
> +__rte_always_inline void send_multi_pkts(struct rte_mbuf **pkts,
> +uint16_t dst_port[MAX_PKT_BURST],
> +		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4) {
> +	unsigned int lcoreid = rte_lcore_id();
> +	uint16_t pnum[MAX_PKT_BURST + 1];
> +	uint8_t l_pkt = 0;
> +	uint16_t dlp, *lp;
> +	int i = 0, k;
> +
> +	/*
> +	 * Finish packet processing and group consecutive
> +	 * packets with the same destination port.
> +	 */
> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +
> +	if (k != 0) {
> +		uint16x8_t dp1, dp2;
> +
> +		lp = pnum;
> +		lp[0] = 1;
> +
> +		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
> +
> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> +		dp1 = vld1q_u16(dst_port);
> +
> +		for (i = FWDSTEP; i != k; i += FWDSTEP) {
> +			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
> +					ip_cksum, &l_pkt);
> +
> +			/*
> +			 * dp2:
> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> +			 */
> +			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
> +			lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1,
> dp2);
> +
> +			/*
> +			 * dp1:
> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> +			 */
> +			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
> +		}
> +
> +		/*
> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> +		 */
> +		dp2 = vextq_u16(dp1, dp1, 1);
> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> +		lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> +
> +		/*
> +		 * remove values added by the last repeated
> +		 * dst port.
> +		 */
> +		lp[0]--;
> +		dlp = dst_port[i - 1];
> +	} else {
> +		/* set dlp and lp to the never used values. */
> +		dlp = BAD_PORT - 1;
> +		lp = pnum + MAX_PKT_BURST;
> +	}
> +
> +	/* Process up to last 3 packets one by one. */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +		i++;
> +		/* fallthrough */
> +	case 2:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +		i++;
> +		/* fallthrough */
> +	case 1:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +	}
> +
> +	/*
> +	 * Send packets out, through destination port.
> +	 * Consecutive packets with the same destination port
> +	 * are already grouped together.
> +	 * If destination port for the packet equals BAD_PORT,
> +	 * then free the packet without sending it out.
> +	 */
> +	for (i = 0; i < nb_rx; i += k) {
> +
> +		uint16_t pn;
> +
> +		pn = dst_port[i];
> +		k = pnum[i];
> +
> +		if (likely(pn != BAD_PORT)) {
> +			if (l_pkt)
> +				/* Large packet is present, need to send
> +				 * individual packets with fragment
> +				 */
> +				send_packets(pkts + i, pn, k, is_ipv4);
> +			else
> +				send_packetsx4(pkts + i, pn, k);
> +
> +		} else {
> +			free_pkts(&pkts[i], k);
> +			if (is_ipv4)
> +				core_statistics[lcoreid].lpm4.miss++;
> +			else
> +				core_statistics[lcoreid].lpm6.miss++;
> +		}
> +	}
> +}
> +
> +#endif /* _IPSEC_NEON_H_ */
> diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-
> secgw/ipsec_worker.c
> index e1d4e3d864..803157d8ee 100644
> --- a/examples/ipsec-secgw/ipsec_worker.c
> +++ b/examples/ipsec-secgw/ipsec_worker.c
> @@ -12,6 +12,10 @@
>  #include "ipsec-secgw.h"
>  #include "ipsec_worker.h"
> 
> +#if defined(__ARM_NEON)
> +#include "ipsec_lpm_neon.h"
> +#endif
> +
>  struct port_drv_mode_data {
>  	struct rte_security_session *sess;
>  	struct rte_security_ctx *ctx;
> @@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
>  				v6_num = ip6.num;
>  			}
> 
> +#if defined __ARM_NEON
> +			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
> +			route6_pkts_neon(rt6_ctx, v6, v6_num); #else
>  			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
>  			route6_pkts(rt6_ctx, v6, v6_num);
> +#endif
>  		}
>  	}
>  }
> --
> 2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-06-17  7:50   ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
@ 2022-06-20  7:49   ` Akhil Goyal
  2022-06-20 10:45     ` Thomas Monjalon
  2022-06-21 12:56     ` Akhil Goyal
  2 siblings, 2 replies; 26+ messages in thread
From: Akhil Goyal @ 2022-06-20  7:49 UTC (permalink / raw)
  To: dev, Ruifeng Wang, thomas, ferruh.yigit, andrew.rybchenko,
	rasland, Konstantin Ananyev, maxime.coquelin, david.marchand
  Cc: Jerin Jacob Kollanukkaran, Rahul Bhansali


> This will make the packet grouping function common, so
> that other examples can utilize as per need.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---

Adding more people for review.

@thomas@monjalon.net: Can this patch be taken in next-crypto as the patch is 
primarily for ipsec-secgw?

> Changes in v2: New patch to address review comment.
> 
>  examples/common/neon_common.h |  50 ++++++++++++
>  examples/common/pkt_group.h   | 139
> ++++++++++++++++++++++++++++++++++
>  examples/l3fwd/Makefile       |   5 +-
>  examples/l3fwd/l3fwd.h        |   2 -
>  examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>  examples/l3fwd/l3fwd_neon.h   |  43 +----------
>  examples/meson.build          |   2 +-
>  7 files changed, 198 insertions(+), 172 deletions(-)
>  create mode 100644 examples/common/neon_common.h
>  create mode 100644 examples/common/pkt_group.h


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [EXT] [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-20  7:49   ` [EXT] " Akhil Goyal
@ 2022-06-20 10:45     ` Thomas Monjalon
  2022-06-21 12:56     ` Akhil Goyal
  1 sibling, 0 replies; 26+ messages in thread
From: Thomas Monjalon @ 2022-06-20 10:45 UTC (permalink / raw)
  To: dev, Ruifeng Wang, ferruh.yigit, andrew.rybchenko, rasland,
	Konstantin Ananyev, maxime.coquelin, Akhil Goyal
  Cc: david.marchand, Jerin Jacob Kollanukkaran, Rahul Bhansali

20/06/2022 09:49, Akhil Goyal:
> @thomas@monjalon.net: Can this patch be taken in next-crypto as the patch is 
> primarily for ipsec-secgw?

Yes that's fine.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-17  7:50   ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
@ 2022-06-20 23:13     ` Konstantin Ananyev
  2022-06-21 16:50       ` [EXT] " Rahul Bhansali
  0 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2022-06-20 23:13 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Ruifeng Wang; +Cc: Jerin Jacob Kollanukkaran

17/06/2022 08:50, Rahul Bhansali wrote:
> CC: Konstantin Ananyev
> 
>> -----Original Message-----
>> From: Rahul Bhansali <rbhansali@marvell.com>
>> Sent: Friday, June 17, 2022 1:13 PM
>> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
>> <rbhansali@marvell.com>
>> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
>>
>> This will make the packet grouping function common, so that other examples
>> can utilize as per need.
>>
>> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
>> ---
>> Changes in v2: New patch to address review comment.
>>
>>   examples/common/neon_common.h |  50 ++++++++++++
>>   examples/common/pkt_group.h   | 139
>> ++++++++++++++++++++++++++++++++++
>>   examples/l3fwd/Makefile       |   5 +-
>>   examples/l3fwd/l3fwd.h        |   2 -
>>   examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>>   examples/l3fwd/l3fwd_neon.h   |  43 +----------
>>   examples/meson.build          |   2 +-
>>   7 files changed, 198 insertions(+), 172 deletions(-)  create mode 100644
>> examples/common/neon_common.h  create mode 100644
>> examples/common/pkt_group.h
>>
>> diff --git a/examples/common/neon_common.h
>> b/examples/common/neon_common.h new file mode 100644 index
>> 0000000000..f01b5ab6bc
>> --- /dev/null
>> +++ b/examples/common/neon_common.h
>> @@ -0,0 +1,50 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2016-2018 Intel Corporation.
>> + * Copyright(c) 2017-2018 Linaro Limited.
>> + * Copyright(C) 2022 Marvell.
>> + */
>> +
>> +#ifndef _NEON_COMMON_H_
>> +#define _NEON_COMMON_H_
>> +
>> +#include "pkt_group.h"
>> +
>> +/*
>> + * Group consecutive packets with the same destination port in bursts of 4.
>> + * Suppose we have array of destination ports:
>> + * dst_port[] = {a, b, c, d,, e, ... }
>> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>> + * We doing 4 comparisons at once and the result is 4 bit mask.
>> + * This mask is used as an index into prebuild array of pnum values.
>> + */
>> +static inline uint16_t *
>> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>> +		  uint16x8_t dp2)
>> +{
>> +	union {
>> +		uint16_t u16[FWDSTEP + 1];
>> +		uint64_t u64;
>> +	} *pnum = (void *)pn;
>> +
>> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>> +	int32_t v;
>> +
>> +	dp1 = vceqq_u16(dp1, dp2);
>> +	dp1 = vandq_u16(dp1, mask);
>> +	v = vaddvq_u16(dp1);
>> +
>> +	/* update last port counter. */
>> +	lp[0] += gptbl[v].lpv;
>> +	rte_compiler_barrier();
>> +
>> +	/* if dest port value has changed. */
>> +	if (v != GRPMSK) {
>> +		pnum->u64 = gptbl[v].pnum;
>> +		pnum->u16[FWDSTEP] = 1;
>> +		lp = pnum->u16 + gptbl[v].idx;
>> +	}
>> +
>> +	return lp;
>> +}

Thanks for the effort.
As I can see this function: port_groupx4() is nearly identical for all 3 
platforms: sse/neon/altivec (except of course built-in arch-specific 
intrinsics).
In fact, even comments are identical.
I wonder can we have something like:
examples/common/<arch>/port_group.h
and for each arch have port_groupx4(...) defined there?
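
For reference, a minimal sketch of how that split could look, assuming the
shared FWDSTEP/GRPSZ/GRPMSK/gptbl[] definitions stay in pkt_group.h (the file
name, include guard and includes below are illustrative only, not necessarily
what a respin would use):

	/* examples/common/neon/port_group.h -- sketch only */
	#ifndef _PORT_GROUP_H_
	#define _PORT_GROUP_H_

	#include <arm_neon.h>
	#include <rte_atomic.h>

	#include "pkt_group.h"

	/* Same body as neon_port_groupx4() above, just under an
	 * arch-neutral name so callers need no per-arch suffix. */
	static inline uint16_t *
	port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
		     uint16x8_t dp2)
	{
		union {
			uint16_t u16[FWDSTEP + 1];
			uint64_t u64;
		} *pnum = (void *)pn;

		uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
		int32_t v;

		/* compare neighbouring dest ports, build a 4-bit mask */
		dp1 = vceqq_u16(dp1, dp2);
		dp1 = vandq_u16(dp1, mask);
		v = vaddvq_u16(dp1);

		/* update last port counter. */
		lp[0] += gptbl[v].lpv;
		rte_compiler_barrier();

		/* if dest port value has changed. */
		if (v != GRPMSK) {
			pnum->u64 = gptbl[v].pnum;
			pnum->u16[FWDSTEP] = 1;
			lp = pnum->u16 + gptbl[v].idx;
		}

		return lp;
	}

	#endif /* _PORT_GROUP_H_ */

The sse and altivec variants would keep exactly the same body and comments and
only swap the compare/and/sum intrinsics, so each example just includes the
header for its target arch.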

>> +
>> +#endif /* _NEON_COMMON_H_ */
>> diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
>> new file mode 100644 index 0000000000..8b26d9380f
>> --- /dev/null
>> +++ b/examples/common/pkt_group.h
>> @@ -0,0 +1,139 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2016-2018 Intel Corporation.
>> + * Copyright(c) 2017-2018 Linaro Limited.
>> + * Copyright(C) 2022 Marvell.
>> + */
>> +
>> +#ifndef _PKT_GROUP_H_
>> +#define _PKT_GROUP_H_
>> +
>> +#define FWDSTEP	4
>> +
>> +/*
>> + * Group consecutive packets with the same destination port into one burst.
>> + * To avoid extra latency this is done together with some other packet
>> + * processing, but after we made a final decision about packet's destination.
>> + * To do this we maintain:
>> + * pnum - array of number of consecutive packets with the same dest
>> +port for
>> + * each packet in the input burst.
>> + * lp - pointer to the last updated element in the pnum.
>> + * dlp - dest port value lp corresponds to.
>> + */
>> +
>> +#define	GRPSZ	(1 << FWDSTEP)
>> +#define	GRPMSK	(GRPSZ - 1)
>> +
>> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>> +	if (likely((dlp) == (dcp)[(idx)])) {         \
>> +		(lp)[0]++;                           \
>> +	} else {                                     \
>> +		(dlp) = (dcp)[idx];                  \
>> +		(lp) = (pn) + (idx);                 \
>> +		(lp)[0] = 1;                         \
>> +	}                                            \
>> +} while (0)
>> +
>> +static const struct {
>> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>> +	int32_t  idx;  /* index for new last updated element. */
>> +	uint16_t lpv;  /* add value to the last updated element. */ }
>> +gptbl[GRPSZ] = {
>> +	{
>> +		/* 0: a != b, b != c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100010001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 1: a == b, b != c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100010002),
>> +		.idx = 4,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 2: a != b, b == c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100020001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 3: a == b, b == c, c != d, d != e */
>> +		.pnum = UINT64_C(0x0001000100020003),
>> +		.idx = 4,
>> +		.lpv = 2,
>> +	},
>> +	{
>> +		/* 4: a != b, b != c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200010001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 5: a == b, b != c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200010002),
>> +		.idx = 4,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 6: a != b, b == c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200030001),
>> +		.idx = 4,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 7: a == b, b == c, c == d, d != e */
>> +		.pnum = UINT64_C(0x0001000200030004),
>> +		.idx = 4,
>> +		.lpv = 3,
>> +	},
>> +	{
>> +		/* 8: a != b, b != c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100010001),
>> +		.idx = 3,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 9: a == b, b != c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100010002),
>> +		.idx = 3,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 0xa: a != b, b == c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100020001),
>> +		.idx = 3,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xb: a == b, b == c, c != d, d == e */
>> +		.pnum = UINT64_C(0x0002000100020003),
>> +		.idx = 3,
>> +		.lpv = 2,
>> +	},
>> +	{
>> +		/* 0xc: a != b, b != c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300010001),
>> +		.idx = 2,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xd: a == b, b != c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300010002),
>> +		.idx = 2,
>> +		.lpv = 1,
>> +	},
>> +	{
>> +		/* 0xe: a != b, b == c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300040001),
>> +		.idx = 1,
>> +		.lpv = 0,
>> +	},
>> +	{
>> +		/* 0xf: a == b, b == c, c == d, d == e */
>> +		.pnum = UINT64_C(0x0002000300040005),
>> +		.idx = 0,
>> +		.lpv = 4,
>> +	},
>> +};
>> +
>> +#endif /* _PKT_GROUP_H_ */
>> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
>> 8efe6378e2..8dbe85c2e6 100644
>> --- a/examples/l3fwd/Makefile
>> +++ b/examples/l3fwd/Makefile
>> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>>   static: build/$(APP)-static
>>   	ln -sf $(APP)-static build/$(APP)
>>
>> +INCLUDES =-I../common
>>   PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS += -O3
>> $(shell $(PKGCONF) --cflags libdpdk)  # Added for 'rte_eth_link_to_str()'
>> @@ -38,10 +39,10 @@ endif
>>   endif
>>
>>   build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>> +$(LDFLAGS_SHARED)
>>
>>   build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>> +$(LDFLAGS_STATIC)
>>
>>   build:
>>   	@mkdir -p $@
>> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
>> 8a52c90755..40b5f32a9e 100644
>> --- a/examples/l3fwd/l3fwd.h
>> +++ b/examples/l3fwd/l3fwd.h
>> @@ -44,8 +44,6 @@
>>   /* Used to mark destination port as 'invalid'. */
>>   #define	BAD_PORT ((uint16_t)-1)
>>
>> -#define FWDSTEP	4
>> -
>>   /* replace first 12B of the ethernet header. */
>>   #define	MASK_ETH 0x3f
>>
>> diff --git a/examples/l3fwd/l3fwd_common.h
>> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
>> --- a/examples/l3fwd/l3fwd_common.h
>> +++ b/examples/l3fwd/l3fwd_common.h
>> @@ -7,6 +7,8 @@
>>   #ifndef _L3FWD_COMMON_H_
>>   #define _L3FWD_COMMON_H_
>>
>> +#include "pkt_group.h"
>> +
>>   #ifdef DO_RFC_1812_CHECKS
>>
>>   #define	IPV4_MIN_VER_IHL	0x45
>> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t
>> *dp, uint32_t ptype)
>>   #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>>   #endif /* DO_RFC_1812_CHECKS */
>>
>> -/*
>> - * We group consecutive packets with the same destination port into one burst.
>> - * To avoid extra latency this is done together with some other packet
>> - * processing, but after we made a final decision about packet's destination.
>> - * To do this we maintain:
>> - * pnum - array of number of consecutive packets with the same dest port for
>> - * each packet in the input burst.
>> - * lp - pointer to the last updated element in the pnum.
>> - * dlp - dest port value lp corresponds to.
>> - */
>> -
>> -#define	GRPSZ	(1 << FWDSTEP)
>> -#define	GRPMSK	(GRPSZ - 1)
>> -
>> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>> -	if (likely((dlp) == (dcp)[(idx)])) {             \
>> -		(lp)[0]++;                                   \
>> -	} else {                                         \
>> -		(dlp) = (dcp)[idx];                          \
>> -		(lp) = (pn) + (idx);                         \
>> -		(lp)[0] = 1;                                 \
>> -	}                                                \
>> -} while (0)
>> -
>> -static const struct {
>> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>> -	int32_t  idx;  /* index for new last updated element. */
>> -	uint16_t lpv;  /* add value to the last updated element. */
>> -} gptbl[GRPSZ] = {
>> -	{
>> -		/* 0: a != b, b != c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100010001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 1: a == b, b != c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100010002),
>> -		.idx = 4,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 2: a != b, b == c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100020001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 3: a == b, b == c, c != d, d != e */
>> -		.pnum = UINT64_C(0x0001000100020003),
>> -		.idx = 4,
>> -		.lpv = 2,
>> -	},
>> -	{
>> -		/* 4: a != b, b != c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200010001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 5: a == b, b != c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200010002),
>> -		.idx = 4,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 6: a != b, b == c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200030001),
>> -		.idx = 4,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 7: a == b, b == c, c == d, d != e */
>> -		.pnum = UINT64_C(0x0001000200030004),
>> -		.idx = 4,
>> -		.lpv = 3,
>> -	},
>> -	{
>> -		/* 8: a != b, b != c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100010001),
>> -		.idx = 3,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 9: a == b, b != c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100010002),
>> -		.idx = 3,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 0xa: a != b, b == c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100020001),
>> -		.idx = 3,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xb: a == b, b == c, c != d, d == e */
>> -		.pnum = UINT64_C(0x0002000100020003),
>> -		.idx = 3,
>> -		.lpv = 2,
>> -	},
>> -	{
>> -		/* 0xc: a != b, b != c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300010001),
>> -		.idx = 2,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xd: a == b, b != c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300010002),
>> -		.idx = 2,
>> -		.lpv = 1,
>> -	},
>> -	{
>> -		/* 0xe: a != b, b == c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300040001),
>> -		.idx = 1,
>> -		.lpv = 0,
>> -	},
>> -	{
>> -		/* 0xf: a == b, b == c, c == d, d == e */
>> -		.pnum = UINT64_C(0x0002000300040005),
>> -		.idx = 0,
>> -		.lpv = 4,
>> -	},
>> -};
>> -
>>   static __rte_always_inline void
>>   send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>>   		uint32_t num)
>> diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
>> index e3d33a5229..5fa765b640 100644
>> --- a/examples/l3fwd/l3fwd_neon.h
>> +++ b/examples/l3fwd/l3fwd_neon.h
>> @@ -7,6 +7,7 @@
>>   #define _L3FWD_NEON_H_
>>
>>   #include "l3fwd.h"
>> +#include "neon_common.h"
>>   #include "l3fwd_common.h"
>>
>>   /*
>> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
>> uint16_t dst_port[FWDSTEP])
>>   			&dst_port[3], pkt[3]->packet_type);
>>   }
>>
>> -/*
>> - * Group consecutive packets with the same destination port in bursts of 4.
>> - * Suppose we have array of destination ports:
>> - * dst_port[] = {a, b, c, d,, e, ... }
>> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>> - * We doing 4 comparisons at once and the result is 4 bit mask.
>> - * This mask is used as an index into prebuild array of pnum values.
>> - */
>> -static inline uint16_t *
>> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>> -	     uint16x8_t dp2)
>> -{
>> -	union {
>> -		uint16_t u16[FWDSTEP + 1];
>> -		uint64_t u64;
>> -	} *pnum = (void *)pn;
>> -
>> -	int32_t v;
>> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>> -
>> -	dp1 = vceqq_u16(dp1, dp2);
>> -	dp1 = vandq_u16(dp1, mask);
>> -	v = vaddvq_u16(dp1);
>> -
>> -	/* update last port counter. */
>> -	lp[0] += gptbl[v].lpv;
>> -	rte_compiler_barrier();
>> -
>> -	/* if dest port value has changed. */
>> -	if (v != GRPMSK) {
>> -		pnum->u64 = gptbl[v].pnum;
>> -		pnum->u16[FWDSTEP] = 1;
>> -		lp = pnum->u16 + gptbl[v].idx;
>> -	}
>> -
>> -	return lp;
>> -}
>> -
>>   /**
>>    * Process one packet:
>>    * Update source and destination MAC addresses in the ethernet header.
>> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
>> rte_mbuf **pkts_burst,
>>   			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>>   			 */
>>   			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
>> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
>> dp2);
>>
>>   			/*
>>   			 * dp1:
>> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf, struct
>> rte_mbuf **pkts_burst,
>>   		 */
>>   		dp2 = vextq_u16(dp1, dp1, 1);
>>   		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
>> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>
>>   		/*
>>   		 * remove values added by the last repeated diff --git
>> a/examples/meson.build b/examples/meson.build index
>> 78de0e1f37..81e93799f2 100644
>> --- a/examples/meson.build
>> +++ b/examples/meson.build
>> @@ -97,7 +97,7 @@ foreach example: examples
>>       ldflags = default_ldflags
>>
>>       ext_deps = []
>> -    includes = [include_directories(example)]
>> +    includes = [include_directories(example, 'common')]
>>       deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>>       subdir(example)
>>
>> --
>> 2.25.1
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-06-17  7:51     ` Rahul Bhansali
@ 2022-06-21 12:55     ` Akhil Goyal
  2022-06-23  8:46     ` Zhang, Roy Fan
  2 siblings, 0 replies; 26+ messages in thread
From: Akhil Goyal @ 2022-06-21 12:55 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Radu Nicolau, Ruifeng Wang,
	Konstantin Ananyev, Fan Zhang, hemant.agrawal
  Cc: Jerin Jacob Kollanukkaran, Rahul Bhansali

> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by upto ~8% and inbound performance increased by
> upto ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
Acked-by: Akhil Goyal <gakhil@marvell.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-20  7:49   ` [EXT] " Akhil Goyal
  2022-06-20 10:45     ` Thomas Monjalon
@ 2022-06-21 12:56     ` Akhil Goyal
  1 sibling, 0 replies; 26+ messages in thread
From: Akhil Goyal @ 2022-06-21 12:56 UTC (permalink / raw)
  To: dev, Ruifeng Wang, thomas, ferruh.yigit, andrew.rybchenko,
	rasland, Konstantin Ananyev, maxime.coquelin, david.marchand
  Cc: Jerin Jacob Kollanukkaran, Rahul Bhansali

> 
> > This will make the packet grouping function common, so
> > that other examples can utilize as per need.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
Acked-by: Akhil Goyal <gakhil@marvell.com>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-20 23:13     ` Konstantin Ananyev
@ 2022-06-21 16:50       ` Rahul Bhansali
  2022-06-22 23:25         ` Konstantin Ananyev
  0 siblings, 1 reply; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-21 16:50 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: Jerin Jacob Kollanukkaran, dev, Ruifeng Wang



> -----Original Message-----
> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Sent: Tuesday, June 21, 2022 4:43 AM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group
> functionality
> 
> External Email
> 
> ----------------------------------------------------------------------
> 17/06/2022 08:50, Rahul Bhansali wrote:
> > CC: Konstantin Ananyev
> >
> >> -----Original Message-----
> >> From: Rahul Bhansali <rbhansali@marvell.com>
> >> Sent: Friday, June 17, 2022 1:13 PM
> >> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
> >> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> >> <rbhansali@marvell.com>
> >> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group
> >> functionality
> >>
> >> This will make the packet grouping function common, so that other
> >> examples can utilize as per need.
> >>
> >> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> >> ---
> >> Changes in v2: New patch to address review comment.
> >>
> >>   examples/common/neon_common.h |  50 ++++++++++++
> >>   examples/common/pkt_group.h   | 139
> >> ++++++++++++++++++++++++++++++++++
> >>   examples/l3fwd/Makefile       |   5 +-
> >>   examples/l3fwd/l3fwd.h        |   2 -
> >>   examples/l3fwd/l3fwd_common.h | 129 +------------------------------
> >>   examples/l3fwd/l3fwd_neon.h   |  43 +----------
> >>   examples/meson.build          |   2 +-
> >>   7 files changed, 198 insertions(+), 172 deletions(-)  create mode
> >> 100644 examples/common/neon_common.h  create mode 100644
> >> examples/common/pkt_group.h
> >>
> >> diff --git a/examples/common/neon_common.h
> >> b/examples/common/neon_common.h new file mode 100644 index
> >> 0000000000..f01b5ab6bc
> >> --- /dev/null
> >> +++ b/examples/common/neon_common.h
> >> @@ -0,0 +1,50 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _NEON_COMMON_H_
> >> +#define _NEON_COMMON_H_
> >> +
> >> +#include "pkt_group.h"
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port in bursts of 4.
> >> + * Suppose we have array of destination ports:
> >> + * dst_port[] = {a, b, c, d,, e, ... }
> >> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> + * We doing 4 comparisons at once and the result is 4 bit mask.
> >> + * This mask is used as an index into prebuild array of pnum values.
> >> + */
> >> +static inline uint16_t *
> >> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t
> dp1,
> >> +		  uint16x8_t dp2)
> >> +{
> >> +	union {
> >> +		uint16_t u16[FWDSTEP + 1];
> >> +		uint64_t u64;
> >> +	} *pnum = (void *)pn;
> >> +
> >> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> +	int32_t v;
> >> +
> >> +	dp1 = vceqq_u16(dp1, dp2);
> >> +	dp1 = vandq_u16(dp1, mask);
> >> +	v = vaddvq_u16(dp1);
> >> +
> >> +	/* update last port counter. */
> >> +	lp[0] += gptbl[v].lpv;
> >> +	rte_compiler_barrier();
> >> +
> >> +	/* if dest port value has changed. */
> >> +	if (v != GRPMSK) {
> >> +		pnum->u64 = gptbl[v].pnum;
> >> +		pnum->u16[FWDSTEP] = 1;
> >> +		lp = pnum->u16 + gptbl[v].idx;
> >> +	}
> >> +
> >> +	return lp;
> >> +}
> 
> Thanks for the effort.
> As I can see this function: port_groupx4() is nearly identical for all 3
> platforms: sse/neon/altivec (except of course built-in arch-specific intrinsics).
> In fact, even comments are identical.
> I wonder can we have something like:
> examples/common/<arch>/port_group.h
> and for each arch have port_groupx4(...) defined there?
> 
Yes, it's a good point. I was thinking of having the arch in the file name itself, but we can have arch-specific directories with different header files.
Do you want me to make changes for all 3 (sse/neon/altivec) or just neon?
I can check compilation for all, but validate functionality/perf for Neon only.

> >> +
> >> +#endif /* _NEON_COMMON_H_ */
> >> diff --git a/examples/common/pkt_group.h
> >> b/examples/common/pkt_group.h new file mode 100644 index
> >> 0000000000..8b26d9380f
> >> --- /dev/null
> >> +++ b/examples/common/pkt_group.h
> >> @@ -0,0 +1,139 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2016-2018 Intel Corporation.
> >> + * Copyright(c) 2017-2018 Linaro Limited.
> >> + * Copyright(C) 2022 Marvell.
> >> + */
> >> +
> >> +#ifndef _PKT_GROUP_H_
> >> +#define _PKT_GROUP_H_
> >> +
> >> +#define FWDSTEP	4
> >> +
> >> +/*
> >> + * Group consecutive packets with the same destination port into one burst.
> >> + * To avoid extra latency this is done together with some other
> >> +packet
> >> + * processing, but after we made a final decision about packet's destination.
> >> + * To do this we maintain:
> >> + * pnum - array of number of consecutive packets with the same dest
> >> +port for
> >> + * each packet in the input burst.
> >> + * lp - pointer to the last updated element in the pnum.
> >> + * dlp - dest port value lp corresponds to.
> >> + */
> >> +
> >> +#define	GRPSZ	(1 << FWDSTEP)
> >> +#define	GRPMSK	(GRPSZ - 1)
> >> +
> >> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> >> +	if (likely((dlp) == (dcp)[(idx)])) {         \
> >> +		(lp)[0]++;                           \
> >> +	} else {                                     \
> >> +		(dlp) = (dcp)[idx];                  \
> >> +		(lp) = (pn) + (idx);                 \
> >> +		(lp)[0] = 1;                         \
> >> +	}                                            \
> >> +} while (0)
> >> +
> >> +static const struct {
> >> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> +	int32_t  idx;  /* index for new last updated element. */
> >> +	uint16_t lpv;  /* add value to the last updated element. */ }
> >> +gptbl[GRPSZ] = {
> >> +	{
> >> +		/* 0: a != b, b != c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100010001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 1: a == b, b != c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100010002),
> >> +		.idx = 4,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 2: a != b, b == c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100020001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 3: a == b, b == c, c != d, d != e */
> >> +		.pnum = UINT64_C(0x0001000100020003),
> >> +		.idx = 4,
> >> +		.lpv = 2,
> >> +	},
> >> +	{
> >> +		/* 4: a != b, b != c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200010001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 5: a == b, b != c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200010002),
> >> +		.idx = 4,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 6: a != b, b == c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200030001),
> >> +		.idx = 4,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 7: a == b, b == c, c == d, d != e */
> >> +		.pnum = UINT64_C(0x0001000200030004),
> >> +		.idx = 4,
> >> +		.lpv = 3,
> >> +	},
> >> +	{
> >> +		/* 8: a != b, b != c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100010001),
> >> +		.idx = 3,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 9: a == b, b != c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100010002),
> >> +		.idx = 3,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 0xa: a != b, b == c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100020001),
> >> +		.idx = 3,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xb: a == b, b == c, c != d, d == e */
> >> +		.pnum = UINT64_C(0x0002000100020003),
> >> +		.idx = 3,
> >> +		.lpv = 2,
> >> +	},
> >> +	{
> >> +		/* 0xc: a != b, b != c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300010001),
> >> +		.idx = 2,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xd: a == b, b != c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300010002),
> >> +		.idx = 2,
> >> +		.lpv = 1,
> >> +	},
> >> +	{
> >> +		/* 0xe: a != b, b == c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300040001),
> >> +		.idx = 1,
> >> +		.lpv = 0,
> >> +	},
> >> +	{
> >> +		/* 0xf: a == b, b == c, c == d, d == e */
> >> +		.pnum = UINT64_C(0x0002000300040005),
> >> +		.idx = 0,
> >> +		.lpv = 4,
> >> +	},
> >> +};
> >> +
> >> +#endif /* _PKT_GROUP_H_ */
> >> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
> >> 8efe6378e2..8dbe85c2e6 100644
> >> --- a/examples/l3fwd/Makefile
> >> +++ b/examples/l3fwd/Makefile
> >> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
> >>   static: build/$(APP)-static
> >>   	ln -sf $(APP)-static build/$(APP)
> >>
> >> +INCLUDES =-I../common
> >>   PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS
> >> += -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
> 'rte_eth_link_to_str()'
> >> @@ -38,10 +39,10 @@ endif
> >>   endif
> >>
> >>   build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> >> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_SHARED)
> >>
> >>   build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> >> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> >> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> >> +$(LDFLAGS_STATIC)
> >>
> >>   build:
> >>   	@mkdir -p $@
> >> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
> >> 8a52c90755..40b5f32a9e 100644
> >> --- a/examples/l3fwd/l3fwd.h
> >> +++ b/examples/l3fwd/l3fwd.h
> >> @@ -44,8 +44,6 @@
> >>   /* Used to mark destination port as 'invalid'. */
> >>   #define	BAD_PORT ((uint16_t)-1)
> >>
> >> -#define FWDSTEP	4
> >> -
> >>   /* replace first 12B of the ethernet header. */
> >>   #define	MASK_ETH 0x3f
> >>
> >> diff --git a/examples/l3fwd/l3fwd_common.h
> >> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
> >> --- a/examples/l3fwd/l3fwd_common.h
> >> +++ b/examples/l3fwd/l3fwd_common.h
> >> @@ -7,6 +7,8 @@
> >>   #ifndef _L3FWD_COMMON_H_
> >>   #define _L3FWD_COMMON_H_
> >>
> >> +#include "pkt_group.h"
> >> +
> >>   #ifdef DO_RFC_1812_CHECKS
> >>
> >>   #define	IPV4_MIN_VER_IHL	0x45
> >> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr,
> >> uint16_t *dp, uint32_t ptype)
> >>   #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
> >>   #endif /* DO_RFC_1812_CHECKS */
> >>
> >> -/*
> >> - * We group consecutive packets with the same destination port into one
> burst.
> >> - * To avoid extra latency this is done together with some other
> >> packet
> >> - * processing, but after we made a final decision about packet's destination.
> >> - * To do this we maintain:
> >> - * pnum - array of number of consecutive packets with the same dest
> >> port for
> >> - * each packet in the input burst.
> >> - * lp - pointer to the last updated element in the pnum.
> >> - * dlp - dest port value lp corresponds to.
> >> - */
> >> -
> >> -#define	GRPSZ	(1 << FWDSTEP)
> >> -#define	GRPMSK	(GRPSZ - 1)
> >> -
> >> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> >> -	if (likely((dlp) == (dcp)[(idx)])) {             \
> >> -		(lp)[0]++;                                   \
> >> -	} else {                                         \
> >> -		(dlp) = (dcp)[idx];                          \
> >> -		(lp) = (pn) + (idx);                         \
> >> -		(lp)[0] = 1;                                 \
> >> -	}                                                \
> >> -} while (0)
> >> -
> >> -static const struct {
> >> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> >> -	int32_t  idx;  /* index for new last updated element. */
> >> -	uint16_t lpv;  /* add value to the last updated element. */
> >> -} gptbl[GRPSZ] = {
> >> -	{
> >> -		/* 0: a != b, b != c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100010001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 1: a == b, b != c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100010002),
> >> -		.idx = 4,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 2: a != b, b == c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100020001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 3: a == b, b == c, c != d, d != e */
> >> -		.pnum = UINT64_C(0x0001000100020003),
> >> -		.idx = 4,
> >> -		.lpv = 2,
> >> -	},
> >> -	{
> >> -		/* 4: a != b, b != c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200010001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 5: a == b, b != c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200010002),
> >> -		.idx = 4,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 6: a != b, b == c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200030001),
> >> -		.idx = 4,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 7: a == b, b == c, c == d, d != e */
> >> -		.pnum = UINT64_C(0x0001000200030004),
> >> -		.idx = 4,
> >> -		.lpv = 3,
> >> -	},
> >> -	{
> >> -		/* 8: a != b, b != c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100010001),
> >> -		.idx = 3,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 9: a == b, b != c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100010002),
> >> -		.idx = 3,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 0xa: a != b, b == c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100020001),
> >> -		.idx = 3,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xb: a == b, b == c, c != d, d == e */
> >> -		.pnum = UINT64_C(0x0002000100020003),
> >> -		.idx = 3,
> >> -		.lpv = 2,
> >> -	},
> >> -	{
> >> -		/* 0xc: a != b, b != c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300010001),
> >> -		.idx = 2,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xd: a == b, b != c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300010002),
> >> -		.idx = 2,
> >> -		.lpv = 1,
> >> -	},
> >> -	{
> >> -		/* 0xe: a != b, b == c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300040001),
> >> -		.idx = 1,
> >> -		.lpv = 0,
> >> -	},
> >> -	{
> >> -		/* 0xf: a == b, b == c, c == d, d == e */
> >> -		.pnum = UINT64_C(0x0002000300040005),
> >> -		.idx = 0,
> >> -		.lpv = 4,
> >> -	},
> >> -};
> >> -
> >>   static __rte_always_inline void
> >>   send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf
> *m[],
> >>   		uint32_t num)
> >> diff --git a/examples/l3fwd/l3fwd_neon.h
> >> b/examples/l3fwd/l3fwd_neon.h index e3d33a5229..5fa765b640 100644
> >> --- a/examples/l3fwd/l3fwd_neon.h
> >> +++ b/examples/l3fwd/l3fwd_neon.h
> >> @@ -7,6 +7,7 @@
> >>   #define _L3FWD_NEON_H_
> >>
> >>   #include "l3fwd.h"
> >> +#include "neon_common.h"
> >>   #include "l3fwd_common.h"
> >>
> >>   /*
> >> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
> >> uint16_t dst_port[FWDSTEP])
> >>   			&dst_port[3], pkt[3]->packet_type);
> >>   }
> >>
> >> -/*
> >> - * Group consecutive packets with the same destination port in bursts of 4.
> >> - * Suppose we have array of destination ports:
> >> - * dst_port[] = {a, b, c, d,, e, ... }
> >> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> >> - * We doing 4 comparisons at once and the result is 4 bit mask.
> >> - * This mask is used as an index into prebuild array of pnum values.
> >> - */
> >> -static inline uint16_t *
> >> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> >> -	     uint16x8_t dp2)
> >> -{
> >> -	union {
> >> -		uint16_t u16[FWDSTEP + 1];
> >> -		uint64_t u64;
> >> -	} *pnum = (void *)pn;
> >> -
> >> -	int32_t v;
> >> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> >> -
> >> -	dp1 = vceqq_u16(dp1, dp2);
> >> -	dp1 = vandq_u16(dp1, mask);
> >> -	v = vaddvq_u16(dp1);
> >> -
> >> -	/* update last port counter. */
> >> -	lp[0] += gptbl[v].lpv;
> >> -	rte_compiler_barrier();
> >> -
> >> -	/* if dest port value has changed. */
> >> -	if (v != GRPMSK) {
> >> -		pnum->u64 = gptbl[v].pnum;
> >> -		pnum->u16[FWDSTEP] = 1;
> >> -		lp = pnum->u16 + gptbl[v].idx;
> >> -	}
> >> -
> >> -	return lp;
> >> -}
> >> -
> >>   /**
> >>    * Process one packet:
> >>    * Update source and destination MAC addresses in the ethernet header.
> >> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>   			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> >>   			 */
> >>   			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> >> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> >> dp2);
> >>
> >>   			/*
> >>   			 * dp1:
> >> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf,
> >> struct rte_mbuf **pkts_burst,
> >>   		 */
> >>   		dp2 = vextq_u16(dp1, dp1, 1);
> >>   		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> >> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
> >>
> >>   		/*
> >>   		 * remove values added by the last repeated diff --git
> >> a/examples/meson.build b/examples/meson.build index
> >> 78de0e1f37..81e93799f2 100644
> >> --- a/examples/meson.build
> >> +++ b/examples/meson.build
> >> @@ -97,7 +97,7 @@ foreach example: examples
> >>       ldflags = default_ldflags
> >>
> >>       ext_deps = []
> >> -    includes = [include_directories(example)]
> >> +    includes = [include_directories(example, 'common')]
> >>       deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
> >>       subdir(example)
> >>
> >> --
> >> 2.25.1
> >


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group functionality
  2022-06-21 16:50       ` [EXT] " Rahul Bhansali
@ 2022-06-22 23:25         ` Konstantin Ananyev
  0 siblings, 0 replies; 26+ messages in thread
From: Konstantin Ananyev @ 2022-06-22 23:25 UTC (permalink / raw)
  To: Rahul Bhansali; +Cc: Jerin Jacob Kollanukkaran, dev, Ruifeng Wang

21/06/2022 17:50, Rahul Bhansali wrote:
> 
> 
>> -----Original Message-----
>> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>> Sent: Tuesday, June 21, 2022 4:43 AM
>> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Ruifeng Wang
>> <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
>> Subject: [EXT] Re: [PATCH v2 1/2] examples/l3fwd: common packet group
>> functionality
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> 17/06/2022 08:50, Rahul Bhansali wrote:
>>> CC: Konstantin Ananyev
>>>
>>>> -----Original Message-----
>>>> From: Rahul Bhansali <rbhansali@marvell.com>
>>>> Sent: Friday, June 17, 2022 1:13 PM
>>>> To: dev@dpdk.org; Ruifeng Wang <ruifeng.wang@arm.com>
>>>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
>>>> <rbhansali@marvell.com>
>>>> Subject: [PATCH v2 1/2] examples/l3fwd: common packet group
>>>> functionality
>>>>
>>>> This will make the packet grouping function common, so that other
>>>> examples can utilize as per need.
>>>>
>>>> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
>>>> ---
>>>> Changes in v2: New patch to address review comment.
>>>>
>>>>    examples/common/neon_common.h |  50 ++++++++++++
>>>>    examples/common/pkt_group.h   | 139
>>>> ++++++++++++++++++++++++++++++++++
>>>>    examples/l3fwd/Makefile       |   5 +-
>>>>    examples/l3fwd/l3fwd.h        |   2 -
>>>>    examples/l3fwd/l3fwd_common.h | 129 +------------------------------
>>>>    examples/l3fwd/l3fwd_neon.h   |  43 +----------
>>>>    examples/meson.build          |   2 +-
>>>>    7 files changed, 198 insertions(+), 172 deletions(-)  create mode
>>>> 100644 examples/common/neon_common.h  create mode 100644
>>>> examples/common/pkt_group.h
>>>>
>>>> diff --git a/examples/common/neon_common.h
>>>> b/examples/common/neon_common.h new file mode 100644 index
>>>> 0000000000..f01b5ab6bc
>>>> --- /dev/null
>>>> +++ b/examples/common/neon_common.h
>>>> @@ -0,0 +1,50 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2016-2018 Intel Corporation.
>>>> + * Copyright(c) 2017-2018 Linaro Limited.
>>>> + * Copyright(C) 2022 Marvell.
>>>> + */
>>>> +
>>>> +#ifndef _NEON_COMMON_H_
>>>> +#define _NEON_COMMON_H_
>>>> +
>>>> +#include "pkt_group.h"
>>>> +
>>>> +/*
>>>> + * Group consecutive packets with the same destination port in bursts of 4.
>>>> + * Suppose we have array of destination ports:
>>>> + * dst_port[] = {a, b, c, d,, e, ... }
>>>> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>>>> + * We doing 4 comparisons at once and the result is 4 bit mask.
>>>> + * This mask is used as an index into prebuild array of pnum values.
>>>> + */
>>>> +static inline uint16_t *
>>>> +neon_port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t
>> dp1,
>>>> +		  uint16x8_t dp2)
>>>> +{
>>>> +	union {
>>>> +		uint16_t u16[FWDSTEP + 1];
>>>> +		uint64_t u64;
>>>> +	} *pnum = (void *)pn;
>>>> +
>>>> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>>>> +	int32_t v;
>>>> +
>>>> +	dp1 = vceqq_u16(dp1, dp2);
>>>> +	dp1 = vandq_u16(dp1, mask);
>>>> +	v = vaddvq_u16(dp1);
>>>> +
>>>> +	/* update last port counter. */
>>>> +	lp[0] += gptbl[v].lpv;
>>>> +	rte_compiler_barrier();
>>>> +
>>>> +	/* if dest port value has changed. */
>>>> +	if (v != GRPMSK) {
>>>> +		pnum->u64 = gptbl[v].pnum;
>>>> +		pnum->u16[FWDSTEP] = 1;
>>>> +		lp = pnum->u16 + gptbl[v].idx;
>>>> +	}
>>>> +
>>>> +	return lp;
>>>> +}
>>
>> Thanks for the effort.
>> As I can see this function: port_groupx4() is nearly identical for all 3
>> platforms: sse/neon/altivec (except of course built-in arch-specific intrinsics).
>> In fact, even comments are identical.
>> I wonder can we have something like:
>> examples/common/<arch>/port_group.h
>> and for each arch have port_groupx4(...) defined there?
>>
> Yes, it's a good point. I was thinking of having the arch in the file name itself, but we can have arch-specific directories with different header files.
> Do you want me to make changes for all 3 (sse/neon/altivec) or just neon?

My thought was to move the headers for all archs.

> I can check compilation for all, but validate functionality/perf for Neon only.

I can do a quick functional test for x86.
Plus, I think l3fwd is part of release cycle testing anyway.
Thanks
Konstantin

>>>> +
>>>> +#endif /* _NEON_COMMON_H_ */
>>>> diff --git a/examples/common/pkt_group.h
>>>> b/examples/common/pkt_group.h new file mode 100644 index
>>>> 0000000000..8b26d9380f
>>>> --- /dev/null
>>>> +++ b/examples/common/pkt_group.h
>>>> @@ -0,0 +1,139 @@
>>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>>> + * Copyright(c) 2016-2018 Intel Corporation.
>>>> + * Copyright(c) 2017-2018 Linaro Limited.
>>>> + * Copyright(C) 2022 Marvell.
>>>> + */
>>>> +
>>>> +#ifndef _PKT_GROUP_H_
>>>> +#define _PKT_GROUP_H_
>>>> +
>>>> +#define FWDSTEP	4
>>>> +
>>>> +/*
>>>> + * Group consecutive packets with the same destination port into one burst.
>>>> + * To avoid extra latency this is done together with some other
>>>> +packet
>>>> + * processing, but after we made a final decision about packet's destination.
>>>> + * To do this we maintain:
>>>> + * pnum - array of number of consecutive packets with the same dest
>>>> +port for
>>>> + * each packet in the input burst.
>>>> + * lp - pointer to the last updated element in the pnum.
>>>> + * dlp - dest port value lp corresponds to.
>>>> + */
>>>> +
>>>> +#define	GRPSZ	(1 << FWDSTEP)
>>>> +#define	GRPMSK	(GRPSZ - 1)
>>>> +
>>>> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>>>> +	if (likely((dlp) == (dcp)[(idx)])) {         \
>>>> +		(lp)[0]++;                           \
>>>> +	} else {                                     \
>>>> +		(dlp) = (dcp)[idx];                  \
>>>> +		(lp) = (pn) + (idx);                 \
>>>> +		(lp)[0] = 1;                         \
>>>> +	}                                            \
>>>> +} while (0)
>>>> +
>>>> +static const struct {
>>>> +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>>>> +	int32_t  idx;  /* index for new last updated element. */
>>>> +	uint16_t lpv;  /* add value to the last updated element. */ }
>>>> +gptbl[GRPSZ] = {
>>>> +	{
>>>> +		/* 0: a != b, b != c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100010001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 1: a == b, b != c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100010002),
>>>> +		.idx = 4,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 2: a != b, b == c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100020001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 3: a == b, b == c, c != d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000100020003),
>>>> +		.idx = 4,
>>>> +		.lpv = 2,
>>>> +	},
>>>> +	{
>>>> +		/* 4: a != b, b != c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200010001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 5: a == b, b != c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200010002),
>>>> +		.idx = 4,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 6: a != b, b == c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200030001),
>>>> +		.idx = 4,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 7: a == b, b == c, c == d, d != e */
>>>> +		.pnum = UINT64_C(0x0001000200030004),
>>>> +		.idx = 4,
>>>> +		.lpv = 3,
>>>> +	},
>>>> +	{
>>>> +		/* 8: a != b, b != c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100010001),
>>>> +		.idx = 3,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 9: a == b, b != c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100010002),
>>>> +		.idx = 3,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 0xa: a != b, b == c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100020001),
>>>> +		.idx = 3,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xb: a == b, b == c, c != d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000100020003),
>>>> +		.idx = 3,
>>>> +		.lpv = 2,
>>>> +	},
>>>> +	{
>>>> +		/* 0xc: a != b, b != c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300010001),
>>>> +		.idx = 2,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xd: a == b, b != c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300010002),
>>>> +		.idx = 2,
>>>> +		.lpv = 1,
>>>> +	},
>>>> +	{
>>>> +		/* 0xe: a != b, b == c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300040001),
>>>> +		.idx = 1,
>>>> +		.lpv = 0,
>>>> +	},
>>>> +	{
>>>> +		/* 0xf: a == b, b == c, c == d, d == e */
>>>> +		.pnum = UINT64_C(0x0002000300040005),
>>>> +		.idx = 0,
>>>> +		.lpv = 4,
>>>> +	},
>>>> +};
>>>> +
>>>> +#endif /* _PKT_GROUP_H_ */
>>>> diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile index
>>>> 8efe6378e2..8dbe85c2e6 100644
>>>> --- a/examples/l3fwd/Makefile
>>>> +++ b/examples/l3fwd/Makefile
>>>> @@ -22,6 +22,7 @@ shared: build/$(APP)-shared
>>>>    static: build/$(APP)-static
>>>>    	ln -sf $(APP)-static build/$(APP)
>>>>
>>>> +INCLUDES =-I../common
>>>>    PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS
>>>> += -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
>> 'rte_eth_link_to_str()'
>>>> @@ -38,10 +39,10 @@ endif
>>>>    endif
>>>>
>>>>    build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
>>>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
>>>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>>>> +$(LDFLAGS_SHARED)
>>>>
>>>>    build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
>>>> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
>>>> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
>>>> +$(LDFLAGS_STATIC)
>>>>
>>>>    build:
>>>>    	@mkdir -p $@
>>>> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index
>>>> 8a52c90755..40b5f32a9e 100644
>>>> --- a/examples/l3fwd/l3fwd.h
>>>> +++ b/examples/l3fwd/l3fwd.h
>>>> @@ -44,8 +44,6 @@
>>>>    /* Used to mark destination port as 'invalid'. */
>>>>    #define	BAD_PORT ((uint16_t)-1)
>>>>
>>>> -#define FWDSTEP	4
>>>> -
>>>>    /* replace first 12B of the ethernet header. */
>>>>    #define	MASK_ETH 0x3f
>>>>
>>>> diff --git a/examples/l3fwd/l3fwd_common.h
>>>> b/examples/l3fwd/l3fwd_common.h index 8e4c27218f..224b1c08e8 100644
>>>> --- a/examples/l3fwd/l3fwd_common.h
>>>> +++ b/examples/l3fwd/l3fwd_common.h
>>>> @@ -7,6 +7,8 @@
>>>>    #ifndef _L3FWD_COMMON_H_
>>>>    #define _L3FWD_COMMON_H_
>>>>
>>>> +#include "pkt_group.h"
>>>> +
>>>>    #ifdef DO_RFC_1812_CHECKS
>>>>
>>>>    #define	IPV4_MIN_VER_IHL	0x45
>>>> @@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr,
>>>> uint16_t *dp, uint32_t ptype)
>>>>    #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
>>>>    #endif /* DO_RFC_1812_CHECKS */
>>>>
>>>> -/*
>>>> - * We group consecutive packets with the same destination port into one
>> burst.
>>>> - * To avoid extra latency this is done together with some other
>>>> packet
>>>> - * processing, but after we made a final decision about packet's destination.
>>>> - * To do this we maintain:
>>>> - * pnum - array of number of consecutive packets with the same dest
>>>> port for
>>>> - * each packet in the input burst.
>>>> - * lp - pointer to the last updated element in the pnum.
>>>> - * dlp - dest port value lp corresponds to.
>>>> - */
>>>> -
>>>> -#define	GRPSZ	(1 << FWDSTEP)
>>>> -#define	GRPMSK	(GRPSZ - 1)
>>>> -
>>>> -#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
>>>> -	if (likely((dlp) == (dcp)[(idx)])) {             \
>>>> -		(lp)[0]++;                                   \
>>>> -	} else {                                         \
>>>> -		(dlp) = (dcp)[idx];                          \
>>>> -		(lp) = (pn) + (idx);                         \
>>>> -		(lp)[0] = 1;                                 \
>>>> -	}                                                \
>>>> -} while (0)
>>>> -
>>>> -static const struct {
>>>> -	uint64_t pnum; /* prebuild 4 values for pnum[]. */
>>>> -	int32_t  idx;  /* index for new last updated element. */
>>>> -	uint16_t lpv;  /* add value to the last updated element. */
>>>> -} gptbl[GRPSZ] = {
>>>> -	{
>>>> -		/* 0: a != b, b != c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100010001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 1: a == b, b != c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100010002),
>>>> -		.idx = 4,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 2: a != b, b == c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100020001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 3: a == b, b == c, c != d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000100020003),
>>>> -		.idx = 4,
>>>> -		.lpv = 2,
>>>> -	},
>>>> -	{
>>>> -		/* 4: a != b, b != c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200010001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 5: a == b, b != c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200010002),
>>>> -		.idx = 4,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 6: a != b, b == c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200030001),
>>>> -		.idx = 4,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 7: a == b, b == c, c == d, d != e */
>>>> -		.pnum = UINT64_C(0x0001000200030004),
>>>> -		.idx = 4,
>>>> -		.lpv = 3,
>>>> -	},
>>>> -	{
>>>> -		/* 8: a != b, b != c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100010001),
>>>> -		.idx = 3,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 9: a == b, b != c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100010002),
>>>> -		.idx = 3,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 0xa: a != b, b == c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100020001),
>>>> -		.idx = 3,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xb: a == b, b == c, c != d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000100020003),
>>>> -		.idx = 3,
>>>> -		.lpv = 2,
>>>> -	},
>>>> -	{
>>>> -		/* 0xc: a != b, b != c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300010001),
>>>> -		.idx = 2,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xd: a == b, b != c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300010002),
>>>> -		.idx = 2,
>>>> -		.lpv = 1,
>>>> -	},
>>>> -	{
>>>> -		/* 0xe: a != b, b == c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300040001),
>>>> -		.idx = 1,
>>>> -		.lpv = 0,
>>>> -	},
>>>> -	{
>>>> -		/* 0xf: a == b, b == c, c == d, d == e */
>>>> -		.pnum = UINT64_C(0x0002000300040005),
>>>> -		.idx = 0,
>>>> -		.lpv = 4,
>>>> -	},
>>>> -};
>>>> -
>>>>    static __rte_always_inline void
>>>>    send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf
>> *m[],
>>>>    		uint32_t num)
>>>> diff --git a/examples/l3fwd/l3fwd_neon.h
>>>> b/examples/l3fwd/l3fwd_neon.h index e3d33a5229..5fa765b640 100644
>>>> --- a/examples/l3fwd/l3fwd_neon.h
>>>> +++ b/examples/l3fwd/l3fwd_neon.h
>>>> @@ -7,6 +7,7 @@
>>>>    #define _L3FWD_NEON_H_
>>>>
>>>>    #include "l3fwd.h"
>>>> +#include "neon_common.h"
>>>>    #include "l3fwd_common.h"
>>>>
>>>>    /*
>>>> @@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP],
>>>> uint16_t dst_port[FWDSTEP])
>>>>    			&dst_port[3], pkt[3]->packet_type);
>>>>    }
>>>>
>>>> -/*
>>>> - * Group consecutive packets with the same destination port in bursts of 4.
>>>> - * Suppose we have array of destination ports:
>>>> - * dst_port[] = {a, b, c, d,, e, ... }
>>>> - * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
>>>> - * We doing 4 comparisons at once and the result is 4 bit mask.
>>>> - * This mask is used as an index into prebuild array of pnum values.
>>>> - */
>>>> -static inline uint16_t *
>>>> -port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
>>>> -	     uint16x8_t dp2)
>>>> -{
>>>> -	union {
>>>> -		uint16_t u16[FWDSTEP + 1];
>>>> -		uint64_t u64;
>>>> -	} *pnum = (void *)pn;
>>>> -
>>>> -	int32_t v;
>>>> -	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
>>>> -
>>>> -	dp1 = vceqq_u16(dp1, dp2);
>>>> -	dp1 = vandq_u16(dp1, mask);
>>>> -	v = vaddvq_u16(dp1);
>>>> -
>>>> -	/* update last port counter. */
>>>> -	lp[0] += gptbl[v].lpv;
>>>> -	rte_compiler_barrier();
>>>> -
>>>> -	/* if dest port value has changed. */
>>>> -	if (v != GRPMSK) {
>>>> -		pnum->u64 = gptbl[v].pnum;
>>>> -		pnum->u16[FWDSTEP] = 1;
>>>> -		lp = pnum->u16 + gptbl[v].idx;
>>>> -	}
>>>> -
>>>> -	return lp;
>>>> -}
>>>> -
>>>>    /**
>>>>     * Process one packet:
>>>>     * Update source and destination MAC addresses in the ethernet header.
>>>> @@ -161,7 +124,7 @@ send_packets_multi(struct lcore_conf *qconf,
>>>> struct rte_mbuf **pkts_burst,
>>>>    			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
>>>>    			 */
>>>>    			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
>>>> -			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>> +			lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
>>>> dp2);
>>>>
>>>>    			/*
>>>>    			 * dp1:
>>>> @@ -175,7 +138,7 @@ send_packets_multi(struct lcore_conf *qconf,
>>>> struct rte_mbuf **pkts_burst,
>>>>    		 */
>>>>    		dp2 = vextq_u16(dp1, dp1, 1);
>>>>    		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
>>>> -		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>> +		lp  = neon_port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
>>>>
>>>>    		/*
>>>>    		 * remove values added by the last repeated diff --git
>>>> a/examples/meson.build b/examples/meson.build index
>>>> 78de0e1f37..81e93799f2 100644
>>>> --- a/examples/meson.build
>>>> +++ b/examples/meson.build
>>>> @@ -97,7 +97,7 @@ foreach example: examples
>>>>        ldflags = default_ldflags
>>>>
>>>>        ext_deps = []
>>>> -    includes = [include_directories(example)]
>>>> +    includes = [include_directories(example, 'common')]
>>>>        deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
>>>>        subdir(example)
>>>>
>>>> --
>>>> 2.25.1
>>>
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-06-17  7:51     ` Rahul Bhansali
  2022-06-21 12:55     ` Akhil Goyal
@ 2022-06-23  8:46     ` Zhang, Roy Fan
  2022-06-23  9:37       ` Rahul Bhansali
  2 siblings, 1 reply; 26+ messages in thread
From: Zhang, Roy Fan @ 2022-06-23  8:46 UTC (permalink / raw)
  To: Rahul Bhansali, dev, Nicolau, Radu, Akhil Goyal, Ruifeng Wang; +Cc: jerinj

Hi Rahul

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 8:43 AM
> To: dev@dpdk.org; Nicolau, Radu <radu.nicolau@intel.com>; Akhil Goyal
> <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: jerinj@marvell.com; Rahul Bhansali <rbhansali@marvell.com>
> Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll
> mode
> 
> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by upto ~8% and inbound performance increased by
> upto ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
--snip--
> 
>  static inline void
> @@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct
> rte_ether_addr *addr)
>  		return -EINVAL;
> 

Fan: I failed to understand why we need to overwrite the address to do an
address copy here. Was it a bug?

>  	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> +			    (struct rte_ether_addr *)(val_eth + port));
>  	return 0;
>  }
> 
> @@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads,
> uint64_t req_tx_offloads)
>  			portid, rte_strerror(-ret));
> 
>  	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);

Fan: Same here 

> +
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> +			    (struct rte_ether_addr *)(val_eth + portid));
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> +
>  	print_ethaddr("Address: ", &ethaddr);
>  	printf("\n");

^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-23  8:46     ` Zhang, Roy Fan
@ 2022-06-23  9:37       ` Rahul Bhansali
  0 siblings, 0 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-23  9:37 UTC (permalink / raw)
  To: Zhang, Roy Fan, dev, Nicolau, Radu, Akhil Goyal, Ruifeng Wang
  Cc: Jerin Jacob Kollanukkaran



> -----Original Message-----
> From: Zhang, Roy Fan <roy.fan.zhang@intel.com>
> Sent: Thursday, June 23, 2022 2:17 PM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Nicolau, Radu
> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON
> with poll mode
> 
> External Email
> 
> ----------------------------------------------------------------------
> Hi Rahul
> 
> > -----Original Message-----
> > From: Rahul Bhansali <rbhansali@marvell.com>
> > Sent: Friday, June 17, 2022 8:43 AM
> > To: dev@dpdk.org; Nicolau, Radu <radu.nicolau@intel.com>; Akhil Goyal
> > <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> > Cc: jerinj@marvell.com; Rahul Bhansali <rbhansali@marvell.com>
> > Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with
> > poll mode
> >
> > This adds the support of NEON based lpm lookup along with multi packet
> > processing for burst send in packets routing.
> >
> > Performance impact:
> > On cn10k, with poll mode inline protocol, outbound performance
> > increased by upto ~8% and inbound performance increased by upto ~6%.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> --snip--
> >
> >  static inline void
> > @@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct
> > rte_ether_addr *addr)
> >  		return -EINVAL;
> >
> 
> Fan: I failed to understand why we need to overwrite the address and then do an
> address copy here. Was it a bug?

It is not overwriting the ethaddr_tbl[port].dst address; instead, it copies dst into the xmm_t val_eth array, in the format required by the NEON-based packet processing path when routing.

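(Illustrative aside, not part of the thread: a minimal sketch of the val_eth[] layout this copy produces, assuming the usual 16-byte per-port format consumed by the vector TX path. The helper name set_port_macs() is hypothetical; it only restates what the two rte_ether_addr_copy() calls in the patch do.)

/*
 * Per-port layout of val_eth[port] (16 bytes):
 *   bytes 0..5   destination MAC
 *   bytes 6..11  source MAC
 *   bytes 12..15 unused
 * The NEON routing path can then rewrite the outgoing Ethernet header with
 * a single 16-byte vector store, keeping only lane 3 (EtherType plus the
 * first two bytes of the L3 header) from the original packet.
 */
#include <rte_ether.h>
#include <rte_vect.h>

extern xmm_t val_eth[RTE_MAX_ETHPORTS];

static inline void
set_port_macs(uint16_t port, const struct rte_ether_addr *dst,
	      const struct rte_ether_addr *src)
{
	struct rte_ether_addr *p = (struct rte_ether_addr *)&val_eth[port];

	rte_ether_addr_copy(dst, p);     /* header bytes 0..5  */
	rte_ether_addr_copy(src, p + 1); /* header bytes 6..11 */
}
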
> 
> >  	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> > +			    (struct rte_ether_addr *)(val_eth + port));
> >  	return 0;
> >  }
> >
> > @@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t
> > req_rx_offloads, uint64_t req_tx_offloads)
> >  			portid, rte_strerror(-ret));
> >
> >  	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> 
> Fan: Same here
> 
> > +
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> > +			    (struct rte_ether_addr *)(val_eth + portid));
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> > +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> > +
> >  	print_ethaddr("Address: ", &ethaddr);
> >  	printf("\n");

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-05-24  9:57 [PATCH] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-05-24 23:00 ` Konstantin Ananyev
  2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
@ 2022-06-23  9:38 ` Rahul Bhansali
  2022-06-23  9:38   ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
                     ` (3 more replies)
  2 siblings, 4 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-23  9:38 UTC (permalink / raw)
  To: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev
  Cc: jerinj, gakhil, Rahul Bhansali

This will make the packet grouping function common, so
that other examples can utilize as per need.

For each architecture sse/neon/altivec, port group
headers will be created under examples/common/<arch>.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v3: Created common port-group headers for
architectures sse/neon/altivec as suggested by Konstantin.

Changes in v2: New patch to address review comment.

 examples/common/altivec/port_group.h |  48 +++++++++
 examples/common/neon/port_group.h    |  50 ++++++++++
 examples/common/pkt_group.h          | 139 +++++++++++++++++++++++++++
 examples/common/sse/port_group.h     |  47 +++++++++
 examples/l3fwd/Makefile              |   5 +-
 examples/l3fwd/l3fwd.h               |   2 -
 examples/l3fwd/l3fwd_altivec.h       |  37 +------
 examples/l3fwd/l3fwd_common.h        | 129 +------------------------
 examples/l3fwd/l3fwd_neon.h          |  39 +-------
 examples/l3fwd/l3fwd_sse.h           |  36 +------
 examples/meson.build                 |   2 +-
 11 files changed, 293 insertions(+), 241 deletions(-)
 create mode 100644 examples/common/altivec/port_group.h
 create mode 100644 examples/common/neon/port_group.h
 create mode 100644 examples/common/pkt_group.h
 create mode 100644 examples/common/sse/port_group.h

diff --git a/examples/common/altivec/port_group.h b/examples/common/altivec/port_group.h
new file mode 100644
index 0000000000..d96d14ca94
--- /dev/null
+++ b/examples/common/altivec/port_group.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2017 IBM Corporation.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PORT_GROUP_H_
+#define _PORT_GROUP_H_
+
+#include "pkt_group.h"
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
+	     __vector unsigned short dp1,
+	     __vector unsigned short dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+
+	v = vec_any_eq(dp1, dp2);
+
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+#endif /* _PORT_GROUP_H_ */
diff --git a/examples/common/neon/port_group.h b/examples/common/neon/port_group.h
new file mode 100644
index 0000000000..82c6ed6d73
--- /dev/null
+++ b/examples/common/neon/port_group.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PORT_GROUP_H_
+#define _PORT_GROUP_H_
+
+#include "pkt_group.h"
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+		  uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+	int32_t v;
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+#endif /* _PORT_GROUP_H_ */
diff --git a/examples/common/pkt_group.h b/examples/common/pkt_group.h
new file mode 100644
index 0000000000..8b26d9380f
--- /dev/null
+++ b/examples/common/pkt_group.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PKT_GROUP_H_
+#define _PKT_GROUP_H_
+
+#define FWDSTEP	4
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {         \
+		(lp)[0]++;                           \
+	} else {                                     \
+		(dlp) = (dcp)[idx];                  \
+		(lp) = (pn) + (idx);                 \
+		(lp)[0] = 1;                         \
+	}                                            \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+#endif /* _PKT_GROUP_H_ */
diff --git a/examples/common/sse/port_group.h b/examples/common/sse/port_group.h
new file mode 100644
index 0000000000..1ec09f8e4e
--- /dev/null
+++ b/examples/common/sse/port_group.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016 Intel Corporation.
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _PORT_GROUP_H_
+#define _PORT_GROUP_H_
+
+#include "pkt_group.h"
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destination ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1,
+		 __m128i dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+
+	dp1 = _mm_cmpeq_epi16(dp1, dp2);
+	dp1 = _mm_unpacklo_epi16(dp1, dp1);
+	v = _mm_movemask_ps((__m128)dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+#endif /* _PORT_GROUP_H_ */
diff --git a/examples/l3fwd/Makefile b/examples/l3fwd/Makefile
index 8efe6378e2..8dbe85c2e6 100644
--- a/examples/l3fwd/Makefile
+++ b/examples/l3fwd/Makefile
@@ -22,6 +22,7 @@ shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 # Added for 'rte_eth_link_to_str()'
@@ -38,10 +39,10 @@ endif
 endif

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 8a52c90755..40b5f32a9e 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -44,8 +44,6 @@
 /* Used to mark destination port as 'invalid'. */
 #define	BAD_PORT ((uint16_t)-1)

-#define FWDSTEP	4
-
 /* replace first 12B of the ethernet header. */
 #define	MASK_ETH 0x3f

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 88fb41843b..87018f5dbe 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -8,6 +8,7 @@
 #define _L3FWD_ALTIVEC_H_

 #include "l3fwd.h"
+#include "altivec/port_group.h"
 #include "l3fwd_common.h"

 /*
@@ -82,42 +83,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
 			&dst_port[3], pkt[3]->packet_type);
 }

-/*
- * Group consecutive packets with the same destination port in bursts of 4.
- * Suppose we have array of destination ports:
- * dst_port[] = {a, b, c, d,, e, ... }
- * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
- * We doing 4 comparisons at once and the result is 4 bit mask.
- * This mask is used as an index into prebuild array of pnum values.
- */
-static inline uint16_t *
-port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
-		__vector unsigned short dp1,
-		__vector unsigned short dp2)
-{
-	union {
-		uint16_t u16[FWDSTEP + 1];
-		uint64_t u64;
-	} *pnum = (void *)pn;
-
-	int32_t v;
-
-	v = vec_any_eq(dp1, dp2);
-
-
-	/* update last port counter. */
-	lp[0] += gptbl[v].lpv;
-
-	/* if dest port value has changed. */
-	if (v != GRPMSK) {
-		pnum->u64 = gptbl[v].pnum;
-		pnum->u16[FWDSTEP] = 1;
-		lp = pnum->u16 + gptbl[v].idx;
-	}
-
-	return lp;
-}
-
 /**
  * Process one packet:
  * Update source and destination MAC addresses in the ethernet header.
diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
index 8e4c27218f..224b1c08e8 100644
--- a/examples/l3fwd/l3fwd_common.h
+++ b/examples/l3fwd/l3fwd_common.h
@@ -7,6 +7,8 @@
 #ifndef _L3FWD_COMMON_H_
 #define _L3FWD_COMMON_H_

+#include "pkt_group.h"
+
 #ifdef DO_RFC_1812_CHECKS

 #define	IPV4_MIN_VER_IHL	0x45
@@ -50,133 +52,6 @@ rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
 #define	rfc1812_process(mb, dp, ptype)	do { } while (0)
 #endif /* DO_RFC_1812_CHECKS */

-/*
- * We group consecutive packets with the same destination port into one burst.
- * To avoid extra latency this is done together with some other packet
- * processing, but after we made a final decision about packet's destination.
- * To do this we maintain:
- * pnum - array of number of consecutive packets with the same dest port for
- * each packet in the input burst.
- * lp - pointer to the last updated element in the pnum.
- * dlp - dest port value lp corresponds to.
- */
-
-#define	GRPSZ	(1 << FWDSTEP)
-#define	GRPMSK	(GRPSZ - 1)
-
-#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
-	if (likely((dlp) == (dcp)[(idx)])) {             \
-		(lp)[0]++;                                   \
-	} else {                                         \
-		(dlp) = (dcp)[idx];                          \
-		(lp) = (pn) + (idx);                         \
-		(lp)[0] = 1;                                 \
-	}                                                \
-} while (0)
-
-static const struct {
-	uint64_t pnum; /* prebuild 4 values for pnum[]. */
-	int32_t  idx;  /* index for new last updated element. */
-	uint16_t lpv;  /* add value to the last updated element. */
-} gptbl[GRPSZ] = {
-	{
-		/* 0: a != b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 1: a == b, b != c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 2: a != b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 3: a == b, b == c, c != d, d != e */
-		.pnum = UINT64_C(0x0001000100020003),
-		.idx = 4,
-		.lpv = 2,
-	},
-	{
-		/* 4: a != b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 5: a == b, b != c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200010002),
-		.idx = 4,
-		.lpv = 1,
-	},
-	{
-		/* 6: a != b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030001),
-		.idx = 4,
-		.lpv = 0,
-	},
-	{
-		/* 7: a == b, b == c, c == d, d != e */
-		.pnum = UINT64_C(0x0001000200030004),
-		.idx = 4,
-		.lpv = 3,
-	},
-	{
-		/* 8: a != b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 9: a == b, b != c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100010002),
-		.idx = 3,
-		.lpv = 1,
-	},
-	{
-		/* 0xa: a != b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020001),
-		.idx = 3,
-		.lpv = 0,
-	},
-	{
-		/* 0xb: a == b, b == c, c != d, d == e */
-		.pnum = UINT64_C(0x0002000100020003),
-		.idx = 3,
-		.lpv = 2,
-	},
-	{
-		/* 0xc: a != b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010001),
-		.idx = 2,
-		.lpv = 0,
-	},
-	{
-		/* 0xd: a == b, b != c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300010002),
-		.idx = 2,
-		.lpv = 1,
-	},
-	{
-		/* 0xe: a != b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040001),
-		.idx = 1,
-		.lpv = 0,
-	},
-	{
-		/* 0xf: a == b, b == c, c == d, d == e */
-		.pnum = UINT64_C(0x0002000300040005),
-		.idx = 0,
-		.lpv = 4,
-	},
-};
-
 static __rte_always_inline void
 send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
 		uint32_t num)
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index e3d33a5229..ce515e0bc4 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -7,6 +7,7 @@
 #define _L3FWD_NEON_H_

 #include "l3fwd.h"
+#include "neon/port_group.h"
 #include "l3fwd_common.h"

 /*
@@ -62,44 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
 			&dst_port[3], pkt[3]->packet_type);
 }

-/*
- * Group consecutive packets with the same destination port in bursts of 4.
- * Suppose we have array of destination ports:
- * dst_port[] = {a, b, c, d,, e, ... }
- * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
- * We doing 4 comparisons at once and the result is 4 bit mask.
- * This mask is used as an index into prebuild array of pnum values.
- */
-static inline uint16_t *
-port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
-	     uint16x8_t dp2)
-{
-	union {
-		uint16_t u16[FWDSTEP + 1];
-		uint64_t u64;
-	} *pnum = (void *)pn;
-
-	int32_t v;
-	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
-
-	dp1 = vceqq_u16(dp1, dp2);
-	dp1 = vandq_u16(dp1, mask);
-	v = vaddvq_u16(dp1);
-
-	/* update last port counter. */
-	lp[0] += gptbl[v].lpv;
-	rte_compiler_barrier();
-
-	/* if dest port value has changed. */
-	if (v != GRPMSK) {
-		pnum->u64 = gptbl[v].pnum;
-		pnum->u16[FWDSTEP] = 1;
-		lp = pnum->u16 + gptbl[v].idx;
-	}
-
-	return lp;
-}
-
 /**
  * Process one packet:
  * Update source and destination MAC addresses in the ethernet header.
diff --git a/examples/l3fwd/l3fwd_sse.h b/examples/l3fwd/l3fwd_sse.h
index d5a717e18c..0f0d0323a2 100644
--- a/examples/l3fwd/l3fwd_sse.h
+++ b/examples/l3fwd/l3fwd_sse.h
@@ -7,6 +7,7 @@
 #define _L3FWD_SSE_H_

 #include "l3fwd.h"
+#include "sse/port_group.h"
 #include "l3fwd_common.h"

 /*
@@ -62,41 +63,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
 			&dst_port[3], pkt[3]->packet_type);
 }

-/*
- * Group consecutive packets with the same destination port in bursts of 4.
- * Suppose we have array of destination ports:
- * dst_port[] = {a, b, c, d,, e, ... }
- * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
- * We doing 4 comparisons at once and the result is 4 bit mask.
- * This mask is used as an index into prebuild array of pnum values.
- */
-static inline uint16_t *
-port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, __m128i dp1, __m128i dp2)
-{
-	union {
-		uint16_t u16[FWDSTEP + 1];
-		uint64_t u64;
-	} *pnum = (void *)pn;
-
-	int32_t v;
-
-	dp1 = _mm_cmpeq_epi16(dp1, dp2);
-	dp1 = _mm_unpacklo_epi16(dp1, dp1);
-	v = _mm_movemask_ps((__m128)dp1);
-
-	/* update last port counter. */
-	lp[0] += gptbl[v].lpv;
-
-	/* if dest port value has changed. */
-	if (v != GRPMSK) {
-		pnum->u64 = gptbl[v].pnum;
-		pnum->u16[FWDSTEP] = 1;
-		lp = pnum->u16 + gptbl[v].idx;
-	}
-
-	return lp;
-}
-
 /**
  * Process one packet:
  * Update source and destination MAC addresses in the ethernet header.
diff --git a/examples/meson.build b/examples/meson.build
index 78de0e1f37..81e93799f2 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -97,7 +97,7 @@ foreach example: examples
     ldflags = default_ldflags

     ext_deps = []
-    includes = [include_directories(example)]
+    includes = [include_directories(example, 'common')]
     deps = ['eal', 'mempool', 'net', 'mbuf', 'ethdev', 'cmdline']
     subdir(example)

--
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
@ 2022-06-23  9:38   ` Rahul Bhansali
  2022-06-26 19:00   ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-06-23  9:38 UTC (permalink / raw)
  To: dev, Radu Nicolau, Akhil Goyal, Ruifeng Wang
  Cc: jerinj, konstantin.v.ananyev, Rahul Bhansali

This adds the support of NEON based lpm lookup along with
multi packet processing for burst send in packets routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by ~8% and inbound performance increased by ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
Acked-by: Akhil Goyal <gakhil@marvell.com>
---
Changes in v3: updated port group header file name for Neon.

Changes in v2: Removed Neon packet grouping function and used
the common one.

 examples/ipsec-secgw/Makefile         |   5 +-
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 5 files changed, 571 insertions(+), 2 deletions(-)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h

diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
index 89af54bd37..ffe232774d 100644
--- a/examples/ipsec-secgw/Makefile
+++ b/examples/ipsec-secgw/Makefile
@@ -36,6 +36,7 @@ shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
@@ -53,10 +54,10 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -Wno-address-of-packed-member

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 4d8a4a71b8..b650668305 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@
 #include "parser.h"
 #include "sad.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;

 #define MAX_JUMBO_PKT_LEN  9600
@@ -100,6 +104,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };

+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];

 #define CMD_LINE_OPT_CONFIG		"config"
@@ -568,9 +578,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}

+#if defined __ARM_NEON
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }

 static inline void
@@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;

 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }

@@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));

 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");

diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process single packets for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	int32_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets will
+	 * have port ID in the SA
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			dst_port[i] = hop[lpm_pkts++];
+		}
+		if (dst_port[i] == -1)
+			dst_port[i] = BAD_PORT;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..4bda0b1a25
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+#include "neon/port_group.h"
+
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				(p[i] + RTE_ETHER_HDR_LEN + 1);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* Large packet is present, need to send
+				 * individual packets with fragment
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}

+#if defined __ARM_NEON
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }
--
2.25.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
  2022-06-23  9:38   ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
@ 2022-06-26 19:00   ` Konstantin Ananyev
  2022-06-28  8:54     ` [EXT] " Akhil Goyal
  2022-07-03 21:40   ` Thomas Monjalon
  2022-07-04 14:48   ` Thomas Monjalon
  3 siblings, 1 reply; 26+ messages in thread
From: Konstantin Ananyev @ 2022-06-26 19:00 UTC (permalink / raw)
  To: Rahul Bhansali, dev, David Christensen, Ruifeng Wang, Bruce Richardson
  Cc: jerinj, gakhil

23/06/2022 10:38, Rahul Bhansali writes:
> This will make the packet grouping function common, so
> that other examples can utilize as per need.
> 
> For each architecture sse/neon/altivec, port group
> headers will be created under examples/common/<arch>.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v3: Created common port-group headers for
> architectures sse/neon/altivec as suggested by Konstantin.
> 
> Changes in v2: New patch to address review comment.
> 


Tested-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>



> 
> --
> 2.25.1
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-06-26 19:00   ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
@ 2022-06-28  8:54     ` Akhil Goyal
  0 siblings, 0 replies; 26+ messages in thread
From: Akhil Goyal @ 2022-06-28  8:54 UTC (permalink / raw)
  To: Konstantin Ananyev, Rahul Bhansali, dev, David Christensen,
	Ruifeng Wang, Bruce Richardson
  Cc: Jerin Jacob Kollanukkaran

> 23/06/2022 10:38, Rahul Bhansali writes:
> > This will make the packet grouping function common, so
> > that other examples can utilize as per need.
> >
> > For each architecture sse/neon/altivec, port group
> > headers will be created under examples/common/<arch>.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> > Changes in v3: Created common port-group headers for
> > architectures sse/neon/altivec as suggested by Konstantin.
> >
> > Changes in v2: New patch to address review comment.
> >
> 
> 
> Tested-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
Series Applied to dpdk-next-crypto

Thanks.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
  2022-06-23  9:38   ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
  2022-06-26 19:00   ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
@ 2022-07-03 21:40   ` Thomas Monjalon
  2022-07-04 12:49     ` [EXT] " Rahul Bhansali
  2022-07-04 14:48   ` Thomas Monjalon
  3 siblings, 1 reply; 26+ messages in thread
From: Thomas Monjalon @ 2022-07-03 21:40 UTC (permalink / raw)
  To: Rahul Bhansali
  Cc: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev, jerinj, gakhil, david.marchand

23/06/2022 11:38, Rahul Bhansali:
> This will make the packet grouping function common, so
> that other examples can utilize as per need.
> 
> For each architecture sse/neon/altivec, port group
> headers will be created under examples/common/<arch>.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v3: Created common port-group headers for
> architectures sse/neon/altivec as suggested by Konstantin.
> 
> Changes in v2: New patch to address review comment.
> 
>  examples/common/altivec/port_group.h |  48 +++++++++
>  examples/common/neon/port_group.h    |  50 ++++++++++
>  examples/common/pkt_group.h          | 139 +++++++++++++++++++++++++++
>  examples/common/sse/port_group.h     |  47 +++++++++
>  examples/l3fwd/Makefile              |   5 +-
>  examples/l3fwd/l3fwd.h               |   2 -
>  examples/l3fwd/l3fwd_altivec.h       |  37 +------
>  examples/l3fwd/l3fwd_common.h        | 129 +------------------------
>  examples/l3fwd/l3fwd_neon.h          |  39 +-------
>  examples/l3fwd/l3fwd_sse.h           |  36 +------
>  examples/meson.build                 |   2 +-

OK you move code from l3fwd to another place.
That's probably a step in the right direction.
What about taking the extra step of making it an EAL API?





^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-07-03 21:40   ` Thomas Monjalon
@ 2022-07-04 12:49     ` Rahul Bhansali
  2022-07-04 14:04       ` Thomas Monjalon
  0 siblings, 1 reply; 26+ messages in thread
From: Rahul Bhansali @ 2022-07-04 12:49 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev, Jerin Jacob Kollanukkaran, Akhil Goyal,
	david.marchand

Hi,

> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Monday, July 4, 2022 3:10 AM
> To: Rahul Bhansali <rbhansali@marvell.com>
> Cc: dev@dpdk.org; David Christensen <drc@linux.vnet.ibm.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>; Bruce Richardson <bruce.richardson@intel.com>;
> Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; Akhil Goyal <gakhil@marvell.com>;
> david.marchand@redhat.com
> Subject: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group
> functionality
> 
> External Email
> 
> ----------------------------------------------------------------------
> 23/06/2022 11:38, Rahul Bhansali:
> > This will make the packet grouping function common, so that other
> > examples can utilize as per need.
> >
> > For each architecture sse/neon/altivec, port group headers will be
> > created under examples/common/<arch>.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> > Changes in v3: Created common port-group headers for architectures
> > sse/neon/altivec as suggested by Konstantin.
> >
> > Changes in v2: New patch to address review comment.
> >
> >  examples/common/altivec/port_group.h |  48 +++++++++
> >  examples/common/neon/port_group.h    |  50 ++++++++++
> >  examples/common/pkt_group.h          | 139 +++++++++++++++++++++++++++
> >  examples/common/sse/port_group.h     |  47 +++++++++
> >  examples/l3fwd/Makefile              |   5 +-
> >  examples/l3fwd/l3fwd.h               |   2 -
> >  examples/l3fwd/l3fwd_altivec.h       |  37 +------
> >  examples/l3fwd/l3fwd_common.h        | 129 +------------------------
> >  examples/l3fwd/l3fwd_neon.h          |  39 +-------
> >  examples/l3fwd/l3fwd_sse.h           |  36 +------
> >  examples/meson.build                 |   2 +-
> 
> OK you move code from l3fwd to another place.
> That's probably a step in the right direction.
> What about taking the extra step of making it an EAL API?
Thanks for the suggestion. These changes are specific to the fast path, and I think EAL is more focused on the control path (correct me if I am wrong).
Instead of an EAL API, we could have it in a library, but currently these changes are too few to form a library.
Later, if we can identify more such common APIs, we can form a library around these specific things, so that more examples/apps/libraries can use it.
Please suggest if this makes sense.

> 
> 
> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-07-04 12:49     ` [EXT] " Rahul Bhansali
@ 2022-07-04 14:04       ` Thomas Monjalon
  0 siblings, 0 replies; 26+ messages in thread
From: Thomas Monjalon @ 2022-07-04 14:04 UTC (permalink / raw)
  To: Rahul Bhansali
  Cc: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev, Jerin Jacob Kollanukkaran, Akhil Goyal,
	david.marchand

04/07/2022 14:49, Rahul Bhansali:
> From: Thomas Monjalon <thomas@monjalon.net>
> > 23/06/2022 11:38, Rahul Bhansali:
> > > This will make the packet grouping function common, so that other
> > > examples can utilize as per need.
> > >
> > > For each architecture sse/neon/altivec, port group headers will be
> > > created under examples/common/<arch>.
> > >
> > > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > > ---
> > > Changes in v3: Created common port-group headers for architectures
> > > sse/neon/altivec as suggested by Konstantin.
> > >
> > > Changes in v2: New patch to address review comment.
> > >
> > >  examples/common/altivec/port_group.h |  48 +++++++++
> > >  examples/common/neon/port_group.h    |  50 ++++++++++
> > >  examples/common/pkt_group.h          | 139 +++++++++++++++++++++++++++
> > >  examples/common/sse/port_group.h     |  47 +++++++++
> > >  examples/l3fwd/Makefile              |   5 +-
> > >  examples/l3fwd/l3fwd.h               |   2 -
> > >  examples/l3fwd/l3fwd_altivec.h       |  37 +------
> > >  examples/l3fwd/l3fwd_common.h        | 129 +------------------------
> > >  examples/l3fwd/l3fwd_neon.h          |  39 +-------
> > >  examples/l3fwd/l3fwd_sse.h           |  36 +------
> > >  examples/meson.build                 |   2 +-
> > 
> > OK you move code from l3fwd to another place.
> > That's probably a step in the right direction.
> > What about taking the extra step of making it an EAL API?
> 
> Thanks for the suggestion. These changes are specific to the fast path, and I think EAL is more focused on the control path (correct me if I am wrong).

No, EAL is just a set of basic functions.
Locks, time counters, bit ops are examples of EAL functions
which can be used in data path.

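(Illustrative aside, not part of the thread: a few existing EAL calls of the kinds mentioned above, all usable in fast-path code.)

#include <rte_common.h>
#include <rte_cycles.h>
#include <rte_spinlock.h>

static rte_spinlock_t sl = RTE_SPINLOCK_INITIALIZER;

static inline uint64_t
eal_fast_path_examples(void)
{
	uint64_t cycles;

	rte_spinlock_lock(&sl);           /* locks              */
	cycles = rte_rdtsc();             /* TSC time counter   */
	rte_spinlock_unlock(&sl);

	return cycles + rte_bsf32(0x10);  /* bit ops: returns 4 */
}
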
> Instead of an EAL API, we could have it in a library, but currently these changes are too few to form a library.
> Later, if we can identify more such common APIs, we can form a library around these specific things, so that more examples/apps/libraries can use it.
> Please suggest if this makes sense.

These are just computations, it can be a file in EAL.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
                     ` (2 preceding siblings ...)
  2022-07-03 21:40   ` Thomas Monjalon
@ 2022-07-04 14:48   ` Thomas Monjalon
  2022-07-05 16:11     ` [EXT] " Rahul Bhansali
  3 siblings, 1 reply; 26+ messages in thread
From: Thomas Monjalon @ 2022-07-04 14:48 UTC (permalink / raw)
  To: Rahul Bhansali
  Cc: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev, jerinj, gakhil

23/06/2022 11:38, Rahul Bhansali:
> +#ifndef _PORT_GROUP_H_
> +#define _PORT_GROUP_H_

No need of underscores at begin and end.

> +
> +#include "pkt_group.h"
> +
> +/*
> + * Group consecutive packets with the same destination port in bursts of 4.
> + * Suppose we have array of destination ports:
> + * dst_port[] = {a, b, c, d,, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We doing 4 comparisons at once and the result is 4 bit mask.
> + * This mask is used as an index into prebuild array of pnum values.
> + */

This explanation is not clear to me.

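(Illustrative aside, not part of the thread: a worked example of what the quoted comment describes, assuming dst_port[] = {2, 2, 2, 2, 5, ...}.)

/*
 * dp1 = <d0, d1, d2, d3> = <2, 2, 2, 2>
 * dp2 = <d1, d2, d3, d4> = <2, 2, 2, 5>
 *
 * Comparing the two vectors lane by lane tests d0==d1, d1==d2, d2==d3 and
 * d3==d4; the results are packed into the 4-bit mask v = 0b0111 = 0x7.
 *
 * gptbl[0x7] = { .pnum = 0x0001000200030004, .idx = 4, .lpv = 3 }:
 *  - pnum, read as four 16-bit lanes from the low end, is {4, 3, 2, 1}:
 *    packet 0 starts a run of 4 packets to the same destination port,
 *    packet 1 a run of 3, and so on;
 *  - lpv (3) is added via lp[0] to the run counter left open before this
 *    window; that counter already includes the current window's first
 *    packet (here the pnum[0] = 1 seed), so lpv adds packets 1-3;
 *  - because d3 != d4, lp moves to pnum[idx] = pnum[4], which is seeded
 *    with 1 for the run starting at packet 4.
 */
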
> +static inline uint16_t *
> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,

array parameter is not standard, you should make it a simple pointer

> +	     __vector unsigned short dp1,
> +	     __vector unsigned short dp2)


longer parameter names would help

[...]
> --- a/examples/l3fwd/Makefile
> +++ b/examples/l3fwd/Makefile
> +INCLUDES =-I../common
>  PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
>  CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
>  # Added for 'rte_eth_link_to_str()'
> @@ -38,10 +39,10 @@ endif
>  endif
> 
>  build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> 
>  build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

No need to introduce INCLUDES, you can expand CFLAGS.

I will fix this last one while pulling.
Please work on better names and explanations for an EAL integration.




^ permalink raw reply	[flat|nested] 26+ messages in thread

* RE: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group functionality
  2022-07-04 14:48   ` Thomas Monjalon
@ 2022-07-05 16:11     ` Rahul Bhansali
  0 siblings, 0 replies; 26+ messages in thread
From: Rahul Bhansali @ 2022-07-05 16:11 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: dev, David Christensen, Ruifeng Wang, Bruce Richardson,
	Konstantin Ananyev, Jerin Jacob Kollanukkaran, Akhil Goyal



> -----Original Message-----
> From: Thomas Monjalon <thomas@monjalon.net>
> Sent: Monday, July 4, 2022 8:18 PM
> To: Rahul Bhansali <rbhansali@marvell.com>
> Cc: dev@dpdk.org; David Christensen <drc@linux.vnet.ibm.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>; Bruce Richardson <bruce.richardson@intel.com>;
> Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; Akhil Goyal <gakhil@marvell.com>
> Subject: [EXT] Re: [PATCH v3 1/2] examples/l3fwd: common packet group
> functionality
> 
> External Email
> 
> ----------------------------------------------------------------------
> 23/06/2022 11:38, Rahul Bhansali:
> > +#ifndef _PORT_GROUP_H_
> > +#define _PORT_GROUP_H_
> 
> No need of underscores at begin and end.
> 
> > +
> > +#include "pkt_group.h"
> > +
> > +/*
> > + * Group consecutive packets with the same destination port in bursts of 4.
> > + * Suppose we have array of destination ports:
> > + * dst_port[] = {a, b, c, d,, e, ... }
> > + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> > + * We doing 4 comparisons at once and the result is 4 bit mask.
> > + * This mask is used as an index into prebuild array of pnum values.
> > + */
> 
> This explanation is not clear to me.
> 
> > +static inline uint16_t *
> > +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
> 
> array parameter is not standard, you should make it a simple pointer
> 
> > +	     __vector unsigned short dp1,
> > +	     __vector unsigned short dp2)
> 
> 
> longer parameter names would help
> 
> [...]
> > --- a/examples/l3fwd/Makefile
> > +++ b/examples/l3fwd/Makefile
> > +INCLUDES =-I../common
> >  PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)  CFLAGS +=
> > -O3 $(shell $(PKGCONF) --cflags libdpdk)  # Added for
> > 'rte_eth_link_to_str()'
> > @@ -38,10 +39,10 @@ endif
> >  endif
> >
> >  build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> > -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> > +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> > +$(LDFLAGS_SHARED)
> >
> >  build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> > -	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> > +	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS)
> > +$(LDFLAGS_STATIC)
> 
> No need to introduce INCLUDES, you can expand CFLAGS.
> 
> I will fix this last one while pulling.
> Please work on better names and explanations for an EAL integration.
> 
Ack, will make changes for an EAL integration.

> 


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2022-07-05 16:12 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-24  9:57 [PATCH] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-05-24 23:00 ` Konstantin Ananyev
2022-05-25 11:03   ` [EXT] " Rahul Bhansali
2022-05-27 11:44     ` Konstantin Ananyev
2022-06-17  7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-17  7:42   ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-17  7:51     ` Rahul Bhansali
2022-06-21 12:55     ` Akhil Goyal
2022-06-23  8:46     ` Zhang, Roy Fan
2022-06-23  9:37       ` Rahul Bhansali
2022-06-17  7:50   ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-20 23:13     ` Konstantin Ananyev
2022-06-21 16:50       ` [EXT] " Rahul Bhansali
2022-06-22 23:25         ` Konstantin Ananyev
2022-06-20  7:49   ` [EXT] " Akhil Goyal
2022-06-20 10:45     ` Thomas Monjalon
2022-06-21 12:56     ` Akhil Goyal
2022-06-23  9:38 ` [PATCH v3 " Rahul Bhansali
2022-06-23  9:38   ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-26 19:00   ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
2022-06-28  8:54     ` [EXT] " Akhil Goyal
2022-07-03 21:40   ` Thomas Monjalon
2022-07-04 12:49     ` [EXT] " Rahul Bhansali
2022-07-04 14:04       ` Thomas Monjalon
2022-07-04 14:48   ` Thomas Monjalon
2022-07-05 16:11     ` [EXT] " Rahul Bhansali
