From: Rahul Bhansali <rbhansali@marvell.com>
To: <dev@dpdk.org>, Radu Nicolau <radu.nicolau@intel.com>,
Akhil Goyal <gakhil@marvell.com>,
Ruifeng Wang <ruifeng.wang@arm.com>
Cc: <jerinj@marvell.com>, Rahul Bhansali <rbhansali@marvell.com>
Subject: [PATCH] examples/ipsec-secgw: add support of NEON with poll mode
Date: Tue, 24 May 2022 15:27:17 +0530 [thread overview]
Message-ID: <20220524095717.3875284-1-rbhansali@marvell.com> (raw)
This adds support for NEON-based LPM lookup along with
multi-packet processing for burst send in packet routing.
Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by up to ~8% and inbound performance increased by
up to ~6%.
Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
examples/ipsec-secgw/ipsec-secgw.c | 25 ++
examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
examples/ipsec-secgw/ipsec_neon.h | 487 ++++++++++++++++++++++++++
examples/ipsec-secgw/ipsec_worker.c | 9 +
4 files changed, 734 insertions(+)
create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
create mode 100644 examples/ipsec-secgw/ipsec_neon.h
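
A short illustrative sketch (not part of the patch) of the idea the new
headers build on: gather four IPv4 destination addresses into one NEON
register and resolve all four next hops with a single
rte_lpm_lookupx4() call. The helper name lookup_x4_sketch below is
hypothetical; the patch itself does this in processx4_step1() and
processx4_step2() of ipsec_lpm_neon.h, with extra handling for inline
protocol packets whose next hop comes from the SA instead.

#include <stdint.h>
#include <arm_neon.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_lpm.h>
#include <rte_mbuf.h>

/* Sketch only: assumes the Ethernet header is already prepended and
 * ignores the inline-offload special case handled by the patch.
 */
static inline void
lookup_x4_sketch(struct rte_lpm *lpm, struct rte_mbuf *pkt[4],
                 uint32_t hop[4])
{
    int32x4_t dip;
    int32_t dst[4];
    int i;

    for (i = 0; i < 4; i++) {
        struct rte_ipv4_hdr *ip;

        ip = rte_pktmbuf_mtod_offset(pkt[i], struct rte_ipv4_hdr *,
                                     RTE_ETHER_HDR_LEN);
        dst[i] = ip->dst_addr; /* big-endian on the wire */
    }

    /* Byte-swap all four addresses at once, then do one vector
     * lookup; misses are filled with the default value UINT32_MAX.
     */
    dip = vld1q_s32(dst);
    dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
    rte_lpm_lookupx4(lpm, dip, hop, UINT32_MAX);
}

On the TX side the patch reuses the l3fwd-style port grouping:
destination ports of consecutive packets are compared with one NEON
vceqq_u16(), the resulting 4-bit mask indexes the prebuilt pnum table,
and each run of packets bound for the same port is flushed with a
single burst. The NEON path is compiled in automatically whenever
__ARM_NEON is defined, as the #if blocks in ipsec-secgw.c and
ipsec_worker.c below show.
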
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 25255e053c..038c4669f5 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@
#include "parser.h"
#include "sad.h"
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
volatile bool force_quit;
#define MAX_JUMBO_PKT_LEN 9600
@@ -96,6 +100,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
};
+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
#define CMD_LINE_OPT_CONFIG "config"
@@ -561,9 +571,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
process_pkts_outbound(&qconf->outbound, &traffic);
}
+#if defined __ARM_NEON
+ /* Neon optimized packet routing */
+ route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+ qconf->outbound.ipv4_offloads, true);
+ route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
qconf->outbound.ipv4_offloads, true);
route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
}
static inline void
@@ -1390,6 +1407,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
return -EINVAL;
ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+ rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+ (struct rte_ether_addr *)(val_eth + port));
return 0;
}
@@ -1852,6 +1871,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
portid, rte_strerror(-ret));
ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+ rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+ (struct rte_ether_addr *)(val_eth + portid));
+ rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+ (struct rte_ether_addr *)(val_eth + portid) + 1);
+
print_ethaddr("Address: ", &ethaddr);
printf("\n");
diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Prepend ethernet header and read destination IPv4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+ uint64_t *inline_flag)
+{
+ struct rte_ipv4_hdr *ipv4_hdr;
+ struct rte_ether_hdr *eth_hdr;
+ int32_t dst[FWDSTEP];
+ int i;
+
+ for (i = 0; i < FWDSTEP; i++) {
+ eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+ RTE_ETHER_HDR_LEN);
+ pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+ pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+ ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+ /* Fetch destination IPv4 address */
+ dst[i] = ipv4_hdr->dst_addr;
+ *inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+ }
+
+ dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+ struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+ uint32_t next_hop;
+ rte_xmm_t dst;
+ uint8_t i;
+
+ dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+ /* If all 4 packets are non-inline */
+ if (!inline_flag) {
+ rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+ BAD_PORT);
+ /* get rid of the unused upper 16 bits of each dport. */
+ vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+ return;
+ }
+
+ /* Inline and non-inline packets */
+ dst.x = dip;
+ for (i = 0; i < FWDSTEP; i++) {
+ if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+ next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+ dprt[i] = (uint16_t) (((next_hop &
+ RTE_LPM_LOOKUP_SUCCESS) != 0)
+ ? next_hop : BAD_PORT);
+
+ } else {
+ dprt[i] = (uint16_t) ((rte_lpm_lookup(
+ (struct rte_lpm *)rt_ctx,
+ dst.u32[i], &next_hop) == 0)
+ ? next_hop : BAD_PORT);
+ }
+ }
+}
+
+/*
+ * Process a single packet for destination port lookup.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+ uint16_t *dst_port)
+{
+ struct rte_ether_hdr *eth_hdr;
+ struct rte_ipv4_hdr *ipv4_hdr;
+ uint32_t next_hop;
+ uint32_t dst_ip;
+
+ eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+ RTE_ETHER_HDR_LEN);
+ pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+ pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+ if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+ next_hop = get_hop_for_offload_pkt(pkt, 0);
+ *dst_port = (uint16_t) (((next_hop &
+ RTE_LPM_LOOKUP_SUCCESS) != 0)
+ ? next_hop : BAD_PORT);
+ } else {
+ ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+ dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+ *dst_port = (uint16_t) ((rte_lpm_lookup(
+ (struct rte_lpm *)rt_ctx,
+ dst_ip, &next_hop) == 0)
+ ? next_hop : BAD_PORT);
+ }
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+ uint8_t dst_ip6[MAX_PKT_BURST][16];
+ int32_t dst_port[MAX_PKT_BURST];
+ struct rte_ether_hdr *eth_hdr;
+ struct rte_ipv6_hdr *ipv6_hdr;
+ int32_t hop[MAX_PKT_BURST];
+ struct rte_mbuf *pkt;
+ uint8_t lpm_pkts = 0;
+ int32_t i;
+
+ if (nb_rx == 0)
+ return;
+
+ /* Need to do an LPM lookup for non-inline packets. Inline packets will
+ * have port ID in the SA
+ */
+
+ for (i = 0; i < nb_rx; i++) {
+ pkt = pkts[i];
+ eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+ RTE_ETHER_HDR_LEN);
+ pkt->l2_len = RTE_ETHER_HDR_LEN;
+ pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+ if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+ /* Security offload not enabled. So an LPM lookup is
+ * required to get the hop
+ */
+ ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+ memcpy(&dst_ip6[lpm_pkts][0],
+ ipv6_hdr->dst_addr, 16);
+ lpm_pkts++;
+ }
+ }
+
+ rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+ hop, lpm_pkts);
+
+ lpm_pkts = 0;
+
+ for (i = 0; i < nb_rx; i++) {
+ pkt = pkts[i];
+ if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+ /* Read hop from the SA */
+ dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+ } else {
+ /* Need to use hop returned by lookup */
+ dst_port[i] = hop[lpm_pkts++];
+ }
+ if (dst_port[i] == -1)
+ dst_port[i] = BAD_PORT;
+ }
+
+ /* Send packets */
+ send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+ uint64_t tx_offloads, bool ip_cksum)
+{
+ const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+ const int32_t m = nb_rx % FWDSTEP;
+ uint16_t dst_port[MAX_PKT_BURST];
+ uint64_t inline_flag = 0;
+ int32x4_t dip;
+ int32_t i;
+
+ if (nb_rx == 0)
+ return;
+
+ for (i = 0; i != k; i += FWDSTEP) {
+ processx4_step1(&pkts[i], &dip, &inline_flag);
+ processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+ &dst_port[i]);
+ }
+
+ /* Classify last up to 3 packets one by one */
+ switch (m) {
+ case 3:
+ process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+ i++;
+ /* fallthrough */
+ case 2:
+ process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+ i++;
+ /* fallthrough */
+ case 1:
+ process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+ }
+
+ send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..39dddcd1e3
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+
+#define FWDSTEP 4
+#define MAX_TX_BURST (MAX_PKT_BURST / 2)
+#define BAD_PORT ((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we have made a final decision about the packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define GRPSZ (1 << FWDSTEP)
+#define GRPMSK (GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx) do { \
+ if (likely((dlp) == (dcp)[(idx)])) { \
+ (lp)[0]++; \
+ } else { \
+ (dlp) = (dcp)[idx]; \
+ (lp) = (pn) + (idx); \
+ (lp)[0] = 1; \
+ } \
+} while (0)
+
+static const struct {
+ uint64_t pnum; /* prebuilt 4 values for pnum[]. */
+ int32_t idx; /* index for new last updated element. */
+ uint16_t lpv; /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+ {
+ /* 0: a != b, b != c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100010001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 1: a == b, b != c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100010002),
+ .idx = 4,
+ .lpv = 1,
+ },
+ {
+ /* 2: a != b, b == c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100020001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 3: a == b, b == c, c != d, d != e */
+ .pnum = UINT64_C(0x0001000100020003),
+ .idx = 4,
+ .lpv = 2,
+ },
+ {
+ /* 4: a != b, b != c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200010001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 5: a == b, b != c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200010002),
+ .idx = 4,
+ .lpv = 1,
+ },
+ {
+ /* 6: a != b, b == c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200030001),
+ .idx = 4,
+ .lpv = 0,
+ },
+ {
+ /* 7: a == b, b == c, c == d, d != e */
+ .pnum = UINT64_C(0x0001000200030004),
+ .idx = 4,
+ .lpv = 3,
+ },
+ {
+ /* 8: a != b, b != c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100010001),
+ .idx = 3,
+ .lpv = 0,
+ },
+ {
+ /* 9: a == b, b != c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100010002),
+ .idx = 3,
+ .lpv = 1,
+ },
+ {
+ /* 0xa: a != b, b == c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100020001),
+ .idx = 3,
+ .lpv = 0,
+ },
+ {
+ /* 0xb: a == b, b == c, c != d, d == e */
+ .pnum = UINT64_C(0x0002000100020003),
+ .idx = 3,
+ .lpv = 2,
+ },
+ {
+ /* 0xc: a != b, b != c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300010001),
+ .idx = 2,
+ .lpv = 0,
+ },
+ {
+ /* 0xd: a == b, b != c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300010002),
+ .idx = 2,
+ .lpv = 1,
+ },
+ {
+ /* 0xe: a != b, b == c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300040001),
+ .idx = 1,
+ .lpv = 0,
+ },
+ {
+ /* 0xf: a == b, b == c, c == d, d == e */
+ .pnum = UINT64_C(0x0002000300040005),
+ .idx = 0,
+ .lpv = 4,
+ },
+};
+
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+ uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+ uint32x4_t te[FWDSTEP];
+ uint32x4_t ve[FWDSTEP];
+ uint32_t *p[FWDSTEP];
+ struct rte_mbuf *pkt;
+ uint8_t i;
+
+ for (i = 0; i < FWDSTEP; i++) {
+ pkt = pkts[i];
+
+ /* Check if it is a large packet */
+ if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+ *l_pkt |= 1;
+
+ p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+ ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+ te[i] = vld1q_u32(p[i]);
+
+ /* Update last 4 bytes */
+ ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+ vst1q_u32(p[i], ve[i]);
+
+ if (ip_cksum) {
+ struct rte_ipv4_hdr *ip;
+
+ pkt->ol_flags |= tx_offloads;
+
+ ip = (struct rte_ipv4_hdr *)
+ (p[i] + RTE_ETHER_HDR_LEN + 1);
+ ip->hdr_checksum = 0;
+
+ /* calculate IPv4 cksum in SW */
+ if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+ ip->hdr_checksum = rte_ipv4_cksum(ip);
+ }
+
+ }
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into the prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+ uint16x8_t dp2)
+{
+ union {
+ uint16_t u16[FWDSTEP + 1];
+ uint64_t u64;
+ } *pnum = (void *)pn;
+
+ uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+ int32_t v;
+
+ dp1 = vceqq_u16(dp1, dp2);
+ dp1 = vandq_u16(dp1, mask);
+ v = vaddvq_u16(dp1);
+
+ /* update last port counter. */
+ lp[0] += gptbl[v].lpv;
+ rte_compiler_barrier();
+
+ /* if dest port value has changed. */
+ if (v != GRPMSK) {
+ pnum->u64 = gptbl[v].pnum;
+ pnum->u16[FWDSTEP] = 1;
+ lp = pnum->u16 + gptbl[v].idx;
+ }
+
+ return lp;
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+ bool ip_cksum, uint8_t *l_pkt)
+{
+ struct rte_ether_hdr *eth_hdr;
+ uint32x4_t te, ve;
+
+ /* Check if it is a large packet */
+ if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+ *l_pkt |= 1;
+
+ eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+ te = vld1q_u32((uint32_t *)eth_hdr);
+ ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+ ve = vcopyq_laneq_u32(ve, 3, te, 3);
+ vst1q_u32((uint32_t *)eth_hdr, ve);
+
+ if (ip_cksum) {
+ struct rte_ipv4_hdr *ip;
+
+ pkt->ol_flags |= tx_offloads;
+
+ ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+ ip->hdr_checksum = 0;
+
+ /* calculate IPv4 cksum in SW */
+ if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+ ip->hdr_checksum = rte_ipv4_cksum(ip);
+ }
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+ uint8_t proto;
+ uint32_t i;
+
+ proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+ for (i = 0; i < num; i++)
+ send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+ unsigned int lcoreid = rte_lcore_id();
+ struct lcore_conf *qconf;
+ uint32_t len, j, n;
+
+ qconf = &lcore_conf[lcoreid];
+
+ len = qconf->tx_mbufs[port].len;
+
+ /*
+ * If TX buffer for that queue is empty, and we have enough packets,
+ * then send them straightway.
+ */
+ if (num >= MAX_TX_BURST && len == 0) {
+ n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+ core_stats_update_tx(n);
+ if (unlikely(n < num)) {
+ do {
+ rte_pktmbuf_free(m[n]);
+ } while (++n < num);
+ }
+ return;
+ }
+
+ /*
+ * Put packets into TX buffer for that queue.
+ */
+
+ n = len + num;
+ n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+ j = 0;
+ switch (n % FWDSTEP) {
+ while (j < n) {
+ case 0:
+ qconf->tx_mbufs[port].m_table[len + j] = m[j];
+ j++;
+ /* fallthrough */
+ case 3:
+ qconf->tx_mbufs[port].m_table[len + j] = m[j];
+ j++;
+ /* fallthrough */
+ case 2:
+ qconf->tx_mbufs[port].m_table[len + j] = m[j];
+ j++;
+ /* fallthrough */
+ case 1:
+ qconf->tx_mbufs[port].m_table[len + j] = m[j];
+ j++;
+ }
+ }
+
+ len += n;
+
+ /* enough pkts to be sent */
+ if (unlikely(len == MAX_PKT_BURST)) {
+
+ send_burst(qconf, MAX_PKT_BURST, port);
+
+ /* copy rest of the packets into the TX buffer. */
+ len = num - n;
+ if (len == 0)
+ goto exit;
+
+ j = 0;
+ switch (len % FWDSTEP) {
+ while (j < len) {
+ case 0:
+ qconf->tx_mbufs[port].m_table[j] = m[n + j];
+ j++;
+ /* fallthrough */
+ case 3:
+ qconf->tx_mbufs[port].m_table[j] = m[n + j];
+ j++;
+ /* fallthrough */
+ case 2:
+ qconf->tx_mbufs[port].m_table[j] = m[n + j];
+ j++;
+ /* fallthrough */
+ case 1:
+ qconf->tx_mbufs[port].m_table[j] = m[n + j];
+ j++;
+ }
+ }
+ }
+
+exit:
+ qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+ int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+ unsigned int lcoreid = rte_lcore_id();
+ uint16_t pnum[MAX_PKT_BURST + 1];
+ uint8_t l_pkt = 0;
+ uint16_t dlp, *lp;
+ int i = 0, k;
+
+ /*
+ * Finish packet processing and group consecutive
+ * packets with the same destination port.
+ */
+ k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+ if (k != 0) {
+ uint16x8_t dp1, dp2;
+
+ lp = pnum;
+ lp[0] = 1;
+
+ processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+ /* dp1: <d[0], d[1], d[2], d[3], ... > */
+ dp1 = vld1q_u16(dst_port);
+
+ for (i = FWDSTEP; i != k; i += FWDSTEP) {
+ processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+ ip_cksum, &l_pkt);
+
+ /*
+ * dp2:
+ * <d[j-3], d[j-2], d[j-1], d[j], ... >
+ */
+ dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+ lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+ /*
+ * dp1:
+ * <d[j], d[j+1], d[j+2], d[j+3], ... >
+ */
+ dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+ }
+
+ /*
+ * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+ */
+ dp2 = vextq_u16(dp1, dp1, 1);
+ dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+ lp = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+ /*
+ * remove values added by the last repeated
+ * dst port.
+ */
+ lp[0]--;
+ dlp = dst_port[i - 1];
+ } else {
+ /* set dlp and lp to the never used values. */
+ dlp = BAD_PORT - 1;
+ lp = pnum + MAX_PKT_BURST;
+ }
+
+ /* Process up to last 3 packets one by one. */
+ switch (nb_rx % FWDSTEP) {
+ case 3:
+ process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+ &l_pkt);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+ i++;
+ /* fallthrough */
+ case 2:
+ process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+ &l_pkt);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+ i++;
+ /* fallthrough */
+ case 1:
+ process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+ &l_pkt);
+ GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+ }
+
+ /*
+ * Send packets out, through destination port.
+ * Consecutive packets with the same destination port
+ * are already grouped together.
+ * If destination port for the packet equals BAD_PORT,
+ * then free the packet without sending it out.
+ */
+ for (i = 0; i < nb_rx; i += k) {
+
+ uint16_t pn;
+
+ pn = dst_port[i];
+ k = pnum[i];
+
+ if (likely(pn != BAD_PORT)) {
+ if (l_pkt)
+ /* Large packet is present, need to send
+ * individual packets with fragment
+ */
+ send_packets(pkts + i, pn, k, is_ipv4);
+ else
+ send_packetsx4(pkts + i, pn, k);
+
+ } else {
+ free_pkts(&pkts[i], k);
+ if (is_ipv4)
+ core_statistics[lcoreid].lpm4.miss++;
+ else
+ core_statistics[lcoreid].lpm6.miss++;
+ }
+ }
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@
#include "ipsec-secgw.h"
#include "ipsec_worker.h"
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
struct port_drv_mode_data {
struct rte_security_session *sess;
struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
v6_num = ip6.num;
}
+#if defined __ARM_NEON
+ route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+ route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
route4_pkts(rt4_ctx, v4, v4_num, 0, false);
route6_pkts(rt6_ctx, v6, v6_num);
+#endif
}
}
}
--
2.25.1
Thread overview: 26+ messages
2022-05-24 9:57 Rahul Bhansali [this message]
2022-05-24 23:00 ` Konstantin Ananyev
2022-05-25 11:03 ` [EXT] " Rahul Bhansali
2022-05-27 11:44 ` Konstantin Ananyev
2022-06-17 7:42 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-17 7:42 ` [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-17 7:51 ` Rahul Bhansali
2022-06-21 12:55 ` Akhil Goyal
2022-06-23 8:46 ` Zhang, Roy Fan
2022-06-23 9:37 ` Rahul Bhansali
2022-06-17 7:50 ` [PATCH v2 1/2] examples/l3fwd: common packet group functionality Rahul Bhansali
2022-06-20 23:13 ` Konstantin Ananyev
2022-06-21 16:50 ` [EXT] " Rahul Bhansali
2022-06-22 23:25 ` Konstantin Ananyev
2022-06-20 7:49 ` [EXT] " Akhil Goyal
2022-06-20 10:45 ` Thomas Monjalon
2022-06-21 12:56 ` Akhil Goyal
2022-06-23 9:38 ` [PATCH v3 " Rahul Bhansali
2022-06-23 9:38 ` [PATCH v3 2/2] examples/ipsec-secgw: add support of NEON with poll mode Rahul Bhansali
2022-06-26 19:00 ` [PATCH v3 1/2] examples/l3fwd: common packet group functionality Konstantin Ananyev
2022-06-28 8:54 ` [EXT] " Akhil Goyal
2022-07-03 21:40 ` Thomas Monjalon
2022-07-04 12:49 ` [EXT] " Rahul Bhansali
2022-07-04 14:04 ` Thomas Monjalon
2022-07-04 14:48 ` Thomas Monjalon
2022-07-05 16:11 ` [EXT] " Rahul Bhansali