DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 0/3] improve MAC swap performance.
@ 2018-11-22 17:26 Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 1/3] app/testpmd: code refactory for macswap Qi Zhang
                   ` (5 more replies)
  0 siblings, 6 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:26 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

**This patchset is targeted for 19.02**

Improve testpmd macswap performance on x86 by taking advantage
of SSE instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC,
we observe a 17.7% performance improvement for testpmd's macswap
test.

Qi Zhang (3):
  app/testpmd: code refactory for macswap
  app/testpmd: improve MAC swap performance for x86
  app/testpmd: further improve MAC swap performance for x86

 app/test-pmd/l2fwd.h          | 40 +++++++++++++++++++
 app/test-pmd/l2fwd_common.h   | 36 +++++++++++++++++
 app/test-pmd/macswap.c        | 36 +++--------------
 app/test-pmd/macswap.h        | 40 +++++++++++++++++++
 app/test-pmd/macswap_common.h | 36 +++++++++++++++++
 app/test-pmd/macswap_sse.h    | 90 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 248 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/l2fwd.h
 create mode 100644 app/test-pmd/l2fwd_common.h
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h
 create mode 100644 app/test-pmd/macswap_sse.h

-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 1/3] app/testpmd: code refactory for macswap
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
@ 2018-11-22 17:26 ` Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:26 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Move the macswap workload to a dedicated function, so that
platform-specific optimized versions can be enabled later.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/l2fwd.h          | 40 ++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/l2fwd_common.h   | 36 ++++++++++++++++++++++++++++++++++++
 app/test-pmd/macswap.c        | 32 ++------------------------------
 app/test-pmd/macswap.h        | 40 ++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/macswap_common.h | 36 ++++++++++++++++++++++++++++++++++++
 5 files changed, 154 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/l2fwd.h
 create mode 100644 app/test-pmd/l2fwd_common.h
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/l2fwd.h b/app/test-pmd/l2fwd.h
new file mode 100644
index 000000000..6fcad4d75
--- /dev/null
+++ b/app/test-pmd/l2fwd.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "l2fwd_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	struct ether_addr addr;
+	uint64_t ol_flags;
+	int i;
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		ether_addr_copy(&eth_hdr->d_addr, &addr);
+		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+		ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
diff --git a/app/test-pmd/l2fwd_common.h b/app/test-pmd/l2fwd_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/l2fwd_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+	uint64_t ol_flags = 0;
+
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+			PKT_TX_VLAN_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+			PKT_TX_QINQ_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+			PKT_TX_MACSEC : 0;
+
+	return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+		uint16_t vlan, uint16_t vlan_outer)
+{
+	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+	mb->ol_flags |= ol_flags;
+	mb->l2_len = sizeof(struct ether_hdr);
+	mb->l3_len = sizeof(struct ipv4_hdr);
+	mb->vlan_tci = vlan;
+	mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _BPF_CMD_H_ */
+
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#include "macswap.h"
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 {
 	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
 	struct rte_port  *txp;
-	struct rte_mbuf  *mb;
-	struct ether_hdr *eth_hdr;
-	struct ether_addr addr;
 	uint16_t nb_rx;
 	uint16_t nb_tx;
-	uint16_t i;
 	uint32_t retry;
-	uint64_t ol_flags = 0;
-	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 #endif
 	fs->rx_packets += nb_rx;
 	txp = &ports[fs->tx_port];
-	tx_offloads = txp->dev_conf.txmode.offloads;
-	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
-		ol_flags = PKT_TX_VLAN_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
-		ol_flags |= PKT_TX_QINQ_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
-		ol_flags |= PKT_TX_MACSEC;
-	for (i = 0; i < nb_rx; i++) {
-		if (likely(i < nb_rx - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
-						       void *));
-		mb = pkts_burst[i];
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
 
-		/* Swap dest and src mac addresses. */
-		ether_addr_copy(&eth_hdr->d_addr, &addr);
-		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
-		ether_addr_copy(&addr, &eth_hdr->s_addr);
+	do_macswap(pkts_burst, nb_rx, txp);
 
-		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
-		mb->ol_flags |= ol_flags;
-		mb->l2_len = sizeof(struct ether_hdr);
-		mb->l3_len = sizeof(struct ipv4_hdr);
-		mb->vlan_tci = txp->tx_vlan_id;
-		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
-	}
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
 	/*
 	 * Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..bc8a95626
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	struct ether_addr addr;
+	uint64_t ol_flags;
+	int i;
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		ether_addr_copy(&eth_hdr->d_addr, &addr);
+		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+		ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+	uint64_t ol_flags = 0;
+
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+			PKT_TX_VLAN_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+			PKT_TX_QINQ_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+			PKT_TX_MACSEC : 0;
+
+	return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+		uint16_t vlan, uint16_t vlan_outer)
+{
+	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+	mb->ol_flags |= ol_flags;
+	mb->l2_len = sizeof(struct ether_hdr);
+	mb->l3_len = sizeof(struct ipv4_hdr);
+	mb->vlan_tci = vlan;
+	mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 2/3] app/testpmd: improve MAC swap performance for x86
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-11-22 17:26 ` Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 3/3] app/testpmd: further " Qi Zhang
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:26 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

This patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only impacts the x86 platform.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c     |  4 ++++
 app/test-pmd/macswap_sse.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
 #include "macswap.h"
+#endif
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..d5b0f6a21
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_SSE_H_
+#define _L2FWD_SSE_H_
+
+#include "macswap_common.h"
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	uint64_t ol_flags;
+	int i;
+	__m128i addr;
+	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+					5, 4, 3, 2,
+					1, 0, 11, 10,
+					9, 8, 7, 6);
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		addr = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr = _mm_shuffle_epi8(addr, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH 3/3] app/testpmd: further improve MAC swap performance for x86
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 1/3] app/testpmd: code refactory for macswap Qi Zhang
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-11-22 17:26 ` Qi Zhang
  2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:26 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Swap the MAC addresses of four packets in the same loop iteration
to save more CPU cycles.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap_sse.h | 65 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index d5b0f6a21..0649539c2 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -10,11 +10,12 @@ static inline void
 do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		struct rte_port *txp)
 {
-	struct ether_hdr *eth_hdr;
-	struct rte_mbuf *mb;
+	struct ether_hdr *eth_hdr[4];
+	struct rte_mbuf *mb[4];
 	uint64_t ol_flags;
 	int i;
-	__m128i addr;
+	int r;
+	__m128i addr0, addr1, addr2, addr3;
 	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
 					5, 4, 3, 2,
 					1, 0, 11, 10,
@@ -22,19 +23,61 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 
 	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
 
-	for (i = 0; i < nb; i++) {
-		if (likely(i < nb - 1))
+	i = 0;
+	r = nb;
+
+	while (r >= 4) {
+		mb[0] = pkts[i++];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+		mb[1] = pkts[i++];
+		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+		mb[2] = pkts[i++];
+		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+		mb[3] = pkts[i++];
+		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+		_mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+		mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[1], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[2], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[3], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		r -= 4;
+	}
+
+	for ( ; i < nb; i++) {
+		if (i < nb - 1)
 			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
-		mb = pkts[i];
+		mb[0] = pkts[i];
 
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
-		addr = _mm_loadu_si128((__m128i *)eth_hdr);
-		addr = _mm_shuffle_epi8(addr, shfl_msk);
-		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
 
-		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+		mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
 				txp->tx_vlan_id_outer);
 	}
 }
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
                   ` (2 preceding siblings ...)
  2018-11-22 17:26 ` [dpdk-dev] [PATCH 3/3] app/testpmd: further " Qi Zhang
@ 2018-11-22 17:38 ` Qi Zhang
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
                     ` (2 more replies)
  2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
  5 siblings, 3 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:38 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

**This patchset is targeted for 19.02**

Improve testpmd macswap performance on x86 by taking advantage of SSE
instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC,
we observe a 17.7% performance improvement for testpmd's macswap test.

v2:
- remove unnecessary files

Qi Zhang (3):
  app/testpmd: code refactory for macswap
  app/testpmd: improve MAC swap performance for x86
  app/testpmd: further improve MAC swap performance for x86

 app/test-pmd/macswap.c        | 36 +++---------------
 app/test-pmd/macswap.h        | 40 ++++++++++++++++++++
 app/test-pmd/macswap_common.h | 36 ++++++++++++++++++
 app/test-pmd/macswap_sse.h    | 86 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h
 create mode 100644 app/test-pmd/macswap_sse.h

-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
@ 2018-11-22 17:38   ` Qi Zhang
  2018-12-10 17:44     ` Ferruh Yigit
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
  2 siblings, 1 reply; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:38 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Move the macswap workload to a dedicated function, so that
platform-specific optimized versions can be enabled later.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c        | 32 ++------------------------------
 app/test-pmd/macswap.h        | 40 ++++++++++++++++++++++++++++++++++++++++
 app/test-pmd/macswap_common.h | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 78 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#include "macswap.h"
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 {
 	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
 	struct rte_port  *txp;
-	struct rte_mbuf  *mb;
-	struct ether_hdr *eth_hdr;
-	struct ether_addr addr;
 	uint16_t nb_rx;
 	uint16_t nb_tx;
-	uint16_t i;
 	uint32_t retry;
-	uint64_t ol_flags = 0;
-	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 #endif
 	fs->rx_packets += nb_rx;
 	txp = &ports[fs->tx_port];
-	tx_offloads = txp->dev_conf.txmode.offloads;
-	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
-		ol_flags = PKT_TX_VLAN_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
-		ol_flags |= PKT_TX_QINQ_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
-		ol_flags |= PKT_TX_MACSEC;
-	for (i = 0; i < nb_rx; i++) {
-		if (likely(i < nb_rx - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
-						       void *));
-		mb = pkts_burst[i];
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
 
-		/* Swap dest and src mac addresses. */
-		ether_addr_copy(&eth_hdr->d_addr, &addr);
-		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
-		ether_addr_copy(&addr, &eth_hdr->s_addr);
+	do_macswap(pkts_burst, nb_rx, txp);
 
-		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
-		mb->ol_flags |= ol_flags;
-		mb->l2_len = sizeof(struct ether_hdr);
-		mb->l3_len = sizeof(struct ipv4_hdr);
-		mb->vlan_tci = txp->tx_vlan_id;
-		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
-	}
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
 	/*
 	 * Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..bc8a95626
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_H_
+#define _L2FWD_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	struct ether_addr addr;
+	uint64_t ol_flags;
+	int i;
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		ether_addr_copy(&eth_hdr->d_addr, &addr);
+		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+		ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..2c01cbc8f
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_COMMON_H_
+#define _L2FWD_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+	uint64_t ol_flags = 0;
+
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+			PKT_TX_VLAN_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+			PKT_TX_QINQ_PKT : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+			PKT_TX_MACSEC : 0;
+
+	return ol_flags;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
+		uint16_t vlan, uint16_t vlan_outer)
+{
+	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+	mb->ol_flags |= ol_flags;
+	mb->l2_len = sizeof(struct ether_hdr);
+	mb->l3_len = sizeof(struct ipv4_hdr);
+	mb->vlan_tci = vlan;
+	mb->vlan_tci_outer = vlan_outer;
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-11-22 17:38   ` Qi Zhang
  2018-12-10 17:44     ` Ferruh Yigit
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
  2 siblings, 1 reply; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:38 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

This patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only impacts the x86 platform.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c     |  4 ++++
 app/test-pmd/macswap_sse.h | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
 #include "macswap.h"
+#endif
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..d5b0f6a21
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_SSE_H_
+#define _L2FWD_SSE_H_
+
+#include "macswap_common.h"
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	uint64_t ol_flags;
+	int i;
+	__m128i addr;
+	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+					5, 4, 3, 2,
+					1, 0, 11, 10,
+					9, 8, 7, 6);
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		addr = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr = _mm_shuffle_epi8(addr, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] app/testpmd: further improve MAC swap performance for x86
  2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-11-22 17:38   ` Qi Zhang
  2 siblings, 0 replies; 30+ messages in thread
From: Qi Zhang @ 2018-11-22 17:38 UTC (permalink / raw)
  To: bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Swap the MAC addresses of four packets in the same loop iteration
to save more CPU cycles.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap_sse.h | 65 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index d5b0f6a21..0649539c2 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -10,11 +10,12 @@ static inline void
 do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		struct rte_port *txp)
 {
-	struct ether_hdr *eth_hdr;
-	struct rte_mbuf *mb;
+	struct ether_hdr *eth_hdr[4];
+	struct rte_mbuf *mb[4];
 	uint64_t ol_flags;
 	int i;
-	__m128i addr;
+	int r;
+	__m128i addr0, addr1, addr2, addr3;
 	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
 					5, 4, 3, 2,
 					1, 0, 11, 10,
@@ -22,19 +23,61 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 
 	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
 
-	for (i = 0; i < nb; i++) {
-		if (likely(i < nb - 1))
+	i = 0;
+	r = nb;
+
+	while (r >= 4) {
+		mb[0] = pkts[i++];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+		mb[1] = pkts[i++];
+		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+		mb[2] = pkts[i++];
+		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+		mb[3] = pkts[i++];
+		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
+
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+		_mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+		mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[1], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[2], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		mbuf_field_set(mb[3], ol_flags, txp->tx_vlan_id,
+				txp->tx_vlan_id_outer);
+		r -= 4;
+	}
+
+	for ( ; i < nb; i++) {
+		if (i < nb - 1)
 			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
-		mb = pkts[i];
+		mb[0] = pkts[i];
 
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
-		addr = _mm_loadu_si128((__m128i *)eth_hdr);
-		addr = _mm_shuffle_epi8(addr, shfl_msk);
-		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
 
-		mbuf_field_set(mb, ol_flags, txp->tx_vlan_id,
+		mbuf_field_set(mb[0], ol_flags, txp->tx_vlan_id,
 				txp->tx_vlan_id_outer);
 	}
 }
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-12-10 17:44     ` Ferruh Yigit
  2018-12-11  4:02       ` Zhang, Qi Z
  0 siblings, 1 reply; 30+ messages in thread
From: Ferruh Yigit @ 2018-12-10 17:44 UTC (permalink / raw)
  To: Qi Zhang, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Yongseok Koh

On 11/22/2018 5:38 PM, Qi Zhang wrote:
> Move macswap workload to dedicate function, so we can further enable
> platform specific optimized version.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

<...>

> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _L2FWD_H_
> +#define _L2FWD_H_

Looks like a copy-paste artifact; there are a few more in the patchset.

<...>

> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _L2FWD_COMMON_H_
> +#define _L2FWD_COMMON_H_
> +
> +static inline uint64_t
> +ol_flags_init(uint64_t tx_offload)
> +{
> +	uint64_t ol_flags = 0;
> +
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
> +			PKT_TX_VLAN_PKT : 0;

'PKT_TX_VLAN_PKT' is deprecated and replaced with 'PKT_TX_VLAN'. I think it is
better to keep it as it is in this patch, since it mainly copies code from one
place to another, but can you update this in a new patch in this patchset?

> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
> +			PKT_TX_QINQ_PKT : 0;

Same here, 'PKT_TX_QINQ_PKT' replaced with 'PKT_TX_QINQ'.

> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
> +			PKT_TX_MACSEC : 0;
> +
> +	return ol_flags;
> +}
> +
> +static inline void
> +mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
> +		uint16_t vlan, uint16_t vlan_outer)
> +{
> +	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;

I guess the above line is to prevent those bits from being overwritten, but with
the '|=' assignment below I think they will be preserved already; do we need the
above line? cc'ed Yongseok.

> +	mb->ol_flags |= ol_flags;
> +	mb->l2_len = sizeof(struct ether_hdr);
> +	mb->l3_len = sizeof(struct ipv4_hdr);
> +	mb->vlan_tci = vlan;
> +	mb->vlan_tci_outer = vlan_outer;

Setting 'vlan_tci' or 'vlan_tci_outer' makes sense only if 'PKT_TX_VLAN' and
'PKT_TX_QINQ' are set. Since there is already a check for them above, does it
make sense to do these assignments inside those checks, for better performance?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-12-10 17:44     ` Ferruh Yigit
  0 siblings, 0 replies; 30+ messages in thread
From: Ferruh Yigit @ 2018-12-10 17:44 UTC (permalink / raw)
  To: Qi Zhang, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger

On 11/22/2018 5:38 PM, Qi Zhang wrote:
> The patch optimizes the mac swap operation by taking advantage
> of SSE instructions, it only impacts x86 platform.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

<...>

> +
> +#include "macswap_common.h"

An empty line after the include would be good.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-12-10 17:44     ` Ferruh Yigit
@ 2018-12-11  4:02       ` Zhang, Qi Z
  0 siblings, 0 replies; 30+ messages in thread
From: Zhang, Qi Z @ 2018-12-11  4:02 UTC (permalink / raw)
  To: Yigit, Ferruh, Richardson, Bruce, Wiles, Keith, Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo, Iremonger, Bernard, Yongseok Koh



> -----Original Message-----
> From: Yigit, Ferruh
> Sent: Tuesday, December 11, 2018 1:44 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>; Ananyev,
> Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> <bernard.iremonger@intel.com>; Yongseok Koh <yskoh@mellanox.com>
> Subject: Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for
> macswap
> 
> On 11/22/2018 5:38 PM, Qi Zhang wrote:
> > Move macswap workload to dedicate function, so we can further enable
> > platform specific optimized version.
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> 
> <...>
> 
> > @@ -0,0 +1,40 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation  */
> > +
> > +#ifndef _L2FWD_H_
> > +#define _L2FWD_H_
> 
> Looks like copy-paste artifact, there are a few more in patchset.
> 
> <...>
> 
> > @@ -0,0 +1,36 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation  */
> > +
> > +#ifndef _L2FWD_COMMON_H_
> > +#define _L2FWD_COMMON_H_
> > +
> > +static inline uint64_t
> > +ol_flags_init(uint64_t tx_offload)
> > +{
> > +	uint64_t ol_flags = 0;
> > +
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
> > +			PKT_TX_VLAN_PKT : 0;
> 
> 'PKT_TX_VLAN_PKT' is deprecated and replaced with 'PKT_TX_VLAN'. I think it
> is better to keep as it is in this patch, since mainly it copies from one place to
> another, but can you update this in new patch in this patchset?

Ok, I will replace.

> 
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
> > +			PKT_TX_QINQ_PKT : 0;
> 
> Same here, 'PKT_TX_QINQ_PKT' replaced with 'PKT_TX_QINQ'.
> 
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
> > +			PKT_TX_MACSEC : 0;
> > +
> > +	return ol_flags;
> > +}
> > +
> > +static inline void
> > +mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags,
> > +		uint16_t vlan, uint16_t vlan_outer) {
> > +	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> 
> I guess above line is to prevent those bits overwritten, but with '|='
> assignment below I think they will be preserved already, do we need above line?
> cc'ed Yongseok.

I think the above line also clears the other bits besides IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF.
But I don't know whether that is necessary, so I just kept it the same way as before.

> 
> > +	mb->ol_flags |= ol_flags;
> > +	mb->l2_len = sizeof(struct ether_hdr);
> > +	mb->l3_len = sizeof(struct ipv4_hdr);
> > +	mb->vlan_tci = vlan;
> > +	mb->vlan_tci_outer = vlan_outer;
> 
> Setting 'vlan_tci' or 'vlan_tci_outer' makes sense only if 'PKT_TX_VLAN' and
> 'PKT_TX_QINQ' set, since there is already an check for them above, does it
> make sense to do these assignment in them, for better performance.

Good point, we can skip these memory writes if PKT_TX_VLAN and PKT_TX_QINQ are not set.

Thanks
Qi




^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
                   ` (3 preceding siblings ...)
  2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
@ 2018-12-11  5:55 ` Qi Zhang
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
                     ` (2 more replies)
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
  5 siblings, 3 replies; 30+ messages in thread
From: Qi Zhang @ 2018-12-11  5:55 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Improve testpmd macswap performance on x86 by taking advantage of SSE
instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC, we observe a 17.7%
performance improvement for testpmd's macswap test.

v2:

- replace PKT_TX_VLAN_PKT/PKT_TX_QINQ_PKT with PKT_TX_VLAN/PKT_TX_QINQ
- only set vlan / outer_vlan when related ol_flags is set.
- fix coding style

Qi Zhang (3):
  app/testpmd: code refactory for macswap
  app/testpmd: improve MAC swap performance for x86
  app/testpmd: further improve MAC swap performance for x86

 app/test-pmd/macswap.c        | 36 ++++---------------
 app/test-pmd/macswap.h        | 40 +++++++++++++++++++++
 app/test-pmd/macswap_common.h | 46 ++++++++++++++++++++++++
 app/test-pmd/macswap_sse.h    | 83 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 175 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h
 create mode 100644 app/test-pmd/macswap_sse.h

-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
@ 2018-12-11  5:55   ` Qi Zhang
  2018-12-11 15:48     ` Ferruh Yigit
  2018-12-14 11:14     ` Iremonger, Bernard
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
  2 siblings, 2 replies; 30+ messages in thread
From: Qi Zhang @ 2018-12-11  5:55 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Move the macswap workload to a dedicated function, so we can further enable
platform-specific optimized versions.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c        | 32 ++---------------------------
 app/test-pmd/macswap.h        | 40 ++++++++++++++++++++++++++++++++++++
 app/test-pmd/macswap_common.h | 47 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#include "macswap.h"
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 {
 	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
 	struct rte_port  *txp;
-	struct rte_mbuf  *mb;
-	struct ether_hdr *eth_hdr;
-	struct ether_addr addr;
 	uint16_t nb_rx;
 	uint16_t nb_tx;
-	uint16_t i;
 	uint32_t retry;
-	uint64_t ol_flags = 0;
-	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 #endif
 	fs->rx_packets += nb_rx;
 	txp = &ports[fs->tx_port];
-	tx_offloads = txp->dev_conf.txmode.offloads;
-	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
-		ol_flags = PKT_TX_VLAN_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
-		ol_flags |= PKT_TX_QINQ_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
-		ol_flags |= PKT_TX_MACSEC;
-	for (i = 0; i < nb_rx; i++) {
-		if (likely(i < nb_rx - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
-						       void *));
-		mb = pkts_burst[i];
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
 
-		/* Swap dest and src mac addresses. */
-		ether_addr_copy(&eth_hdr->d_addr, &addr);
-		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
-		ether_addr_copy(&addr, &eth_hdr->s_addr);
+	do_macswap(pkts_burst, nb_rx, txp);
 
-		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
-		mb->ol_flags |= ol_flags;
-		mb->l2_len = sizeof(struct ether_hdr);
-		mb->l3_len = sizeof(struct ipv4_hdr);
-		mb->vlan_tci = txp->tx_vlan_id;
-		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
-	}
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
 	/*
 	 * Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..14e665bd2
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _MACSWAP_H_
+#define _MACSWAP_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	struct ether_addr addr;
+	uint64_t ol_flags;
+	int i;
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+	vlan_qinq_set(pkts, nb, ol_flags,
+			txp->tx_vlan_id, txp->tx_vlan_id_outer);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		ether_addr_copy(&eth_hdr->d_addr, &addr);
+		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+		ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+		mbuf_field_set(mb, ol_flags);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..ab0a5b5ef
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _MACSWAP_COMMON_H_
+#define _MACSWAP_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+	uint64_t ol_flags = 0;
+
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+			PKT_TX_VLAN : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+			PKT_TX_QINQ : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+			PKT_TX_MACSEC : 0;
+
+	return ol_flags;
+}
+
+static inline void
+vlan_qinq_set(struct rte_mbuf *pkts[], uint16_t nb,
+		uint64_t ol_flags, uint16_t vlan, uint16_t outer_vlan)
+{
+	int i;
+
+	if (ol_flags & PKT_TX_VLAN)
+		for (i = 0; i < nb; i++)
+			pkts[i]->vlan_tci = vlan;
+	if (ol_flags & PKT_TX_QINQ)
+		for (i = 0; i < nb; i++)
+			pkts[i]->vlan_tci_outer = outer_vlan;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags)
+{
+	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+	mb->ol_flags |= ol_flags;
+	mb->l2_len = sizeof(struct ether_hdr);
+	mb->l3_len = sizeof(struct ipv4_hdr);
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-12-11  5:55   ` Qi Zhang
  2018-12-13 19:50     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
  2018-12-14 11:21     ` [dpdk-dev] " Iremonger, Bernard
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
  2 siblings, 2 replies; 30+ messages in thread
From: Qi Zhang @ 2018-12-11  5:55 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

The patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only impacts the x86 platform.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c        |  4 ++++
 app/test-pmd/macswap_common.h |  1 -
 app/test-pmd/macswap_sse.h    | 45 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
 #include "macswap.h"
+#endif
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
index ab0a5b5ef..8c5518596 100644
--- a/app/test-pmd/macswap_common.h
+++ b/app/test-pmd/macswap_common.h
@@ -44,4 +44,3 @@ mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags)
 }
 
 #endif /* _BPF_CMD_H_ */
-
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..79f4f9a7c
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _L2FWD_SSE_H_
+#define _L2FWD_SSE_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	uint64_t ol_flags;
+	int i;
+	__m128i addr;
+	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+					5, 4, 3, 2,
+					1, 0, 11, 10,
+					9, 8, 7, 6);
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+	vlan_qinq_set(pkts, nb, ol_flags,
+			txp->tx_vlan_id, txp->tx_vlan_id_outer);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		addr = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr = _mm_shuffle_epi8(addr, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+		mbuf_field_set(mb, ol_flags);
+	}
+}
+
+#endif /* _BPF_CMD_H_ */
+
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v2 3/3] app/testpmd: further improve MAC swap performance for x86
  2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-12-11  5:55   ` Qi Zhang
  2 siblings, 0 replies; 30+ messages in thread
From: Qi Zhang @ 2018-12-11  5:55 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Swap the MAC addresses of four packets in the same loop iteration to
squeeze out more CPU cycles.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap_sse.h | 62 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index 79f4f9a7c..df2875ace 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -11,11 +11,12 @@ static inline void
 do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		struct rte_port *txp)
 {
-	struct ether_hdr *eth_hdr;
-	struct rte_mbuf *mb;
+	struct ether_hdr *eth_hdr[4];
+	struct rte_mbuf *mb[4];
 	uint64_t ol_flags;
 	int i;
-	__m128i addr;
+	int r;
+	__m128i addr0, addr1, addr2, addr3;
 	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
 					5, 4, 3, 2,
 					1, 0, 11, 10,
@@ -25,19 +26,56 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 	vlan_qinq_set(pkts, nb, ol_flags,
 			txp->tx_vlan_id, txp->tx_vlan_id_outer);
 
-	for (i = 0; i < nb; i++) {
-		if (likely(i < nb - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
-		mb = pkts[i];
+	i = 0;
+	r = nb;
+
+	while (r >= 4) {
+		mb[0] = pkts[i++];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+		mb[1] = pkts[i++];
+		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+		mb[2] = pkts[i++];
+		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+		mb[3] = pkts[i++];
+		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
 
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+		_mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+		mbuf_field_set(mb[0], ol_flags);
+		mbuf_field_set(mb[1], ol_flags);
+		mbuf_field_set(mb[2], ol_flags);
+		mbuf_field_set(mb[3], ol_flags);
+		r -= 4;
+	}
+
+	for ( ; i < nb; i++) {
+		if (i < nb - 1)
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb[0] = pkts[i];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
-		addr = _mm_loadu_si128((__m128i *)eth_hdr);
-		addr = _mm_shuffle_epi8(addr, shfl_msk);
-		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
 
-		mbuf_field_set(mb, ol_flags);
+		mbuf_field_set(mb[0], ol_flags);
 	}
 }
 
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-12-11 15:48     ` Ferruh Yigit
  2018-12-14 11:14     ` Iremonger, Bernard
  1 sibling, 0 replies; 30+ messages in thread
From: Ferruh Yigit @ 2018-12-11 15:48 UTC (permalink / raw)
  To: Qi Zhang, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger

On 12/11/2018 5:55 AM, Qi Zhang wrote:
> Move macswap workload to dedicate function, so we can further enable
> platform specific optimized version.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

<...>

> +}
> +
> +#endif /* _BPF_CMD_H_ */

This comment is left over from copy-paste; there are a few more in the patchset.

<...>

> +static inline uint64_t
> +ol_flags_init(uint64_t tx_offload)
> +{
> +	uint64_t ol_flags = 0;
> +
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
> +			PKT_TX_VLAN : 0;
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
> +			PKT_TX_QINQ : 0;
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
> +			PKT_TX_MACSEC : 0;
> +
> +	return ol_flags;
> +}
> +
> +static inline void
> +vlan_qinq_set(struct rte_mbuf *pkts[], uint16_t nb,
> +		uint64_t ol_flags, uint16_t vlan, uint16_t outer_vlan)
> +{
> +	int i;
> +
> +	if (ol_flags & PKT_TX_VLAN)
> +		for (i = 0; i < nb; i++)
> +			pkts[i]->vlan_tci = vlan;
> +	if (ol_flags & PKT_TX_QINQ)
> +		for (i = 0; i < nb; i++)
> +			pkts[i]->vlan_tci_outer = outer_vlan;
> +}
> +
> +static inline void
> +mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags)
> +{
> +	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> +	mb->ol_flags |= ol_flags;
> +	mb->l2_len = sizeof(struct ether_hdr);
> +	mb->l3_len = sizeof(struct ipv4_hdr);

'vlan_qinq_set()' already goes through all packets in the burst, so why not merge
'vlan_qinq_set' and 'mbuf_field_set' as before and only add the 'ol_flags' checks?

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [EXT] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-12-13 19:50     ` Jerin Jacob Kollanukkaran
  2018-12-14 11:21     ` [dpdk-dev] " Iremonger, Bernard
  1 sibling, 0 replies; 30+ messages in thread
From: Jerin Jacob Kollanukkaran @ 2018-12-13 19:50 UTC (permalink / raw)
  To: qi.z.zhang, keith.wiles, ferruh.yigit, bruce.richardson,
	konstantin.ananyev
  Cc: bernard.iremonger, dev, wenzhuo.lu

On Tue, 2018-12-11 at 13:55 +0800, Qi Zhang wrote:
> 
> The patch optimizes the mac swap operation by taking advantage
> of SSE instructions, it only impacts x86 platform.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
>  app/test-pmd/macswap.c        |  4 ++++
>  app/test-pmd/macswap_common.h |  1 -
>  app/test-pmd/macswap_sse.h    | 45
> +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 49 insertions(+), 1 deletion(-)
>  create mode 100644 app/test-pmd/macswap_sse.h
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> index 849194fe2..cbb41b728 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -66,7 +66,11 @@
>  #include <rte_flow.h>
> 
>  #include "testpmd.h"
> +#ifdef RTE_ARCH_X86
> +#include "macswap_sse.h"
> +#else
>  #include "macswap.h"
> +#endif
> 
>  /*
>   * MAC swap forwarding mode: Swap the source and the destination
> Ethernet
> diff --git a/app/test-pmd/macswap_common.h b/app/test-
> pmd/macswap_common.h
> index ab0a5b5ef..8c5518596 100644
> --- a/app/test-pmd/macswap_common.h
> +++ b/app/test-pmd/macswap_common.h
> @@ -44,4 +44,3 @@ mbuf_field_set(struct rte_mbuf *mb, uint64_t
> ol_flags)
>  }
> 
>  #endif /* _BPF_CMD_H_ */
> -
> diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
> new file mode 100644
> index 000000000..79f4f9a7c
> --- /dev/null
> +++ b/app/test-pmd/macswap_sse.h
> @@ -0,0 +1,45 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _L2FWD_SSE_H_
> +#define _L2FWD_SSE_H_

Copy paste error.


> +
> +#include "macswap_common.h"
> +
> +static inline void
> +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> +               struct rte_port *txp)
> +{
> +       struct ether_hdr *eth_hdr;
> +       struct rte_mbuf *mb;
> +       uint64_t ol_flags;
> +       int i;
> +       __m128i addr;
> +       __m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +                                       5, 4, 3, 2,
> +                                       1, 0, 11, 10,
> +                                       9, 8, 7, 6);
> +
> +       ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
> +       vlan_qinq_set(pkts, nb, ol_flags,
> +                       txp->tx_vlan_id, txp->tx_vlan_id_outer);
> +
> +       for (i = 0; i < nb; i++) {
> +               if (likely(i < nb - 1))
> +                       rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1],
> void *));
> +               mb = pkts[i];
> +
> +               eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> +
> +               /* Swap dest and src mac addresses. */
> +               addr = _mm_loadu_si128((__m128i *)eth_hdr);
> +               addr = _mm_shuffle_epi8(addr, shfl_msk);
> +               _mm_storeu_si128((__m128i *)eth_hdr, addr);
> +
> +               mbuf_field_set(mb, ol_flags);
> +       }
> +}
> +
> +#endif /* _BPF_CMD_H_ */

Copy paste error.

> +
> --
> 2.13.6
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
  2018-12-11 15:48     ` Ferruh Yigit
@ 2018-12-14 11:14     ` Iremonger, Bernard
  2018-12-14 11:21       ` Zhang, Qi Z
  1 sibling, 1 reply; 30+ messages in thread
From: Iremonger, Bernard @ 2018-12-14 11:14 UTC (permalink / raw)
  To: Zhang, Qi Z, Yigit, Ferruh, Richardson, Bruce, Wiles, Keith,
	Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo

Hi Qi,

> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Tuesday, December 11, 2018 5:55 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v2 1/3] app/testpmd: code refactory for macswap
> 
> Move macswap workload to dedicate function, so we can further enable
> platform specific optimized version.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
>  app/test-pmd/macswap.c        | 32 ++---------------------------
>  app/test-pmd/macswap.h        | 40
> ++++++++++++++++++++++++++++++++++++
>  app/test-pmd/macswap_common.h | 47
> +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 89 insertions(+), 30 deletions(-)  create mode 100644
> app/test-pmd/macswap.h  create mode 100644 app/test-
> pmd/macswap_common.h
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> a8384d5b8..849194fe2 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -66,6 +66,7 @@
>  #include <rte_flow.h>
> 
>  #include "testpmd.h"
> +#include "macswap.h"
> 
>  /*
>   * MAC swap forwarding mode: Swap the source and the destination
> Ethernet @@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> {
>  	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
>  	struct rte_port  *txp;
> -	struct rte_mbuf  *mb;
> -	struct ether_hdr *eth_hdr;
> -	struct ether_addr addr;
>  	uint16_t nb_rx;
>  	uint16_t nb_tx;
> -	uint16_t i;
>  	uint32_t retry;
> -	uint64_t ol_flags = 0;
> -	uint64_t tx_offloads;
>  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>  	uint64_t start_tsc;
>  	uint64_t end_tsc;
> @@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> #endif
>  	fs->rx_packets += nb_rx;
>  	txp = &ports[fs->tx_port];
> -	tx_offloads = txp->dev_conf.txmode.offloads;
> -	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
> -		ol_flags = PKT_TX_VLAN_PKT;
> -	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
> -		ol_flags |= PKT_TX_QINQ_PKT;
> -	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
> -		ol_flags |= PKT_TX_MACSEC;
> -	for (i = 0; i < nb_rx; i++) {
> -		if (likely(i < nb_rx - 1))
> -			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
> -						       void *));
> -		mb = pkts_burst[i];
> -		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> 
> -		/* Swap dest and src mac addresses. */
> -		ether_addr_copy(&eth_hdr->d_addr, &addr);
> -		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> -		ether_addr_copy(&addr, &eth_hdr->s_addr);
> +	do_macswap(pkts_burst, nb_rx, txp);
> 
> -		mb->ol_flags &= IND_ATTACHED_MBUF |
> EXT_ATTACHED_MBUF;
> -		mb->ol_flags |= ol_flags;
> -		mb->l2_len = sizeof(struct ether_hdr);
> -		mb->l3_len = sizeof(struct ipv4_hdr);
> -		mb->vlan_tci = txp->tx_vlan_id;
> -		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
> -	}
>  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> nb_rx);
>  	/*
>  	 * Retry if necessary
> diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h new file
> mode 100644 index 000000000..14e665bd2
> --- /dev/null
> +++ b/app/test-pmd/macswap.h
> @@ -0,0 +1,40 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _MACSWAP_H_
> +#define _MACSWAP_H_
> +
> +#include "macswap_common.h"
> +
> +static inline void
> +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> +		struct rte_port *txp)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct rte_mbuf *mb;
> +	struct ether_addr addr;
> +	uint64_t ol_flags;
> +	int i;
> +
> +	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
> +	vlan_qinq_set(pkts, nb, ol_flags,
> +			txp->tx_vlan_id, txp->tx_vlan_id_outer);
> +
> +	for (i = 0; i < nb; i++) {
> +		if (likely(i < nb - 1))
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void
> *));
> +		mb = pkts[i];
> +
> +		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> +
> +		/* Swap dest and src mac addresses. */
> +		ether_addr_copy(&eth_hdr->d_addr, &addr);
> +		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> +		ether_addr_copy(&addr, &eth_hdr->s_addr);
> +
> +		mbuf_field_set(mb, ol_flags);
> +	}
> +}
> +
> +#endif /* _BPF_CMD_H_ */

Previous line should be _MACSWAP_H_

> diff --git a/app/test-pmd/macswap_common.h b/app/test-
> pmd/macswap_common.h new file mode 100644 index
> 000000000..ab0a5b5ef
> --- /dev/null
> +++ b/app/test-pmd/macswap_common.h
> @@ -0,0 +1,47 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _MACSWAP_COMMON_H_
> +#define _MACSWAP_COMMON_H_
> +
> +static inline uint64_t
> +ol_flags_init(uint64_t tx_offload)
> +{
> +	uint64_t ol_flags = 0;
> +
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
> +			PKT_TX_VLAN : 0;
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
> +			PKT_TX_QINQ : 0;
> +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
> +			PKT_TX_MACSEC : 0;
> +
> +	return ol_flags;
> +}
> +
> +static inline void
> +vlan_qinq_set(struct rte_mbuf *pkts[], uint16_t nb,
> +		uint64_t ol_flags, uint16_t vlan, uint16_t outer_vlan) {
> +	int i;
> +
> +	if (ol_flags & PKT_TX_VLAN)
> +		for (i = 0; i < nb; i++)
> +			pkts[i]->vlan_tci = vlan;
> +	if (ol_flags & PKT_TX_QINQ)
> +		for (i = 0; i < nb; i++)
> +			pkts[i]->vlan_tci_outer = outer_vlan; }
> +
> +static inline void
> +mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags) {
> +	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> +	mb->ol_flags |= ol_flags;
> +	mb->l2_len = sizeof(struct ether_hdr);
> +	mb->l3_len = sizeof(struct ipv4_hdr);
> +}
> +
> +#endif /* _BPF_CMD_H_ */

Previous line should be _MACSWAP_COMMON_H_

> +
> --
> 2.13.6

When applying the patch, a whitespace error is reported:

Applying: app/testpmd: code refactory for macswap
.git/rebase-apply/patch:169: new blank line at EOF.

Regards,

Bernard

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap
  2018-12-14 11:14     ` Iremonger, Bernard
@ 2018-12-14 11:21       ` Zhang, Qi Z
  0 siblings, 0 replies; 30+ messages in thread
From: Zhang, Qi Z @ 2018-12-14 11:21 UTC (permalink / raw)
  To: Iremonger, Bernard, Yigit, Ferruh, Richardson, Bruce, Wiles,
	Keith, Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo



> -----Original Message-----
> From: Iremonger, Bernard
> Sent: Friday, December 14, 2018 7:15 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> <keith.wiles@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: RE: [PATCH v2 1/3] app/testpmd: code refactory for macswap
> 
> Hi Qi,
> 
> > -----Original Message-----
> > From: Zhang, Qi Z
> > Sent: Tuesday, December 11, 2018 5:55 AM
> > To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> > Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>
> > Subject: [PATCH v2 1/3] app/testpmd: code refactory for macswap
> >
> > Move macswap workload to dedicate function, so we can further enable
> > platform specific optimized version.
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > ---
> >  app/test-pmd/macswap.c        | 32 ++---------------------------
> >  app/test-pmd/macswap.h        | 40
> > ++++++++++++++++++++++++++++++++++++
> >  app/test-pmd/macswap_common.h | 47
> > +++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 89 insertions(+), 30 deletions(-)  create mode
> > 100644 app/test-pmd/macswap.h  create mode 100644 app/test-
> > pmd/macswap_common.h
> >
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > a8384d5b8..849194fe2 100644
> > --- a/app/test-pmd/macswap.c
> > +++ b/app/test-pmd/macswap.c
> > @@ -66,6 +66,7 @@
> >  #include <rte_flow.h>
> >
> >  #include "testpmd.h"
> > +#include "macswap.h"
> >
> >  /*
> >   * MAC swap forwarding mode: Swap the source and the destination
> > Ethernet @@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > {
> >  	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
> >  	struct rte_port  *txp;
> > -	struct rte_mbuf  *mb;
> > -	struct ether_hdr *eth_hdr;
> > -	struct ether_addr addr;
> >  	uint16_t nb_rx;
> >  	uint16_t nb_tx;
> > -	uint16_t i;
> >  	uint32_t retry;
> > -	uint64_t ol_flags = 0;
> > -	uint64_t tx_offloads;
> >  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  	uint64_t start_tsc;
> >  	uint64_t end_tsc;
> > @@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs) #endif
> >  	fs->rx_packets += nb_rx;
> >  	txp = &ports[fs->tx_port];
> > -	tx_offloads = txp->dev_conf.txmode.offloads;
> > -	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
> > -		ol_flags = PKT_TX_VLAN_PKT;
> > -	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
> > -		ol_flags |= PKT_TX_QINQ_PKT;
> > -	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
> > -		ol_flags |= PKT_TX_MACSEC;
> > -	for (i = 0; i < nb_rx; i++) {
> > -		if (likely(i < nb_rx - 1))
> > -			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
> > -						       void *));
> > -		mb = pkts_burst[i];
> > -		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> >
> > -		/* Swap dest and src mac addresses. */
> > -		ether_addr_copy(&eth_hdr->d_addr, &addr);
> > -		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> > -		ether_addr_copy(&addr, &eth_hdr->s_addr);
> > +	do_macswap(pkts_burst, nb_rx, txp);
> >
> > -		mb->ol_flags &= IND_ATTACHED_MBUF |
> > EXT_ATTACHED_MBUF;
> > -		mb->ol_flags |= ol_flags;
> > -		mb->l2_len = sizeof(struct ether_hdr);
> > -		mb->l3_len = sizeof(struct ipv4_hdr);
> > -		mb->vlan_tci = txp->tx_vlan_id;
> > -		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
> > -	}
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_rx);
> >  	/*
> >  	 * Retry if necessary
> > diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h new file
> > mode 100644 index 000000000..14e665bd2
> > --- /dev/null
> > +++ b/app/test-pmd/macswap.h
> > @@ -0,0 +1,40 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation  */
> > +
> > +#ifndef _MACSWAP_H_
> > +#define _MACSWAP_H_
> > +
> > +#include "macswap_common.h"
> > +
> > +static inline void
> > +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> > +		struct rte_port *txp)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct rte_mbuf *mb;
> > +	struct ether_addr addr;
> > +	uint64_t ol_flags;
> > +	int i;
> > +
> > +	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
> > +	vlan_qinq_set(pkts, nb, ol_flags,
> > +			txp->tx_vlan_id, txp->tx_vlan_id_outer);
> > +
> > +	for (i = 0; i < nb; i++) {
> > +		if (likely(i < nb - 1))
> > +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void
> > *));
> > +		mb = pkts[i];
> > +
> > +		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> > +
> > +		/* Swap dest and src mac addresses. */
> > +		ether_addr_copy(&eth_hdr->d_addr, &addr);
> > +		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> > +		ether_addr_copy(&addr, &eth_hdr->s_addr);
> > +
> > +		mbuf_field_set(mb, ol_flags);
> > +	}
> > +}
> > +
> > +#endif /* _BPF_CMD_H_ */
> 
> Previous line should be _MACSWAP_H_

Ah, I only fixed the header and forgot the tail; thanks for catching this.
> 
> > diff --git a/app/test-pmd/macswap_common.h b/app/test-
> > pmd/macswap_common.h new file mode 100644 index 000000000..ab0a5b5ef
> > --- /dev/null
> > +++ b/app/test-pmd/macswap_common.h
> > @@ -0,0 +1,47 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation  */
> > +
> > +#ifndef _MACSWAP_COMMON_H_
> > +#define _MACSWAP_COMMON_H_
> > +
> > +static inline uint64_t
> > +ol_flags_init(uint64_t tx_offload)
> > +{
> > +	uint64_t ol_flags = 0;
> > +
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
> > +			PKT_TX_VLAN : 0;
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
> > +			PKT_TX_QINQ : 0;
> > +	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
> > +			PKT_TX_MACSEC : 0;
> > +
> > +	return ol_flags;
> > +}
> > +
> > +static inline void
> > +vlan_qinq_set(struct rte_mbuf *pkts[], uint16_t nb,
> > +		uint64_t ol_flags, uint16_t vlan, uint16_t outer_vlan) {
> > +	int i;
> > +
> > +	if (ol_flags & PKT_TX_VLAN)
> > +		for (i = 0; i < nb; i++)
> > +			pkts[i]->vlan_tci = vlan;
> > +	if (ol_flags & PKT_TX_QINQ)
> > +		for (i = 0; i < nb; i++)
> > +			pkts[i]->vlan_tci_outer = outer_vlan; }
> > +
> > +static inline void
> > +mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags) {
> > +	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> > +	mb->ol_flags |= ol_flags;
> > +	mb->l2_len = sizeof(struct ether_hdr);
> > +	mb->l3_len = sizeof(struct ipv4_hdr); }
> > +
> > +#endif /* _BPF_CMD_H_ */
> 
> Previous line should be _MACSWAP_COMMON_H_
> 
> > +
> > --
> > 2.13.6
> 
> When apply the patch a whitespace error is reported
> 
> Applying: app/testpmd: code refactory for macswap
> .git/rebase-apply/patch:169: new blank line at EOF.

Will fix this.

Thanks
Qi
> 
> Regards,
> 
> Bernard

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
  2018-12-13 19:50     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
@ 2018-12-14 11:21     ` Iremonger, Bernard
  2018-12-14 11:36       ` Zhang, Qi Z
  1 sibling, 1 reply; 30+ messages in thread
From: Iremonger, Bernard @ 2018-12-14 11:21 UTC (permalink / raw)
  To: Zhang, Qi Z, Yigit, Ferruh, Richardson, Bruce, Wiles, Keith,
	Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo

Hi Qi,

> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Tuesday, December 11, 2018 5:55 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v2 2/3] app/testpmd: improve MAC swap performance for
> x86
> 
> The patch optimizes the mac swap operation by taking advantage of SSE
> instructions, it only impacts x86 platform.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
>  app/test-pmd/macswap.c        |  4 ++++
>  app/test-pmd/macswap_common.h |  1 -
>  app/test-pmd/macswap_sse.h    | 45
> +++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 49 insertions(+), 1 deletion(-)  create mode 100644 app/test-
> pmd/macswap_sse.h
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> 849194fe2..cbb41b728 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -66,7 +66,11 @@
>  #include <rte_flow.h>
> 
>  #include "testpmd.h"
> +#ifdef RTE_ARCH_X86
> +#include "macswap_sse.h"
> +#else
>  #include "macswap.h"
> +#endif
> 
>  /*
>   * MAC swap forwarding mode: Swap the source and the destination
> Ethernet diff --git a/app/test-pmd/macswap_common.h b/app/test-
> pmd/macswap_common.h index ab0a5b5ef..8c5518596 100644
> --- a/app/test-pmd/macswap_common.h
> +++ b/app/test-pmd/macswap_common.h
> @@ -44,4 +44,3 @@ mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags)
> }
> 
>  #endif /* _BPF_CMD_H_ */
> -
> diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
> new file mode 100644 index 000000000..79f4f9a7c
> --- /dev/null
> +++ b/app/test-pmd/macswap_sse.h
> @@ -0,0 +1,45 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _L2FWD_SSE_H_
> +#define _L2FWD_SSE_H_

_L2FWD_SSE_H_  should be replaced by _MACSWAP_SSE_H_  in the lines above.

> +
> +#include "macswap_common.h"
> +
> +static inline void
> +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> +		struct rte_port *txp)
> +{
> +	struct ether_hdr *eth_hdr;
> +	struct rte_mbuf *mb;
> +	uint64_t ol_flags;
> +	int i;
> +	__m128i addr;

Some comments explaining the parameters to _mm_set_epi8() would be useful.

> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +					5, 4, 3, 2,
> +					1, 0, 11, 10,
> +					9, 8, 7, 6);
> +
> +	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
> +	vlan_qinq_set(pkts, nb, ol_flags,
> +			txp->tx_vlan_id, txp->tx_vlan_id_outer);
> +
> +	for (i = 0; i < nb; i++) {
> +		if (likely(i < nb - 1))
> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void
> *));
> +		mb = pkts[i];
> +
> +		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> +
> +		/* Swap dest and src mac addresses. */
> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
> +
> +		mbuf_field_set(mb, ol_flags);
> +	}
> +}
> +
> +#endif /* _BPF_CMD_H_ */

_BPF_CMD_H_ should be replaced by _MACSWAP_SSE_H_ in the line above.

> +
> --
> 2.13.6

A white space error is reported when applying this patch

Applying: app/testpmd: improve MAC swap performance for x86
.git/rebase-apply/patch:83: new blank line at EOF.

Regards,

Bernard.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-14 11:21     ` [dpdk-dev] " Iremonger, Bernard
@ 2018-12-14 11:36       ` Zhang, Qi Z
  0 siblings, 0 replies; 30+ messages in thread
From: Zhang, Qi Z @ 2018-12-14 11:36 UTC (permalink / raw)
  To: Iremonger, Bernard, Yigit, Ferruh, Richardson, Bruce, Wiles,
	Keith, Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo



> -----Original Message-----
> From: Iremonger, Bernard
> Sent: Friday, December 14, 2018 7:22 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yigit, Ferruh <ferruh.yigit@intel.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> <keith.wiles@intel.com>; Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>
> Subject: RE: [PATCH v2 2/3] app/testpmd: improve MAC swap performance for
> x86
> 
> Hi Qi,
> 
> > -----Original Message-----
> > From: Zhang, Qi Z
> > Sent: Tuesday, December 11, 2018 5:55 AM
> > To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> > Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>
> > Subject: [PATCH v2 2/3] app/testpmd: improve MAC swap performance for
> > x86
> >
> > The patch optimizes the mac swap operation by taking advantage of SSE
> > instructions, it only impacts x86 platform.
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > ---
> >  app/test-pmd/macswap.c        |  4 ++++
> >  app/test-pmd/macswap_common.h |  1 -
> >  app/test-pmd/macswap_sse.h    | 45
> > +++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 49 insertions(+), 1 deletion(-)  create mode 100644
> > app/test- pmd/macswap_sse.h
> >
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > 849194fe2..cbb41b728 100644
> > --- a/app/test-pmd/macswap.c
> > +++ b/app/test-pmd/macswap.c
> > @@ -66,7 +66,11 @@
> >  #include <rte_flow.h>
> >
> >  #include "testpmd.h"
> > +#ifdef RTE_ARCH_X86
> > +#include "macswap_sse.h"
> > +#else
> >  #include "macswap.h"
> > +#endif
> >
> >  /*
> >   * MAC swap forwarding mode: Swap the source and the destination
> > Ethernet diff --git a/app/test-pmd/macswap_common.h b/app/test-
> > pmd/macswap_common.h index ab0a5b5ef..8c5518596 100644
> > --- a/app/test-pmd/macswap_common.h
> > +++ b/app/test-pmd/macswap_common.h
> > @@ -44,4 +44,3 @@ mbuf_field_set(struct rte_mbuf *mb, uint64_t
> > ol_flags) }
> >
> >  #endif /* _BPF_CMD_H_ */
> > -
> > diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
> > new file mode 100644 index 000000000..79f4f9a7c
> > --- /dev/null
> > +++ b/app/test-pmd/macswap_sse.h
> > @@ -0,0 +1,45 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation  */
> > +
> > +#ifndef _L2FWD_SSE_H_
> > +#define _L2FWD_SSE_H_
> 
> _L2FWD_SSE_H_  should be replaced by _MACSWAP_SSE_H_  in the lines
> above.

Yes, during rework I also noticed this and the EOF issue; will fix.

> 
> > +
> > +#include "macswap_common.h"
> > +
> > +static inline void
> > +do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
> > +		struct rte_port *txp)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct rte_mbuf *mb;
> > +	uint64_t ol_flags;
> > +	int i;
> > +	__m128i addr;
> 
> Some comments explaining the parameters to _mm_set_epi8() would be useful.

OK, will add.

Thanks
Qi

> 
> > +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> > +					5, 4, 3, 2,
> > +					1, 0, 11, 10,
> > +					9, 8, 7, 6);
> > +
> > +	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
> > +	vlan_qinq_set(pkts, nb, ol_flags,
> > +			txp->tx_vlan_id, txp->tx_vlan_id_outer);
> > +
> > +	for (i = 0; i < nb; i++) {
> > +		if (likely(i < nb - 1))
> > +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void
> > *));
> > +		mb = pkts[i];
> > +
> > +		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> > +
> > +		/* Swap dest and src mac addresses. */
> > +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> > +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> > +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
> > +
> > +		mbuf_field_set(mb, ol_flags);
> > +	}
> > +}
> > +
> > +#endif /* _BPF_CMD_H_ */
> 
> _BPF_CMD_H should be replaced by _MACSWAP_SSE_H_ in the line above.
> 
> > +
> > --
> > 2.13.6
> 
> A white space error is reported when applying this patch
> 
> Applying: app/testpmd: improve MAC swap performance for x86
> .git/rebase-apply/patch:83: new blank line at EOF.
> 
> Regards,
> 
> Bernard.
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance
  2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
                   ` (4 preceding siblings ...)
  2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
@ 2018-12-16  0:58 ` Qi Zhang
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap Qi Zhang
                     ` (3 more replies)
  5 siblings, 4 replies; 30+ messages in thread
From: Qi Zhang @ 2018-12-16  0:58 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Improve testpmd macswap performance on x86 by taking advantage of SSE
instructions.
On a Broadwell 1.6GHz server with an i40e 25G NIC, we observe a 17.7%
performance improvement in testpmd's macswap test.

v3:
- fix wrong comment after #endif.
- fix EOF space line.
- add comment to explain shuffle mask.

v2:
- replace PKT_TX_VLAN_PKT/PKT_TX_QINQ_PKT with PKT_TX_VLAN/PKT_TX_QINQ
- only set vlan / outer_vlan when related ol_flags is set.
- fix coding style

*** BLURB HERE ***

Qi Zhang (3):
  app/testpmd: code refactory for macswap
  app/testpmd: improve MAC swap performance for x86
  app/testpmd: further improve MAC swap performance for x86

 app/test-pmd/macswap.c        | 36 +++---------------
 app/test-pmd/macswap.h        | 40 ++++++++++++++++++++
 app/test-pmd/macswap_common.h | 46 +++++++++++++++++++++++
 app/test-pmd/macswap_sse.h    | 87 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h
 create mode 100644 app/test-pmd/macswap_sse.h

-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
@ 2018-12-16  0:58   ` Qi Zhang
  2018-12-18 11:06     ` Iremonger, Bernard
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 30+ messages in thread
From: Qi Zhang @ 2018-12-16  0:58 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Move the macswap workload to a dedicated function, so we can later enable
platform-specific optimized versions.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c        | 32 ++----------------------------
 app/test-pmd/macswap.h        | 40 +++++++++++++++++++++++++++++++++++++
 app/test-pmd/macswap_common.h | 46 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+), 30 deletions(-)
 create mode 100644 app/test-pmd/macswap.h
 create mode 100644 app/test-pmd/macswap_common.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..849194fe2 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,6 +66,7 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#include "macswap.h"
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
@@ -76,15 +77,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 {
 	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
 	struct rte_port  *txp;
-	struct rte_mbuf  *mb;
-	struct ether_hdr *eth_hdr;
-	struct ether_addr addr;
 	uint16_t nb_rx;
 	uint16_t nb_tx;
-	uint16_t i;
 	uint32_t retry;
-	uint64_t ol_flags = 0;
-	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
@@ -108,32 +103,9 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
 #endif
 	fs->rx_packets += nb_rx;
 	txp = &ports[fs->tx_port];
-	tx_offloads = txp->dev_conf.txmode.offloads;
-	if (tx_offloads	& DEV_TX_OFFLOAD_VLAN_INSERT)
-		ol_flags = PKT_TX_VLAN_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_QINQ_INSERT)
-		ol_flags |= PKT_TX_QINQ_PKT;
-	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
-		ol_flags |= PKT_TX_MACSEC;
-	for (i = 0; i < nb_rx; i++) {
-		if (likely(i < nb_rx - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i + 1],
-						       void *));
-		mb = pkts_burst[i];
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
 
-		/* Swap dest and src mac addresses. */
-		ether_addr_copy(&eth_hdr->d_addr, &addr);
-		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
-		ether_addr_copy(&addr, &eth_hdr->s_addr);
+	do_macswap(pkts_burst, nb_rx, txp);
 
-		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
-		mb->ol_flags |= ol_flags;
-		mb->l2_len = sizeof(struct ether_hdr);
-		mb->l3_len = sizeof(struct ipv4_hdr);
-		mb->vlan_tci = txp->tx_vlan_id;
-		mb->vlan_tci_outer = txp->tx_vlan_id_outer;
-	}
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
 	/*
 	 * Retry if necessary
diff --git a/app/test-pmd/macswap.h b/app/test-pmd/macswap.h
new file mode 100644
index 000000000..bfa9b0eda
--- /dev/null
+++ b/app/test-pmd/macswap.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _MACSWAP_H_
+#define _MACSWAP_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	struct ether_addr addr;
+	uint64_t ol_flags;
+	int i;
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+	vlan_qinq_set(pkts, nb, ol_flags,
+			txp->tx_vlan_id, txp->tx_vlan_id_outer);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		ether_addr_copy(&eth_hdr->d_addr, &addr);
+		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
+		ether_addr_copy(&addr, &eth_hdr->s_addr);
+
+		mbuf_field_set(mb, ol_flags);
+	}
+}
+
+#endif /* _MACSWAP_H_ */
diff --git a/app/test-pmd/macswap_common.h b/app/test-pmd/macswap_common.h
new file mode 100644
index 000000000..19754cdd1
--- /dev/null
+++ b/app/test-pmd/macswap_common.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _MACSWAP_COMMON_H_
+#define _MACSWAP_COMMON_H_
+
+static inline uint64_t
+ol_flags_init(uint64_t tx_offload)
+{
+	uint64_t ol_flags = 0;
+
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_VLAN_INSERT) ?
+			PKT_TX_VLAN : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_QINQ_INSERT) ?
+			PKT_TX_QINQ : 0;
+	ol_flags |= (tx_offload & DEV_TX_OFFLOAD_MACSEC_INSERT) ?
+			PKT_TX_MACSEC : 0;
+
+	return ol_flags;
+}
+
+static inline void
+vlan_qinq_set(struct rte_mbuf *pkts[], uint16_t nb,
+		uint64_t ol_flags, uint16_t vlan, uint16_t outer_vlan)
+{
+	int i;
+
+	if (ol_flags & PKT_TX_VLAN)
+		for (i = 0; i < nb; i++)
+			pkts[i]->vlan_tci = vlan;
+	if (ol_flags & PKT_TX_QINQ)
+		for (i = 0; i < nb; i++)
+			pkts[i]->vlan_tci_outer = outer_vlan;
+}
+
+static inline void
+mbuf_field_set(struct rte_mbuf *mb, uint64_t ol_flags)
+{
+	mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
+	mb->ol_flags |= ol_flags;
+	mb->l2_len = sizeof(struct ether_hdr);
+	mb->l3_len = sizeof(struct ipv4_hdr);
+}
+
+#endif /* _MACSWAP_COMMON_H_ */
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-12-16  0:58   ` Qi Zhang
  2018-12-18 11:07     ` Iremonger, Bernard
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 3/3] app/testpmd: further " Qi Zhang
  2018-12-18  0:15   ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Ferruh Yigit
  3 siblings, 1 reply; 30+ messages in thread
From: Qi Zhang @ 2018-12-16  0:58 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

The patch optimizes the MAC swap operation by taking advantage
of SSE instructions; it only affects the x86 platform.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c     |  4 ++++
 app/test-pmd/macswap_sse.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 app/test-pmd/macswap_sse.h

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 849194fe2..cbb41b728 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -66,7 +66,11 @@
 #include <rte_flow.h>
 
 #include "testpmd.h"
+#ifdef RTE_ARCH_X86
+#include "macswap_sse.h"
+#else
 #include "macswap.h"
+#endif
 
 /*
  * MAC swap forwarding mode: Swap the source and the destination Ethernet
diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
new file mode 100644
index 000000000..79c7e9883
--- /dev/null
+++ b/app/test-pmd/macswap_sse.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _MACSWAP_SSE_H_
+#define _MACSWAP_SSE_H_
+
+#include "macswap_common.h"
+
+static inline void
+do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
+		struct rte_port *txp)
+{
+	struct ether_hdr *eth_hdr;
+	struct rte_mbuf *mb;
+	uint64_t ol_flags;
+	int i;
+	__m128i addr;
+	/**
+	 * shuffle mask be used to shuffle the 16 bytes.
+	 * byte 0-5 wills be swapped with byte 6-11.
+	 * byte 12-15 will keep unchanged.
+	 */
+	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+					5, 4, 3, 2,
+					1, 0, 11, 10,
+					9, 8, 7, 6);
+
+	ol_flags = ol_flags_init(txp->dev_conf.txmode.offloads);
+	vlan_qinq_set(pkts, nb, ol_flags,
+			txp->tx_vlan_id, txp->tx_vlan_id_outer);
+
+	for (i = 0; i < nb; i++) {
+		if (likely(i < nb - 1))
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb = pkts[i];
+
+		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+
+		/* Swap dest and src mac addresses. */
+		addr = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr = _mm_shuffle_epi8(addr, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+
+		mbuf_field_set(mb, ol_flags);
+	}
+}
+
+#endif /* _MACSWAP_SSE_H_ */
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [dpdk-dev] [PATCH v3 3/3] app/testpmd: further improve MAC swap performance for x86
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap Qi Zhang
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-12-16  0:58   ` Qi Zhang
  2018-12-18 11:09     ` Iremonger, Bernard
  2018-12-18  0:15   ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Ferruh Yigit
  3 siblings, 1 reply; 30+ messages in thread
From: Qi Zhang @ 2018-12-16  0:58 UTC (permalink / raw)
  To: ferruh.yigit, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger, Qi Zhang

Swap the MAC addresses of four packets in each loop iteration to save
more CPU cycles.

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap_sse.h | 62 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/app/test-pmd/macswap_sse.h b/app/test-pmd/macswap_sse.h
index 79c7e9883..7d268bfbb 100644
--- a/app/test-pmd/macswap_sse.h
+++ b/app/test-pmd/macswap_sse.h
@@ -11,11 +11,12 @@ static inline void
 do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 		struct rte_port *txp)
 {
-	struct ether_hdr *eth_hdr;
-	struct rte_mbuf *mb;
+	struct ether_hdr *eth_hdr[4];
+	struct rte_mbuf *mb[4];
 	uint64_t ol_flags;
 	int i;
-	__m128i addr;
+	int r;
+	__m128i addr0, addr1, addr2, addr3;
 	/**
 	 * shuffle mask be used to shuffle the 16 bytes.
 	 * byte 0-5 wills be swapped with byte 6-11.
@@ -30,19 +31,56 @@ do_macswap(struct rte_mbuf *pkts[], uint16_t nb,
 	vlan_qinq_set(pkts, nb, ol_flags,
 			txp->tx_vlan_id, txp->tx_vlan_id_outer);
 
-	for (i = 0; i < nb; i++) {
-		if (likely(i < nb - 1))
-			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
-		mb = pkts[i];
+	i = 0;
+	r = nb;
+
+	while (r >= 4) {
+		mb[0] = pkts[i++];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+
+		mb[1] = pkts[i++];
+		eth_hdr[1] = rte_pktmbuf_mtod(mb[1], struct ether_hdr *);
+		addr1 = _mm_loadu_si128((__m128i *)eth_hdr[1]);
+
+
+		mb[2] = pkts[i++];
+		eth_hdr[2] = rte_pktmbuf_mtod(mb[2], struct ether_hdr *);
+		addr2 = _mm_loadu_si128((__m128i *)eth_hdr[2]);
+
+		mb[3] = pkts[i++];
+		eth_hdr[3] = rte_pktmbuf_mtod(mb[3], struct ether_hdr *);
+		addr3 = _mm_loadu_si128((__m128i *)eth_hdr[3]);
 
-		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		addr1 = _mm_shuffle_epi8(addr1, shfl_msk);
+		addr2 = _mm_shuffle_epi8(addr2, shfl_msk);
+		addr3 = _mm_shuffle_epi8(addr3, shfl_msk);
+
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
+		_mm_storeu_si128((__m128i *)eth_hdr[1], addr1);
+		_mm_storeu_si128((__m128i *)eth_hdr[2], addr2);
+		_mm_storeu_si128((__m128i *)eth_hdr[3], addr3);
+
+		mbuf_field_set(mb[0], ol_flags);
+		mbuf_field_set(mb[1], ol_flags);
+		mbuf_field_set(mb[2], ol_flags);
+		mbuf_field_set(mb[3], ol_flags);
+		r -= 4;
+	}
+
+	for ( ; i < nb; i++) {
+		if (i < nb - 1)
+			rte_prefetch0(rte_pktmbuf_mtod(pkts[i+1], void *));
+		mb[0] = pkts[i];
+		eth_hdr[0] = rte_pktmbuf_mtod(mb[0], struct ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
-		addr = _mm_loadu_si128((__m128i *)eth_hdr);
-		addr = _mm_shuffle_epi8(addr, shfl_msk);
-		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+		addr0 = _mm_loadu_si128((__m128i *)eth_hdr[0]);
+		addr0 = _mm_shuffle_epi8(addr0, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr[0], addr0);
 
-		mbuf_field_set(mb, ol_flags);
+		mbuf_field_set(mb[0], ol_flags);
 	}
 }
 
-- 
2.13.6

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance
  2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
                     ` (2 preceding siblings ...)
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 3/3] app/testpmd: further " Qi Zhang
@ 2018-12-18  0:15   ` Ferruh Yigit
  2018-12-18 16:29     ` Ferruh Yigit
  3 siblings, 1 reply; 30+ messages in thread
From: Ferruh Yigit @ 2018-12-18  0:15 UTC (permalink / raw)
  To: Qi Zhang, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger

On 12/16/2018 12:58 AM, Qi Zhang wrote:
> Improved testpmd macswap performance for x86 by take advantage of SSE
> instructions.
> On a broadwell 1.6GHz sever with a i40e 25G NIC.
> We abserve 17.7% performance improvement for testpmd's macswap test.
> 
> v3:
> - fix wrong comment after #endif.
> - fix EOF space line.
> - add comment to explain shuffle mask.
> 
> v2:
> - replace PKT_TX_VLAN_PKT/PKT_TX_QINQ_PKT with PKT_TX_VLAN/PKT_TX_QINQ
> - only set vlan / outer_vlan when related ol_flags is set.
> - fix coding style
> 
> *** BLURB HERE ***
> 
> Qi Zhang (3):
>   app/testpmd: code refactory for macswap
>   app/testpmd: improve MAC swap performance for x86
>   app/testpmd: further improve MAC swap performance for x86

For series,
Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap Qi Zhang
@ 2018-12-18 11:06     ` Iremonger, Bernard
  0 siblings, 0 replies; 30+ messages in thread
From: Iremonger, Bernard @ 2018-12-18 11:06 UTC (permalink / raw)
  To: Zhang, Qi Z, Yigit, Ferruh, Richardson, Bruce, Wiles, Keith,
	Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo

> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Sunday, December 16, 2018 12:59 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v3 1/3] app/testpmd: code refactory for macswap
> 
> Move the macswap workload to a dedicated function, so we can further enable
> platform-specific optimized versions.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

Acked-by: Bernard Iremonger <bernard.iremonger@intel.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
@ 2018-12-18 11:07     ` Iremonger, Bernard
  0 siblings, 0 replies; 30+ messages in thread
From: Iremonger, Bernard @ 2018-12-18 11:07 UTC (permalink / raw)
  To: Zhang, Qi Z, Yigit, Ferruh, Richardson, Bruce, Wiles, Keith,
	Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo

> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Sunday, December 16, 2018 12:59 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v3 2/3] app/testpmd: improve MAC swap performance for
> x86
> 
> The patch optimizes the MAC swap operation by taking advantage of SSE
> instructions; it only impacts the x86 platform.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

Acked-by: Bernard Iremonger <bernard.iremonger@intel.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 3/3] app/testpmd: further improve MAC swap performance for x86
  2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 3/3] app/testpmd: further " Qi Zhang
@ 2018-12-18 11:09     ` Iremonger, Bernard
  0 siblings, 0 replies; 30+ messages in thread
From: Iremonger, Bernard @ 2018-12-18 11:09 UTC (permalink / raw)
  To: Zhang, Qi Z, Yigit, Ferruh, Richardson, Bruce, Wiles, Keith,
	Ananyev, Konstantin
  Cc: dev, Lu, Wenzhuo

> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Sunday, December 16, 2018 12:59 AM
> To: Yigit, Ferruh <ferruh.yigit@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: [PATCH v3 3/3] app/testpmd: further improve MAC swap
> performance for x86
> 
> Swap the MAC addresses of four packets in the same loop iteration to squeeze out more CPU cycles.
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>

Acked-by: Bernard Iremonger <bernard.iremonger@intel.com>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance
  2018-12-18  0:15   ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Ferruh Yigit
@ 2018-12-18 16:29     ` Ferruh Yigit
  0 siblings, 0 replies; 30+ messages in thread
From: Ferruh Yigit @ 2018-12-18 16:29 UTC (permalink / raw)
  To: Qi Zhang, bruce.richardson, keith.wiles, konstantin.ananyev
  Cc: dev, wenzhuo.lu, bernard.iremonger

On 12/18/2018 12:15 AM, Ferruh Yigit wrote:
> On 12/16/2018 12:58 AM, Qi Zhang wrote:
>> Improved testpmd macswap performance for x86 by taking advantage of SSE
>> instructions.
>> On a Broadwell 1.6GHz server with an i40e 25G NIC,
>> we observe a 17.7% performance improvement for testpmd's macswap test.
>>
>> v3:
>> - fix wrong comment after #endif.
>> - fix EOF space line.
>> - add comment to explain shuffle mask.
>>
>> v2:
>> - replace PKT_TX_VLAN_PKT/PKT_TX_QINQ_PKT with PKT_TX_VLAN/PKT_TX_QINQ
>> - only set vlan / outer_vlan when related ol_flags is set.
>> - fix coding style
>>
>> *** BLURB HERE ***
>>
>> Qi Zhang (3):
>>   app/testpmd: code refactory for macswap
>>   app/testpmd: improve MAC swap performance for x86
>>   app/testpmd: further improve MAC swap performance for x86
> 
> For series,
> Reviewed-by: Ferruh Yigit <ferruh.yigit@intel.com>


Series applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2018-12-18 16:30 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-22 17:26 [dpdk-dev] [PATCH 0/3] improve MAC swap performance Qi Zhang
2018-11-22 17:26 ` [dpdk-dev] [PATCH 1/3] app/testpmd: code refactory for macswap Qi Zhang
2018-11-22 17:26 ` [dpdk-dev] [PATCH 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
2018-11-22 17:26 ` [dpdk-dev] [PATCH 3/3] app/testpmd: further " Qi Zhang
2018-11-22 17:38 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
2018-12-10 17:44     ` Ferruh Yigit
2018-12-11  4:02       ` Zhang, Qi Z
2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
2018-12-10 17:44     ` Ferruh Yigit
2018-11-22 17:38   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
2018-12-11  5:55 ` [dpdk-dev] [PATCH v2 0/3] improve MAC swap performance Qi Zhang
2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 1/3] app/testpmd: code refactory for macswap Qi Zhang
2018-12-11 15:48     ` Ferruh Yigit
2018-12-14 11:14     ` Iremonger, Bernard
2018-12-14 11:21       ` Zhang, Qi Z
2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
2018-12-13 19:50     ` [dpdk-dev] [EXT] " Jerin Jacob Kollanukkaran
2018-12-14 11:21     ` [dpdk-dev] " Iremonger, Bernard
2018-12-14 11:36       ` Zhang, Qi Z
2018-12-11  5:55   ` [dpdk-dev] [PATCH v2 3/3] app/testpmd: further " Qi Zhang
2018-12-16  0:58 ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Qi Zhang
2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 1/3] app/testpmd: code refactory for macswap Qi Zhang
2018-12-18 11:06     ` Iremonger, Bernard
2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 2/3] app/testpmd: improve MAC swap performance for x86 Qi Zhang
2018-12-18 11:07     ` Iremonger, Bernard
2018-12-16  0:58   ` [dpdk-dev] [PATCH v3 3/3] app/testpmd: further " Qi Zhang
2018-12-18 11:09     ` Iremonger, Bernard
2018-12-18  0:15   ` [dpdk-dev] [PATCH v3 0/3] improve MAC swap performance Ferruh Yigit
2018-12-18 16:29     ` Ferruh Yigit

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).