DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix
@ 2016-03-03 17:23 Tomasz Kulasek
  2016-03-07  6:19 ` Xu, Qian Q
  2016-03-08 12:58 ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
  0 siblings, 2 replies; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-03 17:23 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    2 ++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   12 ++----------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..207a60a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -84,6 +84,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..c8c781d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -305,7 +305,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#ifdef NO_HASH_MULTI_LOOKUP
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -552,8 +552,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..517815a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])
 {
@@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])
 {
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index 4c6d14f..7f10af4 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index e0ed3c4..8df762d 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -158,8 +158,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..130817c 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix
  2016-03-03 17:23 [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix Tomasz Kulasek
@ 2016-03-07  6:19 ` Xu, Qian Q
  2016-03-08 12:58 ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
  1 sibling, 0 replies; 25+ messages in thread
From: Xu, Qian Q @ 2016-03-07  6:19 UTC (permalink / raw)
  To: Kulasek, TomaszX, dev

Tested-by: Qian Xu <qian.q.xu@intel.com>

- Test Commit: 8f6f24342281f59de0df7bd976a32f714d39b9a9
- OS/Kernel: Fedora 21/4.1.13
- GCC: gcc (GCC) 4.9.2 20141101 (Red Hat 4.9.2-1)
- CPU: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10
- NIC: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection (rev 01)
- Target: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection (rev 01)
- Total 1 cases, 1 passed, 0 failed. 

Test Case1: test_l3fwd_exact_match_perf 
==================================
Compare two sample performance report on the same platform, same traffic, the only difference is the l3fwd sample. 
1. Using the dpdk-2.2 l3fwd sample, compiled it and run the l3fwd as below:
 ./examples/l3fwd/build/l3fwd -c 0x3c0000 -n 4 -- -p 0xf --config '(0,0,18),(1,0,19),(2,0,20),(3,0,21)' --hash-entry-num 0x400000

2. Using the latest dpdk with the updated l3fwd sample, run it: 
 ./examples/l3fwd/build/l3fwd -c 0x3c0000 -n 4 -- -E -p 0xf --config '(0,0,18),(1,0,19),(2,0,20),(3,0,21)' --hash-entry-num 0x400000

The traffic is same for both samples: 
 The traffic I send to port0 is as below:
 DEST MAC= PORT0's MAC
 Protocol: IPV4
 TCP/IP
 DEST IP: 201.0.0.0     Mode: Continuous Increment Host, Mask: 255.240.0.0---  So about 0x100000 different DEST IP items. 

 Source IP: 200.20.0.1  Mode: Fixed
 SRC PORT: 12
 DEST PORT: 102
 Packet size=64B, sending at line rate,

Test results, the performance with the fix is similar as before, without fix there will be 11% performance drop. 


Thanks
Qian


-----Original Message-----
From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tomasz Kulasek
Sent: Friday, March 04, 2016 1:23 AM
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix

It seems that for the most use cases, previous hash_multi_lookup provides better performance, and more, sequential lookup can cause significant performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used tx ports.

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    2 ++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   12 ++----------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index da6d369..207a60a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -84,6 +84,8 @@ struct lcore_rx_queue {  struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c index f6a65d8..c8c781d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -305,7 +305,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#ifdef NO_HASH_MULTI_LOOKUP
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -552,8 +552,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..517815a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])  { @@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);  }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])  { diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h index 4c6d14f..7f10af4 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in 
+Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's 
+disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global 
+define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c index e0ed3c4..8df762d 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -158,8 +158,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 0e33039..130817c 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
--
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v2] examples/l3fwd: em path performance fix
  2016-03-03 17:23 [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix Tomasz Kulasek
  2016-03-07  6:19 ` Xu, Qian Q
@ 2016-03-08 12:58 ` Tomasz Kulasek
  2016-03-11 11:14   ` Thomas Monjalon
  2016-03-11 12:10   ` [dpdk-dev] [PATCH v3] " Tomasz Kulasek
  1 sibling, 2 replies; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-08 12:58 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    2 ++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..207a60a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -84,6 +84,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..c8c781d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -305,7 +305,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#ifdef NO_HASH_MULTI_LOOKUP
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -552,8 +552,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..f0cf421 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])
 {
@@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])
 {
@@ -322,17 +314,17 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index 4c6d14f..7f10af4 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index e0ed3c4..8df762d 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -158,8 +158,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..130817c 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] examples/l3fwd: em path performance fix
  2016-03-08 12:58 ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
@ 2016-03-11 11:14   ` Thomas Monjalon
  2016-03-11 12:28     ` Kulasek, TomaszX
  2016-03-11 12:10   ` [dpdk-dev] [PATCH v3] " Tomasz Kulasek
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-11 11:14 UTC (permalink / raw)
  To: Tomasz Kulasek; +Cc: dev

There is a compilation error:
examples/l3fwd/l3fwd_em_hlm_sse.h:330:40: error:
passing argument 3 of ‘send_packets_multi’ from incompatible pointer type

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-08 12:58 ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
  2016-03-11 11:14   ` Thomas Monjalon
@ 2016-03-11 12:10   ` Tomasz Kulasek
  2016-03-11 16:23     ` Thomas Monjalon
  2016-03-18  9:36     ` [dpdk-dev] [PATCH v4] " Tomasz Kulasek
  1 sibling, 2 replies; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-11 12:10 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.

v3 changes:
 - "lpm: extend IPv4 next hop field" patch extends dst_port table from
   uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
   what causes incompatible pointer type error after turning on this header

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")
Fixes: dc81ebbacaeb ("lpm: extend IPv4 next hop field")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    2 ++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   34 +++++++++++++---------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..207a60a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -84,6 +84,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..c8c781d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -305,7 +305,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#ifdef NO_HASH_MULTI_LOOKUP
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -552,8 +552,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..328dcb8 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,19 +34,11 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+		uint8_t portid, uint32_t dst_port[8])
 {
 	int32_t ret[8];
 	union ipv4_5tuple_host key[8];
@@ -168,9 +160,9 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
-		uint8_t portid, uint16_t dst_port[8])
+		uint8_t portid, uint32_t dst_port[8])
 {
 	int32_t ret[8];
 	union ipv6_5tuple_host key[8];
@@ -292,7 +284,7 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
 	int32_t j;
-	uint16_t dst_port[MAX_PKT_BURST];
+	uint32_t dst_port[MAX_PKT_BURST];
 
 	/*
 	 * Send nb_rx - nb_rx%8 packets
@@ -322,17 +314,17 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index d4a2a2d..8bd150a 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index a354797..990a7f1 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -159,8 +159,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..130817c 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v2] examples/l3fwd: em path performance fix
  2016-03-11 11:14   ` Thomas Monjalon
@ 2016-03-11 12:28     ` Kulasek, TomaszX
  0 siblings, 0 replies; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-11 12:28 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, March 11, 2016 12:15
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v2] examples/l3fwd: em path performance fix
> 
> There is a compilation error:
> examples/l3fwd/l3fwd_em_hlm_sse.h:330:40: error:
> passing argument 3 of ‘send_packets_multi’ from incompatible pointer type

Ok, fix sent.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-11 12:10   ` [dpdk-dev] [PATCH v3] " Tomasz Kulasek
@ 2016-03-11 16:23     ` Thomas Monjalon
  2016-03-11 17:48       ` Kulasek, TomaszX
  2016-03-18  9:36     ` [dpdk-dev] [PATCH v4] " Tomasz Kulasek
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-11 16:23 UTC (permalink / raw)
  To: Tomasz Kulasek; +Cc: dev

There is an error:
examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
	incompatible type for argument 2 of ‘_mm_and_si128’

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-11 16:23     ` Thomas Monjalon
@ 2016-03-11 17:48       ` Kulasek, TomaszX
  2016-03-15 14:31         ` Kulasek, TomaszX
  0 siblings, 1 reply; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-11 17:48 UTC (permalink / raw)
  To: Thomas Monjalon, Maciej Czekaj; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Friday, March 11, 2016 17:23
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
> 
> There is an error:
> examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
> 	incompatible type for argument 2 of ‘_mm_and_si128’

It's caused by

commit 64d3955de1de4d7879a0930a6d2f501369d3445a
Author: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
Date:   Thu Mar 10 17:06:22 2016 +0100

    examples/l3fwd: fix ARM build

    Enable NEON support in exact match mode.
    l3fwd example did not compile on ARM due to SSE2 instrincics used
    in generic part.
    Some instrinsins were used to initialize data structures and those were
    replaced by ordinary structure initalization.
    All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header
    are moved to single inline function and made arch-specific.

    Signed-off-by: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>

Which doesn't include rework of l3fwd_em_hlm_sse.h file.

When you compile it now with global "#define HASH_MULTI_LOOKUP 1" and alternative classification is used, and compilation will also fail now.

I need a little bit more time to investigate it, because I'm not an expert in ARM. It will be nice if Maciej will help me in that.

Tomasz

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-11 17:48       ` Kulasek, TomaszX
@ 2016-03-15 14:31         ` Kulasek, TomaszX
  2016-03-15 14:49           ` Thomas Monjalon
  0 siblings, 1 reply; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-15 14:31 UTC (permalink / raw)
  To: Thomas Monjalon, Maciej Czekaj; +Cc: dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Kulasek, TomaszX
> Sent: Friday, March 11, 2016 18:49
> To: Thomas Monjalon <thomas.monjalon@6wind.com>; Maciej Czekaj
> <maciej.czekaj@caviumnetworks.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
> 
> 
> 
> > -----Original Message-----
> > From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> > Sent: Friday, March 11, 2016 17:23
> > To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance
> fix
> >
> > There is an error:
> > examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
> > 	incompatible type for argument 2 of ‘_mm_and_si128’
> 
> It's caused by
> 
> commit 64d3955de1de4d7879a0930a6d2f501369d3445a
> Author: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> Date:   Thu Mar 10 17:06:22 2016 +0100
> 
>     examples/l3fwd: fix ARM build
> 
>     Enable NEON support in exact match mode.
>     l3fwd example did not compile on ARM due to SSE2 instrincics used
>     in generic part.
>     Some instrinsins were used to initialize data structures and those
> were
>     replaced by ordinary structure initalization.
>     All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header
>     are moved to single inline function and made arch-specific.
> 
>     Signed-off-by: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> 
> Which doesn't include rework of l3fwd_em_hlm_sse.h file.
> 
> When you compile it now with global "#define HASH_MULTI_LOOKUP 1" and
> alternative classification is used, and compilation will also fail now.
> 
> I need a little bit more time to investigate it, because I'm not an expert
> in ARM. It will be nice if Maciej will help me in that.
> 
> Tomasz

Will be that ok for you to disable this path for arm?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-15 14:31         ` Kulasek, TomaszX
@ 2016-03-15 14:49           ` Thomas Monjalon
  2016-03-15 16:06             ` Kulasek, TomaszX
  0 siblings, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-15 14:49 UTC (permalink / raw)
  To: Kulasek, TomaszX, Maciej Czekaj; +Cc: dev

2016-03-15 14:31, Kulasek, TomaszX:
> From: Kulasek, TomaszX
> > From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> > > There is an error:
> > > examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
> > > 	incompatible type for argument 2 of ‘_mm_and_si128’
> > 
> > It's caused by
> > 
> > commit 64d3955de1de4d7879a0930a6d2f501369d3445a
> > Author: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > Date:   Thu Mar 10 17:06:22 2016 +0100
> > 
> >     examples/l3fwd: fix ARM build
> > 
> >     Enable NEON support in exact match mode.
> >     l3fwd example did not compile on ARM due to SSE2 instrincics used
> >     in generic part.
> >     Some instrinsins were used to initialize data structures and those
> > were
> >     replaced by ordinary structure initalization.
> >     All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header
> >     are moved to single inline function and made arch-specific.
> > 
> >     Signed-off-by: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > 
> > Which doesn't include rework of l3fwd_em_hlm_sse.h file.
> > 
> > When you compile it now with global "#define HASH_MULTI_LOOKUP 1" and
> > alternative classification is used, and compilation will also fail now.
> > 
> > I need a little bit more time to investigate it, because I'm not an expert
> > in ARM. It will be nice if Maciej will help me in that.
> > 
> > Tomasz
> 
> Will be that ok for you to disable this path for arm?

Please, what do you mean?
Maciej, have you looked at this issue?

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-15 14:49           ` Thomas Monjalon
@ 2016-03-15 16:06             ` Kulasek, TomaszX
  2016-03-15 19:42               ` [dpdk-dev] Odp.: " Czekaj, Maciej
  0 siblings, 1 reply; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-15 16:06 UTC (permalink / raw)
  To: Thomas Monjalon, Maciej Czekaj; +Cc: dev



> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Tuesday, March 15, 2016 15:50
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; Maciej Czekaj
> <maciej.czekaj@caviumnetworks.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
> 
> 2016-03-15 14:31, Kulasek, TomaszX:
> > From: Kulasek, TomaszX
> > > From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> > > > There is an error:
> > > > examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
> > > > 	incompatible type for argument 2 of ‘_mm_and_si128’
> > >
> > > It's caused by
> > >
> > > commit 64d3955de1de4d7879a0930a6d2f501369d3445a
> > > Author: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > > Date:   Thu Mar 10 17:06:22 2016 +0100
> > >
> > >     examples/l3fwd: fix ARM build
> > >
> > >     Enable NEON support in exact match mode.
> > >     l3fwd example did not compile on ARM due to SSE2 instrincics used
> > >     in generic part.
> > >     Some instrinsins were used to initialize data structures and
> > > those were
> > >     replaced by ordinary structure initalization.
> > >     All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP
> header
> > >     are moved to single inline function and made arch-specific.
> > >
> > >     Signed-off-by: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > >
> > > Which doesn't include rework of l3fwd_em_hlm_sse.h file.
> > >
> > > When you compile it now with global "#define HASH_MULTI_LOOKUP 1"
> > > and alternative classification is used, and compilation will also fail
> now.
> > >
> > > I need a little bit more time to investigate it, because I'm not an
> > > expert in ARM. It will be nice if Maciej will help me in that.
> > >
> > > Tomasz
> >
> > Will be that ok for you to disable this path for arm?
> 
> Please, what do you mean?
> Maciej, have you looked at this issue?

This fix uses platform specific part of code which wasn't reworked in previous patch for ARM. It causes compilation error.
What I mean, is to leave current classification path for ARM and turn on alternative only for Intel platform.

Like that:

60 +#if defined(NO_HASH_MULTI_LOOKUP) || defined(__ARM_NEON)
61  #include "l3fwd_em_sse.h"
62  #else
63  #include "l3fwd_em_hlm_sse.h"


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] Odp.: [PATCH v3] examples/l3fwd: em path performance fix
  2016-03-15 16:06             ` Kulasek, TomaszX
@ 2016-03-15 19:42               ` Czekaj, Maciej
  0 siblings, 0 replies; 25+ messages in thread
From: Czekaj, Maciej @ 2016-03-15 19:42 UTC (permalink / raw)
  To: Kulasek, TomaszX, thomas.monjalon; +Cc: dev



________________________________________
Od: Kulasek, TomaszX <tomaszx.kulasek@intel.com>
Wysłane: 15 marca 2016 17:06
Do: Thomas Monjalon; Czekaj, Maciej
DW: dev@dpdk.org
Temat: RE: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix

> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> Sent: Tuesday, March 15, 2016 15:50
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; Maciej Czekaj
> <maciej.czekaj@caviumnetworks.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] examples/l3fwd: em path performance fix
>
> 2016-03-15 14:31, Kulasek, TomaszX:
> > From: Kulasek, TomaszX
> > > From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com]
> > > > There is an error:
> > > > examples/l3fwd/l3fwd_em_hlm_sse.h:72:38: error:
> > > >         incompatible type for argument 2 of ‘_mm_and_si128’
> > >
> > > It's caused by
> > >
> > > commit 64d3955de1de4d7879a0930a6d2f501369d3445a
> > > Author: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > > Date:   Thu Mar 10 17:06:22 2016 +0100
> > >
> > >     examples/l3fwd: fix ARM build
> > >
> > >     Enable NEON support in exact match mode.
> > >     l3fwd example did not compile on ARM due to SSE2 instrincics used
> > >     in generic part.
> > >     Some instrinsins were used to initialize data structures and
> > > those were
> > >     replaced by ordinary structure initalization.
> > >     All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP
> header
> > >     are moved to single inline function and made arch-specific.
> > >
> > >     Signed-off-by: Maciej Czekaj <maciej.czekaj@caviumnetworks.com>
> > >
> > > Which doesn't include rework of l3fwd_em_hlm_sse.h file.
> > >
> > > When you compile it now with global "#define HASH_MULTI_LOOKUP 1"
> > > and alternative classification is used, and compilation will also fail
> now.
> > >
> > > I need a little bit more time to investigate it, because I'm not an
> > > expert in ARM. It will be nice if Maciej will help me in that.
> > >
> > > Tomasz
> >
> > Will be that ok for you to disable this path for arm?
>
> Please, what do you mean?
> Maciej, have you looked at this issue?

This fix uses platform specific part of code which wasn't reworked in previous patch for ARM. It causes compilation error.
What I mean, is to leave current classification path for ARM and turn on alternative only for Intel platform.

Like that:

60 +#if defined(NO_HASH_MULTI_LOOKUP) || defined(__ARM_NEON)
61  #include "l3fwd_em_sse.h"
62  #else
63  #include "l3fwd_em_hlm_sse.h"

Thanks guys for pointing this out. The issue is that after my patch mask0, mask1 and mask2 are now defined as:

static rte_xmm_t mask0;
static rte_xmm_t mask1;
static rte_xmm_t mask2;

rte_xmm_t is a union with xmm_t field inside.

Apparently, I overlooked the HASH_MULTI_LOOKUP define

I can provide a quick fix for that, I need to rename all maskN references to maskN.x, to point out to xmm_t variable. E.g. the following diff is fixing the compilation.

diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..eb23163 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -77,14 +77,14 @@ em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
                                sizeof(struct ether_hdr) +
                                offsetof(struct ipv4_hdr, time_to_live)));
 
-       key[0].xmm = _mm_and_si128(data[0], mask0);
-       key[1].xmm = _mm_and_si128(data[1], mask0);
-       key[2].xmm = _mm_and_si128(data[2], mask0);
-       key[3].xmm = _mm_and_si128(data[3], mask0);
-       key[4].xmm = _mm_and_si128(data[4], mask0);
-       key[5].xmm = _mm_and_si128(data[5], mask0);
-       key[6].xmm = _mm_and_si128(data[6], mask0);
-       key[7].xmm = _mm_and_si128(data[7], mask0);
+       key[0].xmm = _mm_and_si128(data[0], mask0.x);
+       key[1].xmm = _mm_and_si128(data[1], mask0.x);
+       key[2].xmm = _mm_and_si128(data[2], mask0.x);
+       key[3].xmm = _mm_and_si128(data[3], mask0.x);
+       key[4].xmm = _mm_and_si128(data[4], mask0.x);
+       key[5].xmm = _mm_and_si128(data[5], mask0.x);
+       key[6].xmm = _mm_and_si128(data[6], mask0.x);
+       key[7].xmm = _mm_and_si128(data[7], mask0.x);
 
        const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
                                &key[4], &key[5], &key[6], &key[7]};
@@ -175,14 +175,14 @@ em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
        int32_t ret[8];
        union ipv6_5tuple_host key[8];
 
-       get_ipv6_5tuple(m[0], mask1, mask2, &key[0]);
-       get_ipv6_5tuple(m[1], mask1, mask2, &key[1]);
-       get_ipv6_5tuple(m[2], mask1, mask2, &key[2]);
-       get_ipv6_5tuple(m[3], mask1, mask2, &key[3]);
-       get_ipv6_5tuple(m[4], mask1, mask2, &key[4]);
-       get_ipv6_5tuple(m[5], mask1, mask2, &key[5]);
-       get_ipv6_5tuple(m[6], mask1, mask2, &key[6]);
-       get_ipv6_5tuple(m[7], mask1, mask2, &key[7]);
+       get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+       get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+       get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+       get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+       get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+       get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+       get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+       get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
 
        const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
                        &key[4], &key[5], &key[6], &key[7]};

Would you like me to re-post the patch?

Thanks
Maciej

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v4] examples/l3fwd: em path performance fix
  2016-03-11 12:10   ` [dpdk-dev] [PATCH v3] " Tomasz Kulasek
  2016-03-11 16:23     ` Thomas Monjalon
@ 2016-03-18  9:36     ` Tomasz Kulasek
  2016-03-18  9:43       ` Kulasek, TomaszX
  2016-03-18  9:52       ` [dpdk-dev] [PATCH v5] " Tomasz Kulasek
  1 sibling, 2 replies; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-18  9:36 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.


This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
compilation with HASH_MULTI_LOOKUP"


v4 changes:
 - rebased to be applicable after patch "l3fwd: Fix compilation with
   HASH_MULTI_LOOKUP" of Maciej Czekaj

v3 changes:
 - "lpm: extend IPv4 next hop field" patch extends dst_port table from
   uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
   what causes incompatible pointer type error after turning on this header

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    8 ++++++++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 7dcc7e5..69dcc17 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,6 +40,12 @@
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
+#define __ARM_NEON 1
+
+#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
+#define NO_HASH_MULTI_LOOKUP 1
+#endif
+
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
@@ -86,6 +92,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..5a2e7ff 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -320,7 +320,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -568,8 +568,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 891ae2e..7faf04a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -322,17 +314,17 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index d4a2a2d..8bd150a 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index a354797..990a7f1 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -159,8 +159,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 8520f71..792894f 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v4] examples/l3fwd: em path performance fix
  2016-03-18  9:36     ` [dpdk-dev] [PATCH v4] " Tomasz Kulasek
@ 2016-03-18  9:43       ` Kulasek, TomaszX
  2016-03-18  9:52       ` [dpdk-dev] [PATCH v5] " Tomasz Kulasek
  1 sibling, 0 replies; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-18  9:43 UTC (permalink / raw)
  To: Kulasek, TomaszX, dev



> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tomasz Kulasek
> Sent: Friday, March 18, 2016 10:37
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v4] examples/l3fwd: em path performance fix
> 
> It seems that for the most use cases, previous hash_multi_lookup provides
> better performance, and more, sequential lookup can cause significant
> performance drop.
> 
> This patch sets previously optional hash_multi_lookup method as default.
> It also provides some minor optimizations such as queue drain only on used
> tx ports.
> 
> 
> This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
> compilation with HASH_MULTI_LOOKUP"
> 
> 
> v4 changes:
>  - rebased to be applicable after patch "l3fwd: Fix compilation with
>    HASH_MULTI_LOOKUP" of Maciej Czekaj
> 
> v3 changes:
>  - "lpm: extend IPv4 next hop field" patch extends dst_port table from
>    uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
>    what causes incompatible pointer type error after turning on this
> header
> 
> v2 changes:
>  - fixed copy-paste error causing that not all packets are classified
> right
>    in hash_multi_lookup implementation when burst size is not divisible
>    by 8
> 
> Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")
> 
> Reported-by: Qian Xu <qian.q.xu@intel.com>
> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
> ---
>  examples/l3fwd/l3fwd.h            |    8 ++++++++
>  examples/l3fwd/l3fwd_em.c         |    6 +++---
>  examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
>  examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
>  examples/l3fwd/l3fwd_lpm.c        |    4 ++--
>  examples/l3fwd/main.c             |    7 +++++++
>  6 files changed, 39 insertions(+), 23 deletions(-)
> 

Self NACK - I forgot to remove debug options

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18  9:36     ` [dpdk-dev] [PATCH v4] " Tomasz Kulasek
  2016-03-18  9:43       ` Kulasek, TomaszX
@ 2016-03-18  9:52       ` Tomasz Kulasek
  2016-03-18 10:04         ` Thomas Monjalon
  2016-03-18 13:31         ` [dpdk-dev] [PATCH v6] " Tomasz Kulasek
  1 sibling, 2 replies; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-18  9:52 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.


This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
compilation with HASH_MULTI_LOOKUP"


v5 changes:
 - removed debug informations, patch cleanup

v4 changes:
 - rebased to be applicable after patch "l3fwd: Fix compilation with
   HASH_MULTI_LOOKUP" of Maciej Czekaj

v3 changes:
 - "lpm: extend IPv4 next hop field" patch extends dst_port table from
   uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
   what causes incompatible pointer type error after turning on this header

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")
Fixes: dc81ebbacaeb ("lpm: extend IPv4 next hop field")
Fixes: 64d3955de1de ("examples/l3fwd: fix ARM build")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    6 ++++++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 7dcc7e5..1b148e7 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,6 +40,10 @@
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
+#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
+#define NO_HASH_MULTI_LOOKUP 1
+#endif
+
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
@@ -86,6 +90,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..5a2e7ff 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -320,7 +320,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -568,8 +568,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 891ae2e..7faf04a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -322,17 +314,17 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index d4a2a2d..8bd150a 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index a354797..990a7f1 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -159,8 +159,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 8520f71..792894f 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18  9:52       ` [dpdk-dev] [PATCH v5] " Tomasz Kulasek
@ 2016-03-18 10:04         ` Thomas Monjalon
  2016-03-18 10:52           ` Jerin Jacob
  2016-03-18 13:31         ` [dpdk-dev] [PATCH v6] " Tomasz Kulasek
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-18 10:04 UTC (permalink / raw)
  To: Tomasz Kulasek; +Cc: dev, Jan Viktorin, jianbo.liu, jerin.jacob

2016-03-18 10:52, Tomasz Kulasek:
> +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)

I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
Any ARM maintainer to confirm?

Note that there is already another occurence of this compiler flag:
examples/l3fwd/l3fwd_em.c:#elif defined(__ARM_NEON)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18 10:04         ` Thomas Monjalon
@ 2016-03-18 10:52           ` Jerin Jacob
  2016-03-18 11:00             ` Thomas Monjalon
  0 siblings, 1 reply; 25+ messages in thread
From: Jerin Jacob @ 2016-03-18 10:52 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Tomasz Kulasek, dev, Jan Viktorin, jianbo.liu

On Fri, Mar 18, 2016 at 11:04:49AM +0100, Thomas Monjalon wrote:
> 2016-03-18 10:52, Tomasz Kulasek:
> > +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
> 
> I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
> Any ARM maintainer to confirm?

__ARM_NEON should work existing GCC, but it is better to use
RTE_MACHINE_CPUFLAG_NEON as
-it has been generated by probing the compiler capabilities.
-it's future-proof solution to support clang or other gcc versions in
future

➜ [master]laptop [dpdk-master] $ aarch64-thunderx-linux-gnu-gcc -dM -E - < /dev/null  | grep NEON
#define __ARM_NEON_FP 12
#define __ARM_NEON 1

make output
aarch64-thunderx-linux-gnu-gcc -Wp,-MD,./.eal.o.d.tmp  -pthread
-march=armv8-a+crc -mcpu=thunderx -DRTE_MACHINE_CPUFLAG_NEON
-DRTE_MACHINE_CPUFLAG_CRC32
-DRTE_COMPILE_TIME_CPUFLAGS=RTE_CPUFLAG_NEON,RTE_CPUFLAG_CRC32
-I/export/dpdk-master/build/include -include
/export/dpdk-master/build/include/rte_config.h
-I/export/dpdk-master/lib/librte_eal/linuxapp/eal/include
-I/export/dpdk-master/lib/librte_eal/common
-I/export/dpdk-master/lib/librte_eal/common/include
-I/export/dpdk-master/lib/librte_ring
-I/export/dpdk-master/lib/librte_mempool
-I/export/dpdk-master/lib/librte_ivshmem -W -Wall -Wstrict-prototypes
-Wmissing-prototypes -Wmissing-declarations -Wold-style-definition
-Wpointer-arith -Wcast-align -Wnested-externs -Wcast-qual
-Wformat-nonliteral -Wformat-security -Wundef -Wwrite-strings -Werror
-O3 -D_GNU_SOURCE  -o eal.o -c
/export/dpdk-master/lib/librte_eal/linuxapp/eal/eal.c 

Jerin

> 
> Note that there is already another occurence of this compiler flag:
> examples/l3fwd/l3fwd_em.c:#elif defined(__ARM_NEON)

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18 10:52           ` Jerin Jacob
@ 2016-03-18 11:00             ` Thomas Monjalon
  2016-03-18 11:16               ` [dpdk-dev] [PATCH] examples/l3fwd: prefer probed NEON flag to ARM gcc flag Thomas Monjalon
  2016-03-18 11:56               ` [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix Jan Viktorin
  0 siblings, 2 replies; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-18 11:00 UTC (permalink / raw)
  To: Jerin Jacob, Tomasz Kulasek, Jan Viktorin; +Cc: dev, jianbo.liu

2016-03-18 16:22, Jerin Jacob:
> On Fri, Mar 18, 2016 at 11:04:49AM +0100, Thomas Monjalon wrote:
> > 2016-03-18 10:52, Tomasz Kulasek:
> > > +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
> > 
> > I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
> > Any ARM maintainer to confirm?
> 
> __ARM_NEON should work existing GCC, but it is better to use
> RTE_MACHINE_CPUFLAG_NEON as
> -it has been generated by probing the compiler capabilities.
> -it's future-proof solution to support clang or other gcc versions in
> future

I agree to use RTE_MACHINE_CPUFLAG_NEON.

I just don't understand why CONFIG_RTE_ARCH_ARM_NEON has been introduced.
It seems to be used to disable NEON on ARMv7:
	ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)                                                                             
	MACHINE_CFLAGS += -mfpu=neon
	endif

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH] examples/l3fwd: prefer probed NEON flag to ARM gcc flag
  2016-03-18 11:00             ` Thomas Monjalon
@ 2016-03-18 11:16               ` Thomas Monjalon
  2016-03-18 11:20                 ` [dpdk-dev] [PATCH v2] " Thomas Monjalon
  2016-03-18 11:56               ` [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix Jan Viktorin
  1 sibling, 1 reply; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-18 11:16 UTC (permalink / raw)
  To: maciej.czekaj, jerin.jacob; +Cc: dev

Fixes: 64d3955de1de ("examples/l3fwd: fix ARM build")

Suggested-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
 examples/l3fwd/l3fwd_em.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..e8e7ca1 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -250,7 +250,7 @@ em_mask_key(void *key, xmm_t mask)
 
 	return _mm_and_si128(data, mask);
 }
-#elif defined(__ARM_NEON)
+#elif defined(RTE_CPUFLAG_NEON)
 static inline xmm_t
 em_mask_key(void *key, xmm_t mask)
 {
-- 
2.7.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v2] examples/l3fwd: prefer probed NEON flag to ARM gcc flag
  2016-03-18 11:16               ` [dpdk-dev] [PATCH] examples/l3fwd: prefer probed NEON flag to ARM gcc flag Thomas Monjalon
@ 2016-03-18 11:20                 ` Thomas Monjalon
  0 siblings, 0 replies; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-18 11:20 UTC (permalink / raw)
  To: maciej.czekaj, jerin.jacob; +Cc: dev

Fixes: 64d3955de1de ("examples/l3fwd: fix ARM build")

Suggested-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Signed-off-by: Thomas Monjalon <thomas.monjalon@6wind.com>
---
 examples/l3fwd/l3fwd_em.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

v2: use RTE_MACHINE_CPUFLAG_NEON instead of (always defined) RTE_CPUFLAG_NEON

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..4983eed 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -250,7 +250,7 @@ em_mask_key(void *key, xmm_t mask)
 
 	return _mm_and_si128(data, mask);
 }
-#elif defined(__ARM_NEON)
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
 static inline xmm_t
 em_mask_key(void *key, xmm_t mask)
 {
-- 
2.7.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18 11:00             ` Thomas Monjalon
  2016-03-18 11:16               ` [dpdk-dev] [PATCH] examples/l3fwd: prefer probed NEON flag to ARM gcc flag Thomas Monjalon
@ 2016-03-18 11:56               ` Jan Viktorin
  2016-03-18 12:45                 ` Kulasek, TomaszX
  1 sibling, 1 reply; 25+ messages in thread
From: Jan Viktorin @ 2016-03-18 11:56 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: Jerin Jacob, Tomasz Kulasek, dev, jianbo.liu

Hello Thomas, Jerin, Tomasz, all...

On Fri, 18 Mar 2016 12:00:24 +0100
Thomas Monjalon <thomas.monjalon@6wind.com> wrote:

> 2016-03-18 16:22, Jerin Jacob:
> > On Fri, Mar 18, 2016 at 11:04:49AM +0100, Thomas Monjalon wrote:  
> > > 2016-03-18 10:52, Tomasz Kulasek:  
> > > > +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)  
> > > 
> > > I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
> > > Any ARM maintainer to confirm?  
> > 
> > __ARM_NEON should work existing GCC, but it is better to use
> > RTE_MACHINE_CPUFLAG_NEON as
> > -it has been generated by probing the compiler capabilities.
> > -it's future-proof solution to support clang or other gcc versions in
> > future  
> 
> I agree to use RTE_MACHINE_CPUFLAG_NEON.
> 
> I just don't understand why CONFIG_RTE_ARCH_ARM_NEON has been introduced.
> It seems to be used to disable NEON on ARMv7:

This is true. You should be able to disable the NEON-specific code if it
is unwanted. Eg., the memcpy operations are not always faster with NEON.
But...

$ git grep ARM_NEON
...
lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:45:#ifdef __ARM_NEON_FP
lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:328:#endif /* __ARM_NEON_FP */
...

From this point of view, this is wrong and should be fixed to check
a different constant.

> 	ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)                                                                             
> 	MACHINE_CFLAGS += -mfpu=neon
> 	endif

However, there is another possible way of understanding these options.
We can (well, unlikely and I am about to say 'never') have an ARM
processor without NEON. This cannot be detected by gcc as it does not
know the target processor... So from my point of view:

* CONFIG_RTE_ARCH_ARM_NEON says "my CPU does (not) support NEON" or "I
  want to enable/disable NEON" while
* RTE_MACHINE_CPUFLAG_NEON says, the _compiler_ supports NEON

I'll send a patch trying to solve this.

Regards
Jan

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18 11:56               ` [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix Jan Viktorin
@ 2016-03-18 12:45                 ` Kulasek, TomaszX
  2016-03-18 12:50                   ` Jan Viktorin
  0 siblings, 1 reply; 25+ messages in thread
From: Kulasek, TomaszX @ 2016-03-18 12:45 UTC (permalink / raw)
  To: Jan Viktorin, Thomas Monjalon; +Cc: Jerin Jacob, dev, jianbo.liu



> -----Original Message-----
> From: Jan Viktorin [mailto:viktorin@rehivetech.com]
> Sent: Friday, March 18, 2016 12:57
> To: Thomas Monjalon <thomas.monjalon@6wind.com>
> Cc: Jerin Jacob <jerin.jacob@caviumnetworks.com>; Kulasek, TomaszX
> <tomaszx.kulasek@intel.com>; dev@dpdk.org; jianbo.liu@linaro.org
> Subject: Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
> 
> Hello Thomas, Jerin, Tomasz, all...
> 
> On Fri, 18 Mar 2016 12:00:24 +0100
> Thomas Monjalon <thomas.monjalon@6wind.com> wrote:
> 
> > 2016-03-18 16:22, Jerin Jacob:
> > > On Fri, Mar 18, 2016 at 11:04:49AM +0100, Thomas Monjalon wrote:
> > > > 2016-03-18 10:52, Tomasz Kulasek:
> > > > > +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
> > > >
> > > > I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
> > > > Any ARM maintainer to confirm?
> > >
> > > __ARM_NEON should work existing GCC, but it is better to use
> > > RTE_MACHINE_CPUFLAG_NEON as -it has been generated by probing the
> > > compiler capabilities.
> > > -it's future-proof solution to support clang or other gcc versions
> > > in future
> >
> > I agree to use RTE_MACHINE_CPUFLAG_NEON.
> >
> > I just don't understand why CONFIG_RTE_ARCH_ARM_NEON has been
> introduced.
> > It seems to be used to disable NEON on ARMv7:
> 
> This is true. You should be able to disable the NEON-specific code if it
> is unwanted. Eg., the memcpy operations are not always faster with NEON.
> But...
> 
> $ git grep ARM_NEON
> ...
> lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:45:#ifdef
> __ARM_NEON_FP
> lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:328:#endif /*
> __ARM_NEON_FP */ ...
> 
> From this point of view, this is wrong and should be fixed to check a
> different constant.
> 
> > 	ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
> > 	MACHINE_CFLAGS += -mfpu=neon
> > 	endif
> 
> However, there is another possible way of understanding these options.
> We can (well, unlikely and I am about to say 'never') have an ARM
> processor without NEON. This cannot be detected by gcc as it does not know
> the target processor... So from my point of view:
> 
> * CONFIG_RTE_ARCH_ARM_NEON says "my CPU does (not) support NEON" or "I
>   want to enable/disable NEON" while
> * RTE_MACHINE_CPUFLAG_NEON says, the _compiler_ supports NEON
> 
> I'll send a patch trying to solve this.
> 
> Regards
> Jan

Hi

As I understand with your last patch it's safe and preferred to use RTE_MACHINE_CPUFLAG_NEON for ARM Neon detection? If so, I can include this modification for whole l3fwd in v6 of this patch.

Tomasz.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
  2016-03-18 12:45                 ` Kulasek, TomaszX
@ 2016-03-18 12:50                   ` Jan Viktorin
  0 siblings, 0 replies; 25+ messages in thread
From: Jan Viktorin @ 2016-03-18 12:50 UTC (permalink / raw)
  To: Kulasek, TomaszX; +Cc: Thomas Monjalon, Jerin Jacob, dev, jianbo.liu

On Fri, 18 Mar 2016 12:45:03 +0000
"Kulasek, TomaszX" <tomaszx.kulasek@intel.com> wrote:

> > -----Original Message-----
> > From: Jan Viktorin [mailto:viktorin@rehivetech.com]
> > Sent: Friday, March 18, 2016 12:57
> > To: Thomas Monjalon <thomas.monjalon@6wind.com>
> > Cc: Jerin Jacob <jerin.jacob@caviumnetworks.com>; Kulasek, TomaszX
> > <tomaszx.kulasek@intel.com>; dev@dpdk.org; jianbo.liu@linaro.org
> > Subject: Re: [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix
> > 
> > Hello Thomas, Jerin, Tomasz, all...
> > 
> > On Fri, 18 Mar 2016 12:00:24 +0100
> > Thomas Monjalon <thomas.monjalon@6wind.com> wrote:
> >   
> > > 2016-03-18 16:22, Jerin Jacob:  
> > > > On Fri, Mar 18, 2016 at 11:04:49AM +0100, Thomas Monjalon wrote:  
> > > > > 2016-03-18 10:52, Tomasz Kulasek:  
> > > > > > +#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)  
> > > > >
> > > > > I think we should use CONFIG_RTE_ARCH_ARM_NEON here.
> > > > > Any ARM maintainer to confirm?  
> > > >
> > > > __ARM_NEON should work existing GCC, but it is better to use
> > > > RTE_MACHINE_CPUFLAG_NEON as -it has been generated by probing the
> > > > compiler capabilities.
> > > > -it's future-proof solution to support clang or other gcc versions
> > > > in future  
> > >
> > > I agree to use RTE_MACHINE_CPUFLAG_NEON.
> > >
> > > I just don't understand why CONFIG_RTE_ARCH_ARM_NEON has been  
> > introduced.  
> > > It seems to be used to disable NEON on ARMv7:  
> > 
> > This is true. You should be able to disable the NEON-specific code if it
> > is unwanted. Eg., the memcpy operations are not always faster with NEON.
> > But...
> > 
> > $ git grep ARM_NEON
> > ...
> > lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:45:#ifdef
> > __ARM_NEON_FP
> > lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h:328:#endif /*
> > __ARM_NEON_FP */ ...
> > 
> > From this point of view, this is wrong and should be fixed to check a
> > different constant.
> >   
> > > 	ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
> > > 	MACHINE_CFLAGS += -mfpu=neon
> > > 	endif  
> > 
> > However, there is another possible way of understanding these options.
> > We can (well, unlikely and I am about to say 'never') have an ARM
> > processor without NEON. This cannot be detected by gcc as it does not know
> > the target processor... So from my point of view:
> > 
> > * CONFIG_RTE_ARCH_ARM_NEON says "my CPU does (not) support NEON" or "I
> >   want to enable/disable NEON" while
> > * RTE_MACHINE_CPUFLAG_NEON says, the _compiler_ supports NEON
> > 
> > I'll send a patch trying to solve this.
> > 
> > Regards
> > Jan  
> 
> Hi
> 
> As I understand with your last patch it's safe and preferred to use RTE_MACHINE_CPUFLAG_NEON for ARM Neon detection? If so, I can include this modification for whole l3fwd in v6 of this patch.

Yes, I'd prefer this approach as well.

Jan

> 
> Tomasz.



-- 
   Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
   System Architect              Web:    www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [dpdk-dev] [PATCH v6] examples/l3fwd: em path performance fix
  2016-03-18  9:52       ` [dpdk-dev] [PATCH v5] " Tomasz Kulasek
  2016-03-18 10:04         ` Thomas Monjalon
@ 2016-03-18 13:31         ` Tomasz Kulasek
  2016-03-21 11:57           ` Thomas Monjalon
  1 sibling, 1 reply; 25+ messages in thread
From: Tomasz Kulasek @ 2016-03-18 13:31 UTC (permalink / raw)
  To: dev

It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.


This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
compilation with HASH_MULTI_LOOKUP"


v6 changes:
 - use RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON for ARM Neon
   detection

v5 changes:
 - removed debug informations, patch cleanup

v4 changes:
 - rebased to be applicable after patch "l3fwd: Fix compilation with
   HASH_MULTI_LOOKUP" of Maciej Czekaj

v3 changes:
 - "lpm: extend IPv4 next hop field" patch extends dst_port table from
   uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
   what causes incompatible pointer type error after turning on this header

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")
Fixes: dc81ebbacaeb ("lpm: extend IPv4 next hop field")
Fixes: 64d3955de1de ("examples/l3fwd: fix ARM build")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    6 ++++++
 examples/l3fwd/l3fwd_em.c         |    8 ++++----
 examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 7dcc7e5..726e8cc 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,6 +40,10 @@
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
+#if !defined(NO_HASH_MULTI_LOOKUP) && defined(RTE_MACHINE_CPUFLAG_NEON)
+#define NO_HASH_MULTI_LOOKUP 1
+#endif
+
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
@@ -86,6 +90,8 @@ struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..50f09e5 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -250,7 +250,7 @@ em_mask_key(void *key, xmm_t mask)
 
 	return _mm_and_si128(data, mask);
 }
-#elif defined(__ARM_NEON)
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
 static inline xmm_t
 em_mask_key(void *key, xmm_t mask)
 {
@@ -320,7 +320,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -568,8 +568,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 891ae2e..7faf04a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -322,17 +314,17 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index d4a2a2d..8bd150a 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index a354797..990a7f1 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -159,8 +159,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 8520f71..792894f 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [dpdk-dev] [PATCH v6] examples/l3fwd: em path performance fix
  2016-03-18 13:31         ` [dpdk-dev] [PATCH v6] " Tomasz Kulasek
@ 2016-03-21 11:57           ` Thomas Monjalon
  0 siblings, 0 replies; 25+ messages in thread
From: Thomas Monjalon @ 2016-03-21 11:57 UTC (permalink / raw)
  To: Tomasz Kulasek; +Cc: dev

2016-03-18 14:31, Tomasz Kulasek:
> It seems that for the most use cases, previous hash_multi_lookup provides
> better performance, and more, sequential lookup can cause significant
> performance drop.
> 
> This patch sets previously optional hash_multi_lookup method as default.
> It also provides some minor optimizations such as queue drain only on used
> tx ports.

Applied, thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2016-03-21 11:58 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-03 17:23 [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix Tomasz Kulasek
2016-03-07  6:19 ` Xu, Qian Q
2016-03-08 12:58 ` [dpdk-dev] [PATCH v2] " Tomasz Kulasek
2016-03-11 11:14   ` Thomas Monjalon
2016-03-11 12:28     ` Kulasek, TomaszX
2016-03-11 12:10   ` [dpdk-dev] [PATCH v3] " Tomasz Kulasek
2016-03-11 16:23     ` Thomas Monjalon
2016-03-11 17:48       ` Kulasek, TomaszX
2016-03-15 14:31         ` Kulasek, TomaszX
2016-03-15 14:49           ` Thomas Monjalon
2016-03-15 16:06             ` Kulasek, TomaszX
2016-03-15 19:42               ` [dpdk-dev] Odp.: " Czekaj, Maciej
2016-03-18  9:36     ` [dpdk-dev] [PATCH v4] " Tomasz Kulasek
2016-03-18  9:43       ` Kulasek, TomaszX
2016-03-18  9:52       ` [dpdk-dev] [PATCH v5] " Tomasz Kulasek
2016-03-18 10:04         ` Thomas Monjalon
2016-03-18 10:52           ` Jerin Jacob
2016-03-18 11:00             ` Thomas Monjalon
2016-03-18 11:16               ` [dpdk-dev] [PATCH] examples/l3fwd: prefer probed NEON flag to ARM gcc flag Thomas Monjalon
2016-03-18 11:20                 ` [dpdk-dev] [PATCH v2] " Thomas Monjalon
2016-03-18 11:56               ` [dpdk-dev] [PATCH v5] examples/l3fwd: em path performance fix Jan Viktorin
2016-03-18 12:45                 ` Kulasek, TomaszX
2016-03-18 12:50                   ` Jan Viktorin
2016-03-18 13:31         ` [dpdk-dev] [PATCH v6] " Tomasz Kulasek
2016-03-21 11:57           ` Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).