DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM
@ 2016-03-10 16:06 Maciej.Czekaj
  2016-03-10 16:06 ` Maciej.Czekaj
  2016-03-15 20:05 ` [dpdk-dev] [PATCH v2] " Maciej.Czekaj
  0 siblings, 2 replies; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-10 16:06 UTC (permalink / raw)
  To: dev

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

This patch depends on the following pending patches:
lpm: add support for NEON
http://dpdk.org/dev/patchwork/patch/10479/
lpm: make rte_lpm_lookupx4 API definition architecture agnostic
http://dpdk.org/dev/patchwork/patch/10478/

Maciej Czekaj (1):
  l3fwd: Fix compilation & enable exact match mode on ARM.

 examples/l3fwd/l3fwd.h    |  4 ++-
 examples/l3fwd/l3fwd_em.c | 72 +++++++++++++++++++++++++++++------------------
 examples/l3fwd/main.c     |  2 +-
 3 files changed, 48 insertions(+), 30 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-10 16:06 [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM Maciej.Czekaj
@ 2016-03-10 16:06 ` Maciej.Czekaj
  2016-03-11 15:16   ` Thomas Monjalon
  2016-03-15 20:05 ` [dpdk-dev] [PATCH v2] " Maciej.Czekaj
  1 sibling, 1 reply; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-10 16:06 UTC (permalink / raw)
  To: dev

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

Enable NEON support in exact match mode.
The l3fwd example did not compile on ARM due to SSE2 intrinsics used
in the generic part.
Some intrinsics were used to initialize data structures, and those were
replaced by ordinary structure initialization.
All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header,
are moved to a single inline function and made arch-specific.

Signed-off-by: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>
---
 examples/l3fwd/l3fwd.h    |  4 ++-
 examples/l3fwd/l3fwd_em.c | 72 +++++++++++++++++++++++++++++------------------
 examples/l3fwd/main.c     |  2 +-
 3 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..7dcc7e5 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -34,6 +34,8 @@
 #ifndef __L3_FWD_H__
 #define __L3_FWD_H__
 
+#include <rte_vect.h>
+
 #define DO_RFC_1812_CHECKS
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
@@ -103,7 +105,7 @@ extern uint32_t enabled_port_mask;
 extern int ipv6; /**< ipv6 is false by default. */
 extern uint32_t hash_entry_number;
 
-extern __m128i val_eth[RTE_MAX_ETHPORTS];
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 extern struct lcore_conf lcore_conf[RTE_MAX_LCORE];
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..0adf8f4 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -85,7 +85,7 @@ union ipv4_5tuple_host {
 		uint16_t port_src;
 		uint16_t port_dst;
 	};
-	__m128i xmm;
+	xmm_t xmm;
 };
 
 #define XMM_NUM_IN_IPV6_5TUPLE 3
@@ -109,9 +109,11 @@ union ipv6_5tuple_host {
 		uint16_t port_dst;
 		uint64_t reserve;
 	};
-	__m128i xmm[XMM_NUM_IN_IPV6_5TUPLE];
+	xmm_t xmm[XMM_NUM_IN_IPV6_5TUPLE];
 };
 
+
+
 struct ipv4_l3fwd_em_route {
 	struct ipv4_5tuple key;
 	uint8_t if_out;
@@ -236,9 +238,27 @@ ipv6_hash_crc(const void *data, __rte_unused uint32_t data_len,
 static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 
-static __m128i mask0;
-static __m128i mask1;
-static __m128i mask2;
+static rte_xmm_t mask0;
+static rte_xmm_t mask1;
+static rte_xmm_t mask2;
+
+#if defined(__SSE2__)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	__m128i data = _mm_loadu_si128((__m128i *)(key));
+
+	return _mm_and_si128(data, mask);
+}
+#elif defined(__ARM_NEON)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	int32x4_t data = vld1q_s32((int32_t *)key);
+
+	return vandq_s32(data, mask);
+}
+#endif
 
 static inline uint8_t
 em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
@@ -249,13 +269,12 @@ em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv4_hdr = (uint8_t *)ipv4_hdr + offsetof(struct ipv4_hdr, time_to_live);
-	__m128i data = _mm_loadu_si128((__m128i *)(ipv4_hdr));
 
 	/*
 	 * Get 5 tuple: dst port, src port, dst IP address,
 	 * src IP address and protocol.
 	 */
-	key.xmm = _mm_and_si128(data, mask0);
+	key.xmm = em_mask_key(ipv4_hdr, mask0.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
@@ -271,35 +290,31 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv6_hdr = (uint8_t *)ipv6_hdr + offsetof(struct ipv6_hdr, payload_len);
-	__m128i data0 =
-		_mm_loadu_si128((__m128i *)(ipv6_hdr));
-	__m128i data1 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)));
-	__m128i data2 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)+sizeof(__m128i)));
+	void *data0 = ipv6_hdr;
+	void *data1 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t);
+	void *data2 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t) + sizeof(xmm_t);
 
 	/* Get part of 5 tuple: src IP address lower 96 bits and protocol */
-	key.xmm[0] = _mm_and_si128(data0, mask1);
+	key.xmm[0] = em_mask_key(data0, mask1.x);
 
 	/*
 	 * Get part of 5 tuple: dst IP address lower 96 bits
 	 * and src IP address higher 32 bits.
 	 */
-	key.xmm[1] = data1;
+	key.xmm[1] = *(xmm_t *)data1;
 
 	/*
 	 * Get part of 5 tuple: dst port and src port
 	 * and dst IP address higher 32 bits.
 	 */
-	key.xmm[2] = _mm_and_si128(data2, mask2);
+	key.xmm[2] = em_mask_key(data2, mask2.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
+
 /*
  * Include header file if SSE4_1 is enabled for
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
@@ -348,14 +363,15 @@ convert_ipv6_5tuple(struct ipv6_5tuple *key1,
 #define BYTE_VALUE_MAX 256
 #define ALL_32_BITS 0xffffffff
 #define BIT_8_TO_15 0x0000ff00
+
 static inline void
 populate_ipv4_few_flow_into_table(const struct rte_hash *h)
 {
 	uint32_t i;
 	int32_t ret;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < IPV4_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv4_l3fwd_em_route  entry;
@@ -381,10 +397,10 @@ populate_ipv6_few_flow_into_table(const struct rte_hash *h)
 	uint32_t i;
 	int32_t ret;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < IPV6_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv6_l3fwd_em_route entry;
@@ -410,8 +426,8 @@ populate_ipv4_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv4_l3fwd_em_route entry;
@@ -462,9 +478,9 @@ populate_ipv6_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv6_l3fwd_em_route entry;
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..8520f71 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -112,7 +112,7 @@ volatile bool force_quit;
 uint64_t dest_eth_addr[RTE_MAX_ETHPORTS];
 struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
-__m128i val_eth[RTE_MAX_ETHPORTS];
+xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 /* mask of enabled ports */
 uint32_t enabled_port_mask;
-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-10 16:06 ` Maciej.Czekaj
@ 2016-03-11 15:16   ` Thomas Monjalon
  0 siblings, 0 replies; 9+ messages in thread
From: Thomas Monjalon @ 2016-03-11 15:16 UTC (permalink / raw)
  To: Maciej.Czekaj; +Cc: dev

2016-03-10 17:06, Maciej.Czekaj@caviumnetworks.com:
> From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>
> 
> Enable NEON support in exact match mode.
> The l3fwd example did not compile on ARM due to SSE2 intrinsics used
> in the generic part.
> Some intrinsics were used to initialize data structures, and those were
> replaced by ordinary structure initialization.
> All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header,
> are moved to a single inline function and made arch-specific.
> 
> Signed-off-by: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

Applied, thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-10 16:06 [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM Maciej.Czekaj
  2016-03-10 16:06 ` Maciej.Czekaj
@ 2016-03-15 20:05 ` Maciej.Czekaj
  2016-03-15 20:05   ` Maciej.Czekaj
  2016-03-15 20:49   ` Thomas Monjalon
  1 sibling, 2 replies; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-15 20:05 UTC (permalink / raw)
  To: dev; +Cc: tomaszx.kulasek, thomas.monjalon, Maciej Czekaj

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

v2:
 * Fixed compilation issue with HASH_MULTI_LOOKUP

Maciej Czekaj (1):
  l3fwd: Fix compilation & enable exact match mode on ARM.

 examples/l3fwd/l3fwd.h            |  4 ++-
 examples/l3fwd/l3fwd_em.c         | 72 ++++++++++++++++++++++++---------------
 examples/l3fwd/l3fwd_em_hlm_sse.h | 32 ++++++++---------
 examples/l3fwd/main.c             |  2 +-
 4 files changed, 64 insertions(+), 46 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH v2] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-15 20:05 ` [dpdk-dev] [PATCH v2] " Maciej.Czekaj
@ 2016-03-15 20:05   ` Maciej.Czekaj
  2016-03-15 20:49   ` Thomas Monjalon
  1 sibling, 0 replies; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-15 20:05 UTC (permalink / raw)
  To: dev; +Cc: tomaszx.kulasek, thomas.monjalon, Maciej Czekaj

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

Enable NEON support in exact match mode.
The l3fwd example did not compile on ARM due to SSE2 intrinsics used
in the generic part.
Some intrinsics were used to initialize data structures, and those were
replaced by ordinary structure initialization.
All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header,
are moved to a single inline function and made arch-specific.

Signed-off-by: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>
---
 examples/l3fwd/l3fwd.h            |  4 ++-
 examples/l3fwd/l3fwd_em.c         | 72 ++++++++++++++++++++++++---------------
 examples/l3fwd/l3fwd_em_hlm_sse.h | 32 ++++++++---------
 examples/l3fwd/main.c             |  2 +-
 4 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..7dcc7e5 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -34,6 +34,8 @@
 #ifndef __L3_FWD_H__
 #define __L3_FWD_H__
 
+#include <rte_vect.h>
+
 #define DO_RFC_1812_CHECKS
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
@@ -103,7 +105,7 @@ extern uint32_t enabled_port_mask;
 extern int ipv6; /**< ipv6 is false by default. */
 extern uint32_t hash_entry_number;
 
-extern __m128i val_eth[RTE_MAX_ETHPORTS];
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 extern struct lcore_conf lcore_conf[RTE_MAX_LCORE];
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..0adf8f4 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -85,7 +85,7 @@ union ipv4_5tuple_host {
 		uint16_t port_src;
 		uint16_t port_dst;
 	};
-	__m128i xmm;
+	xmm_t xmm;
 };
 
 #define XMM_NUM_IN_IPV6_5TUPLE 3
@@ -109,9 +109,11 @@ union ipv6_5tuple_host {
 		uint16_t port_dst;
 		uint64_t reserve;
 	};
-	__m128i xmm[XMM_NUM_IN_IPV6_5TUPLE];
+	xmm_t xmm[XMM_NUM_IN_IPV6_5TUPLE];
 };
 
+
+
 struct ipv4_l3fwd_em_route {
 	struct ipv4_5tuple key;
 	uint8_t if_out;
@@ -236,9 +238,27 @@ ipv6_hash_crc(const void *data, __rte_unused uint32_t data_len,
 static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 
-static __m128i mask0;
-static __m128i mask1;
-static __m128i mask2;
+static rte_xmm_t mask0;
+static rte_xmm_t mask1;
+static rte_xmm_t mask2;
+
+#if defined(__SSE2__)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	__m128i data = _mm_loadu_si128((__m128i *)(key));
+
+	return _mm_and_si128(data, mask);
+}
+#elif defined(__ARM_NEON)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	int32x4_t data = vld1q_s32((int32_t *)key);
+
+	return vandq_s32(data, mask);
+}
+#endif
 
 static inline uint8_t
 em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
@@ -249,13 +269,12 @@ em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv4_hdr = (uint8_t *)ipv4_hdr + offsetof(struct ipv4_hdr, time_to_live);
-	__m128i data = _mm_loadu_si128((__m128i *)(ipv4_hdr));
 
 	/*
 	 * Get 5 tuple: dst port, src port, dst IP address,
 	 * src IP address and protocol.
 	 */
-	key.xmm = _mm_and_si128(data, mask0);
+	key.xmm = em_mask_key(ipv4_hdr, mask0.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
@@ -271,35 +290,31 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv6_hdr = (uint8_t *)ipv6_hdr + offsetof(struct ipv6_hdr, payload_len);
-	__m128i data0 =
-		_mm_loadu_si128((__m128i *)(ipv6_hdr));
-	__m128i data1 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)));
-	__m128i data2 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)+sizeof(__m128i)));
+	void *data0 = ipv6_hdr;
+	void *data1 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t);
+	void *data2 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t) + sizeof(xmm_t);
 
 	/* Get part of 5 tuple: src IP address lower 96 bits and protocol */
-	key.xmm[0] = _mm_and_si128(data0, mask1);
+	key.xmm[0] = em_mask_key(data0, mask1.x);
 
 	/*
 	 * Get part of 5 tuple: dst IP address lower 96 bits
 	 * and src IP address higher 32 bits.
 	 */
-	key.xmm[1] = data1;
+	key.xmm[1] = *(xmm_t *)data1;
 
 	/*
 	 * Get part of 5 tuple: dst port and src port
 	 * and dst IP address higher 32 bits.
 	 */
-	key.xmm[2] = _mm_and_si128(data2, mask2);
+	key.xmm[2] = em_mask_key(data2, mask2.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
+
 /*
  * Include header file if SSE4_1 is enabled for
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
@@ -348,14 +363,15 @@ convert_ipv6_5tuple(struct ipv6_5tuple *key1,
 #define BYTE_VALUE_MAX 256
 #define ALL_32_BITS 0xffffffff
 #define BIT_8_TO_15 0x0000ff00
+
 static inline void
 populate_ipv4_few_flow_into_table(const struct rte_hash *h)
 {
 	uint32_t i;
 	int32_t ret;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < IPV4_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv4_l3fwd_em_route  entry;
@@ -381,10 +397,10 @@ populate_ipv6_few_flow_into_table(const struct rte_hash *h)
 	uint32_t i;
 	int32_t ret;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < IPV6_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv6_l3fwd_em_route entry;
@@ -410,8 +426,8 @@ populate_ipv4_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv4_l3fwd_em_route entry;
@@ -462,9 +478,9 @@ populate_ipv6_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv6_l3fwd_em_route entry;
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..eb23163 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -77,14 +77,14 @@ em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 				sizeof(struct ether_hdr) +
 				offsetof(struct ipv4_hdr, time_to_live)));
 
-	key[0].xmm = _mm_and_si128(data[0], mask0);
-	key[1].xmm = _mm_and_si128(data[1], mask0);
-	key[2].xmm = _mm_and_si128(data[2], mask0);
-	key[3].xmm = _mm_and_si128(data[3], mask0);
-	key[4].xmm = _mm_and_si128(data[4], mask0);
-	key[5].xmm = _mm_and_si128(data[5], mask0);
-	key[6].xmm = _mm_and_si128(data[6], mask0);
-	key[7].xmm = _mm_and_si128(data[7], mask0);
+	key[0].xmm = _mm_and_si128(data[0], mask0.x);
+	key[1].xmm = _mm_and_si128(data[1], mask0.x);
+	key[2].xmm = _mm_and_si128(data[2], mask0.x);
+	key[3].xmm = _mm_and_si128(data[3], mask0.x);
+	key[4].xmm = _mm_and_si128(data[4], mask0.x);
+	key[5].xmm = _mm_and_si128(data[5], mask0.x);
+	key[6].xmm = _mm_and_si128(data[6], mask0.x);
+	key[7].xmm = _mm_and_si128(data[7], mask0.x);
 
 	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
 				&key[4], &key[5], &key[6], &key[7]};
@@ -175,14 +175,14 @@ em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 	int32_t ret[8];
 	union ipv6_5tuple_host key[8];
 
-	get_ipv6_5tuple(m[0], mask1, mask2, &key[0]);
-	get_ipv6_5tuple(m[1], mask1, mask2, &key[1]);
-	get_ipv6_5tuple(m[2], mask1, mask2, &key[2]);
-	get_ipv6_5tuple(m[3], mask1, mask2, &key[3]);
-	get_ipv6_5tuple(m[4], mask1, mask2, &key[4]);
-	get_ipv6_5tuple(m[5], mask1, mask2, &key[5]);
-	get_ipv6_5tuple(m[6], mask1, mask2, &key[6]);
-	get_ipv6_5tuple(m[7], mask1, mask2, &key[7]);
+	get_ipv6_5tuple(m[0], mask1.x, mask2.x, &key[0]);
+	get_ipv6_5tuple(m[1], mask1.x, mask2.x, &key[1]);
+	get_ipv6_5tuple(m[2], mask1.x, mask2.x, &key[2]);
+	get_ipv6_5tuple(m[3], mask1.x, mask2.x, &key[3]);
+	get_ipv6_5tuple(m[4], mask1.x, mask2.x, &key[4]);
+	get_ipv6_5tuple(m[5], mask1.x, mask2.x, &key[5]);
+	get_ipv6_5tuple(m[6], mask1.x, mask2.x, &key[6]);
+	get_ipv6_5tuple(m[7], mask1.x, mask2.x, &key[7]);
 
 	const void *key_array[8] = {&key[0], &key[1], &key[2], &key[3],
 			&key[4], &key[5], &key[6], &key[7]};
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..8520f71 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -112,7 +112,7 @@ volatile bool force_quit;
 uint64_t dest_eth_addr[RTE_MAX_ETHPORTS];
 struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
-__m128i val_eth[RTE_MAX_ETHPORTS];
+xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 /* mask of enabled ports */
 uint32_t enabled_port_mask;
-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [dpdk-dev] [PATCH v2] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-15 20:05 ` [dpdk-dev] [PATCH v2] " Maciej.Czekaj
  2016-03-15 20:05   ` Maciej.Czekaj
@ 2016-03-15 20:49   ` Thomas Monjalon
  2016-03-15 23:47     ` [dpdk-dev] Odp.: " Czekaj, Maciej
  1 sibling, 1 reply; 9+ messages in thread
From: Thomas Monjalon @ 2016-03-15 20:49 UTC (permalink / raw)
  To: Maciej.Czekaj; +Cc: dev, tomaszx.kulasek

Hi Maciej,

2016-03-15 21:05, Maciej.Czekaj@caviumnetworks.com:
> v2:
>  * Fixed compilation issue with HASH_MULTI_LOOKUP

2 comments:
- your patch v1 is already applied, so now we need a standalone fix
- you do not need to send a cover letter for a single patch

Thanks

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] Odp.: [PATCH v2] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-15 20:49   ` Thomas Monjalon
@ 2016-03-15 23:47     ` Czekaj, Maciej
  0 siblings, 0 replies; 9+ messages in thread
From: Czekaj, Maciej @ 2016-03-15 23:47 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, tomaszx.kulasek



________________________________________
Od: Thomas Monjalon <thomas.monjalon@6wind.com>
Wysłane: 15 marca 2016 21:49
Do: Czekaj, Maciej
DW: dev@dpdk.org; tomaszx.kulasek@intel.com
Temat: Re: [PATCH v2] l3fwd: Fix compilation & enable exact match mode on ARM.

Hi Maciej,

2016-03-15 21:05, Maciej.Czekaj@caviumnetworks.com:
> v2:
>  * Fixed compilation issue with HASH_MULTI_LOOKUP

2 comments:
- your patch v1 is already applied, so now we need a standalone fix
- you do not need to send a cover letter for a single patch

Thanks

Addressed in new patch.

Thanks
Maciej

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM.
  2016-03-10 15:07 [dpdk-dev] [PATCH] " Maciej.Czekaj
@ 2016-03-10 15:07 ` Maciej.Czekaj
  0 siblings, 0 replies; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-10 15:07 UTC (permalink / raw)
  To: bruce.richardson, pablo.de.lara.guarch; +Cc: dev

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

Enable NEON support in exact match mode.
The l3fwd example did not compile on ARM due to SSE2 intrinsics used
in the generic part.
Some intrinsics were used to initialize data structures, and those were
replaced by ordinary structure initialization.
All SSE2 intrinsics used in forwarding, i.e. masking the IP/TCP header,
are moved to a single inline function and made arch-specific.

Signed-off-by: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>
---
 examples/l3fwd/l3fwd.h    |  4 ++-
 examples/l3fwd/l3fwd_em.c | 72 +++++++++++++++++++++++++++++------------------
 examples/l3fwd/main.c     |  2 +-
 3 files changed, 48 insertions(+), 30 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index da6d369..7dcc7e5 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -34,6 +34,8 @@
 #ifndef __L3_FWD_H__
 #define __L3_FWD_H__
 
+#include <rte_vect.h>
+
 #define DO_RFC_1812_CHECKS
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
@@ -103,7 +105,7 @@ extern uint32_t enabled_port_mask;
 extern int ipv6; /**< ipv6 is false by default. */
 extern uint32_t hash_entry_number;
 
-extern __m128i val_eth[RTE_MAX_ETHPORTS];
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 extern struct lcore_conf lcore_conf[RTE_MAX_LCORE];
 
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index f6a65d8..0adf8f4 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -85,7 +85,7 @@ union ipv4_5tuple_host {
 		uint16_t port_src;
 		uint16_t port_dst;
 	};
-	__m128i xmm;
+	xmm_t xmm;
 };
 
 #define XMM_NUM_IN_IPV6_5TUPLE 3
@@ -109,9 +109,11 @@ union ipv6_5tuple_host {
 		uint16_t port_dst;
 		uint64_t reserve;
 	};
-	__m128i xmm[XMM_NUM_IN_IPV6_5TUPLE];
+	xmm_t xmm[XMM_NUM_IN_IPV6_5TUPLE];
 };
 
+
+
 struct ipv4_l3fwd_em_route {
 	struct ipv4_5tuple key;
 	uint8_t if_out;
@@ -236,9 +238,27 @@ ipv6_hash_crc(const void *data, __rte_unused uint32_t data_len,
 static uint8_t ipv4_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 static uint8_t ipv6_l3fwd_out_if[L3FWD_HASH_ENTRIES] __rte_cache_aligned;
 
-static __m128i mask0;
-static __m128i mask1;
-static __m128i mask2;
+static rte_xmm_t mask0;
+static rte_xmm_t mask1;
+static rte_xmm_t mask2;
+
+#if defined(__SSE2__)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	__m128i data = _mm_loadu_si128((__m128i *)(key));
+
+	return _mm_and_si128(data, mask);
+}
+#elif defined(__ARM_NEON)
+static inline xmm_t
+em_mask_key(void *key, xmm_t mask)
+{
+	int32x4_t data = vld1q_s32((int32_t *)key);
+
+	return vandq_s32(data, mask);
+}
+#endif
 
 static inline uint8_t
 em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
@@ -249,13 +269,12 @@ em_get_ipv4_dst_port(void *ipv4_hdr, uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv4_hdr = (uint8_t *)ipv4_hdr + offsetof(struct ipv4_hdr, time_to_live);
-	__m128i data = _mm_loadu_si128((__m128i *)(ipv4_hdr));
 
 	/*
 	 * Get 5 tuple: dst port, src port, dst IP address,
 	 * src IP address and protocol.
 	 */
-	key.xmm = _mm_and_si128(data, mask0);
+	key.xmm = em_mask_key(ipv4_hdr, mask0.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv4_l3fwd_lookup_struct, (const void *)&key);
@@ -271,35 +290,31 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
 		(struct rte_hash *)lookup_struct;
 
 	ipv6_hdr = (uint8_t *)ipv6_hdr + offsetof(struct ipv6_hdr, payload_len);
-	__m128i data0 =
-		_mm_loadu_si128((__m128i *)(ipv6_hdr));
-	__m128i data1 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)));
-	__m128i data2 =
-		_mm_loadu_si128((__m128i *)(((uint8_t *)ipv6_hdr)+
-					sizeof(__m128i)+sizeof(__m128i)));
+	void *data0 = ipv6_hdr;
+	void *data1 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t);
+	void *data2 = ((uint8_t *)ipv6_hdr) + sizeof(xmm_t) + sizeof(xmm_t);
 
 	/* Get part of 5 tuple: src IP address lower 96 bits and protocol */
-	key.xmm[0] = _mm_and_si128(data0, mask1);
+	key.xmm[0] = em_mask_key(data0, mask1.x);
 
 	/*
 	 * Get part of 5 tuple: dst IP address lower 96 bits
 	 * and src IP address higher 32 bits.
 	 */
-	key.xmm[1] = data1;
+	key.xmm[1] = *(xmm_t *)data1;
 
 	/*
 	 * Get part of 5 tuple: dst port and src port
 	 * and dst IP address higher 32 bits.
 	 */
-	key.xmm[2] = _mm_and_si128(data2, mask2);
+	key.xmm[2] = em_mask_key(data2, mask2.x);
 
 	/* Find destination port */
 	ret = rte_hash_lookup(ipv6_l3fwd_lookup_struct, (const void *)&key);
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
+
 /*
  * Include header file if SSE4_1 is enabled for
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
@@ -348,14 +363,15 @@ convert_ipv6_5tuple(struct ipv6_5tuple *key1,
 #define BYTE_VALUE_MAX 256
 #define ALL_32_BITS 0xffffffff
 #define BIT_8_TO_15 0x0000ff00
+
 static inline void
 populate_ipv4_few_flow_into_table(const struct rte_hash *h)
 {
 	uint32_t i;
 	int32_t ret;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < IPV4_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv4_l3fwd_em_route  entry;
@@ -381,10 +397,10 @@ populate_ipv6_few_flow_into_table(const struct rte_hash *h)
 	uint32_t i;
 	int32_t ret;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < IPV6_L3FWD_EM_NUM_ROUTES; i++) {
 		struct ipv6_l3fwd_em_route entry;
@@ -410,8 +426,8 @@ populate_ipv4_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask0 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_8_TO_15);
+	mask0 = (rte_xmm_t){.u32 = {BIT_8_TO_15, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv4_l3fwd_em_route entry;
@@ -462,9 +478,9 @@ populate_ipv6_many_flow_into_table(const struct rte_hash *h,
 {
 	unsigned i;
 
-	mask1 = _mm_set_epi32(ALL_32_BITS, ALL_32_BITS,
-				ALL_32_BITS, BIT_16_TO_23);
-	mask2 = _mm_set_epi32(0, 0, ALL_32_BITS, ALL_32_BITS);
+	mask1 = (rte_xmm_t){.u32 = {BIT_16_TO_23, ALL_32_BITS,
+				ALL_32_BITS, ALL_32_BITS} };
+	mask2 = (rte_xmm_t){.u32 = {ALL_32_BITS, ALL_32_BITS, 0, 0} };
 
 	for (i = 0; i < nr_flow; i++) {
 		struct ipv6_l3fwd_em_route entry;
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 0e33039..8520f71 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -112,7 +112,7 @@ volatile bool force_quit;
 uint64_t dest_eth_addr[RTE_MAX_ETHPORTS];
 struct ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
 
-__m128i val_eth[RTE_MAX_ETHPORTS];
+xmm_t val_eth[RTE_MAX_ETHPORTS];
 
 /* mask of enabled ports */
 uint32_t enabled_port_mask;
-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM
@ 2016-03-10 15:07 Maciej.Czekaj
  2016-03-10 15:07 ` Maciej.Czekaj
  0 siblings, 1 reply; 9+ messages in thread
From: Maciej.Czekaj @ 2016-03-10 15:07 UTC (permalink / raw)
  To: bruce.richardson, pablo.de.lara.guarch; +Cc: dev

From: Maciej Czekaj <Maciej.Czekaj@caviumnetworks.com>

This patch depends on the following pending patches:
lpm: add support for NEON
http://dpdk.org/dev/patchwork/patch/10479/
lpm: make rte_lpm_lookupx4 API definition architecture agnostic
http://dpdk.org/dev/patchwork/patch/10478/

Maciej Czekaj (1):
  l3fwd: Fix compilation & enable exact match mode on ARM.

 examples/l3fwd/l3fwd.h    |  4 ++-
 examples/l3fwd/l3fwd_em.c | 72 +++++++++++++++++++++++++++++------------------
 examples/l3fwd/main.c     |  2 +-
 3 files changed, 48 insertions(+), 30 deletions(-)

-- 
1.9.1

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-03-15 23:48 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-10 16:06 [dpdk-dev] [PATCH] l3fwd: Fix compilation & enable exact match mode on ARM Maciej.Czekaj
2016-03-10 16:06 ` Maciej.Czekaj
2016-03-11 15:16   ` Thomas Monjalon
2016-03-15 20:05 ` [dpdk-dev] [PATCH v2] " Maciej.Czekaj
2016-03-15 20:05   ` Maciej.Czekaj
2016-03-15 20:49   ` Thomas Monjalon
2016-03-15 23:47     ` [dpdk-dev] Odp.: " Czekaj, Maciej
  -- strict thread matches above, loose matches on Subject: below --
2016-03-10 15:07 [dpdk-dev] [PATCH] " Maciej.Czekaj
2016-03-10 15:07 ` Maciej.Czekaj

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).