DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
@ 2014-08-04 15:35 Neil Horman
  2014-08-05 15:26 ` Ananyev, Konstantin
  0 siblings, 1 reply; 14+ messages in thread
From: Neil Horman @ 2014-08-04 15:35 UTC (permalink / raw)
  To: dev

The ACL library makes extensive use of some SSE4.2 instructions, which means the
default build can't compile this library.  Work around the problem by testing
the __SSE42__ definition in the acl_vects.h file and defining the macros there
as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
only the definitions that are currently used in the ACL library.

Only compile tested so far, but I wanted to post it for early review so that
others could aid in unit testing.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Thomas Monjalon <thomas.monjalon@6wind.com>
CC: "Konstantin Ananyev" <konstantin.ananyev@intel.com>
CC: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_acl/acl_bld.c  |   3 +-
 lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 873447b..de974a4 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -31,7 +31,6 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <nmmintrin.h>
 #include <rte_acl.h>
 #include "tb_mem.h"
 #include "acl.h"
@@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
 			switch (rule->config->defs[n].type) {
 			case RTE_ACL_FIELD_TYPE_BITMASK:
 				wild = (size -
-					_mm_popcnt_u32(fld->mask_range.u8)) /
+					__builtin_popcountl(fld->mask_range.u8)) /
 					size;
 				break;
 
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..e5f391b 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -34,6 +34,10 @@
 #ifndef _RTE_ACL_VECT_H_
 #define _RTE_ACL_VECT_H_
 
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
 /**
  * @file
  *
@@ -44,12 +48,12 @@
 extern "C" {
 #endif
 
+
 #define	MM_ADD16(a, b)		_mm_add_epi16(a, b)
 #define	MM_ADD32(a, b)		_mm_add_epi32(a, b)
 #define	MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
 #define	MM_AND(a, b)		_mm_and_si128(a, b)
 #define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
 #define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
 #define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
 #define	MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
@@ -59,7 +63,6 @@ extern "C" {
 #define	MM_CVT32(a)		_mm_cvtsi128_si32(a)
 #define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
 #define	MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
-#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
 #define	MM_LOAD(a)		_mm_load_si128(a)
 #define	MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
 #define	MM_LOADU(a)		_mm_loadu_si128(a)
@@ -82,7 +85,6 @@ extern "C" {
 #define	MM_SRL32(a, b)		_mm_srli_epi32(a, b)
 #define	MM_STORE(a, b)		_mm_store_si128(a, b)
 #define	MM_STOREU(a, b)		_mm_storeu_si128(a, b)
-#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
 #define	MM_XOR(a, b)		_mm_xor_si128(a, b)
 
 #define	MM_SET16(a, b, c, d, e, f, g, h)	\
@@ -93,6 +95,100 @@ extern "C" {
 	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,	\
 		c8, c9, cA, cB, cC, cD, cE, cF)
 
+
+#ifndef __SSE4_1__
+static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
+{
+	unsigned char tmpd[16], tmps[16], tmpm[16];
+	int i;
+
+	MM_STOREU((xmm_t *)&tmpd, dst);
+	MM_STOREU((xmm_t *)&tmps, src);
+	MM_STOREU((xmm_t *)&tmpm, mask);
+
+	for (i = 0; i < 16; i++)
+		if (mask[i] & 0x8)
+			dst[i] = src[i];
+
+	dst = MM_LOADU((xmm_t *)&tmpd);
+
+	return dst;
+}
+
+#define MM_BLENDV8(a, b, c)	pblendvb(a, b, c)
+
+
+static inline int ptestz(xmm_t a, xmm_t b)
+{
+	unsigned long long tmpa[2], tmpb[2];
+
+	MM_STOREU((xmm_t *)&tmpa, a);
+	MM_STOREU((xmm_t *)&tmpb, b);
+
+	if (tmpa[0] & tmpb[0])
+		return 1;
+	if (tmpa[1] & tmpb[1])
+		return 1;
+
+	return 0;
+}
+
+#define	MM_TESTZ(a, b)		ptestz(a, b)
+
+static inline xmm_t pinsrd(xmm_t dst, int32_t val, char off)
+{
+	unsigned long long tmpa[2];
+	unsigned long long mask;
+	int32_t tmp;
+	
+	MM_STOREU((xmm_t *)&tmpa, dst);
+
+	/*
+	 * Inserting a dword is a bit odd as it can cross a word boundary
+	 */
+
+	if (off > 32) {
+		/*
+		 * If the offset is more than 32, then part of the 
+		 * inserted word will appear in the upper half of the xmm
+		 * register.  Grab the part of the value that crosses the 64 bit 
+		 * boundary.
+		 */
+		tmp = val >> (off - 32);
+
+		/*
+		 * Mask off the least significant bits of the upper longword
+		 */
+		mask = ~((1 << (off - 32)) - 1);
+		tmpa[1] &= mask;
+
+		/*
+		 * and insert the new value
+		 */
+		tmpa[1] |= tmp;
+	}
+	if (off < 64) {
+		/*
+		 * If the offset is less than 64 bits, we also need to mask and 
+		 * assign the lower longword
+		 */
+		mask = (1 << off) - 1;
+		tmpa[0] &= mask;
+		tmpa[0] |= (val << off);
+	}
+
+	dst = MM_LOADU((xmm_t *)&tmpa);
+	return dst;
+}
+
+#define	MM_INSERT32(a, c, b)	pinsrd(a, c, b)
+
+#else
+#define	MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
+#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
+#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
+#endif
+
 #ifdef RTE_ARCH_X86_64
 
 #define	MM_CVT64(a)		_mm_cvtsi128_si64(a)
-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-04 15:35 [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code Neil Horman
@ 2014-08-05 15:26 ` Ananyev, Konstantin
  2014-08-05 18:20   ` Neil Horman
  0 siblings, 1 reply; 14+ messages in thread
From: Ananyev, Konstantin @ 2014-08-05 15:26 UTC (permalink / raw)
  To: Neil Horman, dev

Hi Neil,

> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Monday, August 04, 2014 4:36 PM
> To: dev@dpdk.org
> Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson, Bruce
> Subject: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> 
> The ACL library makes extensive use of some SSE4.2 instructions, which means the
> default build can't compile this library.  Work around the problem by testing
> the __SSE42__ definition in the acl_vects.h file and defining the macros there
> as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> only the definitions that are currently used in the ACL library.
> 

My comments about actual implementations of c-level equivalents below.
None of them look correct to me.
Of course it could be fixed.
Though I am not sure that is a right way to proceed:
At first I really doubt that these equivalents will provide similar performance.
As you probably note - we do have a scalar version of  rte_acl_classify():  rte_acl_classify_scalar().
So I think it might be faster than vector one with 'emulated' instrincts. 
Unfortunately it is all mixed right now into one file and 'scalar' version could use few sse4 instrincts through resolve_priority().  
Another thing - we consider to add another version of rte_acl_classify() that will use avx2 instrincts.
If we go the way you suggest - I am afraid will soon have to provide scalar equivalents for several AVX2 instrincts too.
So in summary that way (providing our own scalar equivalents of SIMD instrincts) seems to me slow, hard to maintain and error prone.

What porbably can be done instead: rework acl_run.c a bit, so it provide:
rte_acl_classify_scalar() - could be build and used on all systems.
rte_acl_classify_sse() - could be build and used only on systems with sse4.2 and upper, return ENOTSUP on lower arch.
In future: rte_acl_classify_avx2 - could be build and used only on systems with avx2 and upper, return ENOTSUP on lower arch.

I am looking at rte_acl right now anyway.
So will try to come up with something workable.

> Only compile tested so far, but I wanted to post it for early review so that
> others could aid in unit testing.

Could you please stop submitting untested patches.
It is common (and good) practice to do at least some minimal testing before submitting your code changes.
At the end, it will save your own and other people time.
For ACL there is a simple UT, that could be run as:
./x86_64-native-linuxapp-gcc/app/test ...
RTE>>acl_autotest
It takes just few seconds to run.
Also if you plan further development for ACL, we probably can provide you with extra test-cases that we use for functional testing.
Thanks
Konstantin

> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Thomas Monjalon <thomas.monjalon@6wind.com>
> CC: "Konstantin Ananyev" <konstantin.ananyev@intel.com>
> CC: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  lib/librte_acl/acl_bld.c  |   3 +-
>  lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 100 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
> index 873447b..de974a4 100644
> --- a/lib/librte_acl/acl_bld.c
> +++ b/lib/librte_acl/acl_bld.c
> @@ -31,7 +31,6 @@
>   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>   */
> 
> -#include <nmmintrin.h>
>  #include <rte_acl.h>
>  #include "tb_mem.h"
>  #include "acl.h"
> @@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
>  			switch (rule->config->defs[n].type) {
>  			case RTE_ACL_FIELD_TYPE_BITMASK:
>  				wild = (size -
> -					_mm_popcnt_u32(fld->mask_range.u8)) /
> +					__builtin_popcountl(fld->mask_range.u8)) /
>  					size;
>  				break;
> 
> diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
> index d813600..e5f391b 100644
> --- a/lib/librte_acl/acl_vect.h
> +++ b/lib/librte_acl/acl_vect.h
> @@ -34,6 +34,10 @@
>  #ifndef _RTE_ACL_VECT_H_
>  #define _RTE_ACL_VECT_H_
> 
> +#ifdef __SSE4_1__
> +#include <smmintrin.h>
> +#endif
> +
>  /**
>   * @file
>   *
> @@ -44,12 +48,12 @@
>  extern "C" {
>  #endif
> 
> +
>  #define	MM_ADD16(a, b)		_mm_add_epi16(a, b)
>  #define	MM_ADD32(a, b)		_mm_add_epi32(a, b)
>  #define	MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
>  #define	MM_AND(a, b)		_mm_and_si128(a, b)
>  #define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
> -#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
>  #define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
>  #define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
>  #define	MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
> @@ -59,7 +63,6 @@ extern "C" {
>  #define	MM_CVT32(a)		_mm_cvtsi128_si32(a)
>  #define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
>  #define	MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
> -#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
>  #define	MM_LOAD(a)		_mm_load_si128(a)
>  #define	MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
>  #define	MM_LOADU(a)		_mm_loadu_si128(a)
> @@ -82,7 +85,6 @@ extern "C" {
>  #define	MM_SRL32(a, b)		_mm_srli_epi32(a, b)
>  #define	MM_STORE(a, b)		_mm_store_si128(a, b)
>  #define	MM_STOREU(a, b)		_mm_storeu_si128(a, b)
> -#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
>  #define	MM_XOR(a, b)		_mm_xor_si128(a, b)
> 
>  #define	MM_SET16(a, b, c, d, e, f, g, h)	\
> @@ -93,6 +95,100 @@ extern "C" {
>  	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,	\
>  		c8, c9, cA, cB, cC, cD, cE, cF)
> 
> +
> +#ifndef __SSE4_1__
> +static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
> +{
> +	unsigned char tmpd[16], tmps[16], tmpm[16];
> +	int i;
> +
> +	MM_STOREU((xmm_t *)&tmpd, dst);
> +	MM_STOREU((xmm_t *)&tmps, src);
> +	MM_STOREU((xmm_t *)&tmpm, mask);
> +

Generic comment:
You don't need to use char temp[16] or other stuff each time you need to swap between __m128i and char[16];
Specially for that purpose inside rte_common_vect.h we have:
typedef union rte_xmm {
        xmm_t    m;
        uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
        uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
        uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
        uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
        double   pd[XMM_SIZE / sizeof(double)];
} rte_xmm_t;


> +	for (i = 0; i < 16; i++)
> +		if (mask[i] & 0x8)

Should be tmpm, not mask.
Wonder how did it compile without warnings?

> +			dst[i] = src[i];
> +


Reading Intel Architecture Manual:
MASK <-- SRC3
IF (MASK[7] = 1) THEN DEST[7:0]  SRC2[7:0];
ELSE DEST[7:0] <-- SRC1[7:0];
IF (MASK[15] = 1) THEN DEST[15:8]  SRC2[15:8];
ELSE DEST[15:8] <-- SRC1[15:8];
...
So it should be 'tmpm[i] & 0x80'

> +	dst = MM_LOADU((xmm_t *)&tmpd);
> +
> +	return dst;
> +}

Taking into the account all of the above, I think it should be something like that:

static inline xmm_t
pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
{
        rte_xmm_t d, m, s;
        int32_t i;

        d.m = dst;
        s.m = src;
        m.m = mask;

        for (i = 0; i != RTE_DIM(m.u8); i++)
                if ((m.u8[i] & INT8_MIN) != 0)
                        d.u8[i] = s.u8[i];

        return (d.m);
}

> +
> +#define MM_BLENDV8(a, b, c)	pblendvb(a, b, c)
> +
> +
> +static inline int ptestz(xmm_t a, xmm_t b)
> +{
> +	unsigned long long tmpa[2], tmpb[2];
> +
> +	MM_STOREU((xmm_t *)&tmpa, a);
> +	MM_STOREU((xmm_t *)&tmpb, b);
> +
> +	if (tmpa[0] & tmpb[0])
> +		return 1;
> +	if (tmpa[1] & tmpb[1])
> +		return 1;
> +
> +	return 0;
> +}

Again from IA manual:
(V)PTEST (128-bit version)
IF (SRC[127:0] BITWISE AND DEST[127:0] = 0)
THEN ZF <-- 1;
ELSE ZF <-- 0;
IF (SRC[127:0] BITWISE AND NOT DEST[127:0] = 0)
THEN CF <-- 1;
ELSE CF <-- 0;

So _mm_testz_si128 (__m128i s1, __m128i s2) returns 1 if the bitwise AND operation on s1 and s2 results in all zeros, else returns 0.
Yours one doing opposite stuff.
Should be:

static inline int
ptestz(xmm_t a, xmm_t b)
{
        rte_xmm_t x, y;

        x.m = a;
        y.m = b;

        return ((x.u64[0] & y.u64[0]) == 0 && (x.u64[1] & y.u64[1]) == 0);
}

> +
> +#define	MM_TESTZ(a, b)		ptestz(a, b)
> +
> +static inline xmm_t pinsrd(xmm_t dst, int32_t val, char off)
> +{
> +	unsigned long long tmpa[2];
> +	unsigned long long mask;
> +	int32_t tmp;
> +
> +	MM_STOREU((xmm_t *)&tmpa, dst);
> +
> +	/*
> +	 * Inserting a dword is a bit odd as it can cross a word boundary
> +	 */
> +
> +	if (off > 32) {
> +		/*
> +		 * If the offset is more than 32, then part of the
> +		 * inserted word will appear in the upper half of the xmm
> +		 * register.  Grab the part of the value that crosses the 64 bit
> +		 * boundary.
> +		 */
> +		tmp = val >> (off - 32);
> +
> +		/*
> +		 * Mask off the least significant bits of the upper longword
> +		 */
> +		mask = ~((1 << (off - 32)) - 1);
> +		tmpa[1] &= mask;
> +
> +		/*
> +		 * and insert the new value
> +		 */
> +		tmpa[1] |= tmp;
> +	}
> +	if (off < 64) {
> +		/*
> +		 * If the offset is less than 64 bits, we also need to mask and
> +		 * assign the lower longword
> +		 */
> +		mask = (1 << off) - 1;
> +		tmpa[0] &= mask;
> +		tmpa[0] |= (val << off);
> +	}
> +
> +	dst = MM_LOADU((xmm_t *)&tmpa);
> +	return dst;
> +}

Again from IA manual:
PINSRD: SEL <-- COUNT[1:0];
MASK <-- (0FFFFFFFFH << (SEL * 32));
TEMP <-- (((SRC << (SEL *32)) AND MASK) ;
DEST <-- ((DEST AND NOT MASK) OR TEMP);

So basically, all we need is to treat dst as array of 4 integers.
Off - is just an index [0-3] where to insert val:

Static inline xmm_t
pinsrd(xmm_t dst, int32_t val, uint32_t off)
{
        rte_xmm_t x;
        int32_t idx;

        x.m = dst;
        idx = off % RTE_DIM(x.u32);
        x.u32[idx] = val;
        return x.m;
}

> +
> +#define	MM_INSERT32(a, c, b)	pinsrd(a, c, b)
> +
> +#else
> +#define	MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
> +#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
> +#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
> +#endif
> +
>  #ifdef RTE_ARCH_X86_64
> 
>  #define	MM_CVT64(a)		_mm_cvtsi128_si64(a)
> --
> 1.8.3.1


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-05 15:26 ` Ananyev, Konstantin
@ 2014-08-05 18:20   ` Neil Horman
  2014-08-06 10:52     ` Ananyev, Konstantin
  2014-08-06 11:39     ` Ananyev, Konstantin
  0 siblings, 2 replies; 14+ messages in thread
From: Neil Horman @ 2014-08-05 18:20 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> Hi Neil,
> 
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Monday, August 04, 2014 4:36 PM
> > To: dev@dpdk.org
> > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson, Bruce
> > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > 
> > The ACL library makes extensive use of some SSE4.2 instructions, which means the
> > default build can't compile this library.  Work around the problem by testing
> > the __SSE42__ definition in the acl_vects.h file and defining the macros there
> > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > only the definitions that are currently used in the ACL library.
> > 
> 
> My comments about actual implementations of c-level equivalents below.
> None of them look correct to me.
> Of course it could be fixed.
> Though I am not sure that is a right way to proceed:
> At first I really doubt that these equivalents will provide similar performance.
> As you probably note - we do have a scalar version of  rte_acl_classify():  rte_acl_classify_scalar().
> So I think it might be faster than vector one with 'emulated' instrincts. 
> Unfortunately it is all mixed right now into one file and 'scalar' version could use few sse4 instrincts through resolve_priority().  
> Another thing - we consider to add another version of rte_acl_classify() that will use avx2 instrincts.
> If we go the way you suggest - I am afraid will soon have to provide scalar equivalents for several AVX2 instrincts too.
> So in summary that way (providing our own scalar equivalents of SIMD instrincts) seems to me slow, hard to maintain and error prone.
> 
> What porbably can be done instead: rework acl_run.c a bit, so it provide:
> rte_acl_classify_scalar() - could be build and used on all systems.
> rte_acl_classify_sse() - could be build and used only on systems with sse4.2 and upper, return ENOTSUP on lower arch.
> In future: rte_acl_classify_avx2 - could be build and used only on systems with avx2 and upper, return ENOTSUP on lower arch.
> 
> I am looking at rte_acl right now anyway.
> So will try to come up with something workable.
> 
So, this is exactly the opposite of what Bruce and I just spent several days and
a huge email thread that you clearly are aware of discussing run time versus
compile time selection of paths.  At this point I'm done ping ponging between
your opposing viewpoints.  If you want to implement something that does run time
checking, I'm fine with it, but I'm not going back and forth until you two come
to an agreement on this.

> > Only compile tested so far, but I wanted to post it for early review so that
> > others could aid in unit testing.
> 
> Could you please stop submitting untested patches.
What part of "Compile tested only" was hard to understand? I posted it early
specficially because I expected comments like you've provided above.  After I
posted my first idea, you naked it, and after I started fixing it up to make
run-time selections, Bruce came down in favor of common compile time
functionality.  Given our previous conversation, I figured you would be opposed
to this approach, so whats the point in testing this if you're just going to
kill the idea?

> It is common (and good) practice to do at least some minimal testing before submitting your code changes.
Never intended it to be accepted just tested/reviewed.  Hence my comment
"compile tested only".  I admit I was unaware of the unit test below
> At the end, it will save your own and other people time.
> For ACL there is a simple UT, that could be run as:
> ./x86_64-native-linuxapp-gcc/app/test ...
> RTE>>acl_autotest
> It takes just few seconds to run.

It doesn't seem to work properly, at least not on any of my systems.  With a
system allocation 100 pages to hugepage memory:

[root@hmsreliant app]# ./test -c 0x3 -n 2
EAL: Detected lcore 0 as core 0 on socket 0
EAL: Detected lcore 1 as core 1 on socket 0
EAL: Detected lcore 2 as core 2 on socket 0
EAL: Detected lcore 3 as core 3 on socket 0
EAL: Detected lcore 4 as core 0 on socket 0
EAL: Detected lcore 5 as core 1 on socket 0
EAL: Detected lcore 6 as core 2 on socket 0
EAL: Detected lcore 7 as core 3 on socket 0
EAL: Support maximum 64 logical core(s) by configuration.
EAL: Detected 8 lcore(s)
EAL:   cannot open VFIO container, error 2 (No such file or directory)
EAL: VFIO support could not be initialized
EAL: Setting up memory...
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
EAL: Ask a virtual area of 0x400000 bytes
EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
EAL: Ask a virtual area of 0x600000 bytes
EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
EAL: Ask a virtual area of 0x600000 bytes
EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
EAL: Ask a virtual area of 0x800000 bytes
EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
EAL: Ask a virtual area of 0x400000 bytes
EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
EAL: Ask a virtual area of 0xa00000 bytes
EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
EAL: Ask a virtual area of 0x400000 bytes
EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
EAL: Ask a virtual area of 0x400000 bytes
EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
EAL: Ask a virtual area of 0x1800000 bytes
EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
EAL: Ask a virtual area of 0x1200000 bytes
EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
EAL: Ask a virtual area of 0x2600000 bytes
EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
EAL: Ask a virtual area of 0xc00000 bytes
EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
EAL: Ask a virtual area of 0x2200000 bytes
EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
EAL: Ask a virtual area of 0x200000 bytes
EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
EAL: Ask a virtual area of 0x600000 bytes
EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
EAL: Requesting 100 pages of size 2MB from socket 0
EAL: TSC frequency is ~3392297 KHz
EAL: Master core 0 is ready (tid=73cf880)
EAL: Core 1 is ready (tid=f71fe700)
APP: HPET is not enabled, using TSC as default timer
RTE>>acl_autotest
ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed

This hangs forever (well, at least 30 minutes, but thats sufficiently
close to forever for me to wait).

> Also if you plan further development for ACL, we probably can provide you with extra test-cases that we use for functional testing.
That would be good please.

Thanks
Neil

> Thanks
> Konstantin
> 
> > 
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > CC: Thomas Monjalon <thomas.monjalon@6wind.com>
> > CC: "Konstantin Ananyev" <konstantin.ananyev@intel.com>
> > CC: Bruce Richardson <bruce.richardson@intel.com>
> > ---
> >  lib/librte_acl/acl_bld.c  |   3 +-
> >  lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
> >  2 files changed, 100 insertions(+), 5 deletions(-)
> > 
> > diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
> > index 873447b..de974a4 100644
> > --- a/lib/librte_acl/acl_bld.c
> > +++ b/lib/librte_acl/acl_bld.c
> > @@ -31,7 +31,6 @@
> >   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> >   */
> > 
> > -#include <nmmintrin.h>
> >  #include <rte_acl.h>
> >  #include "tb_mem.h"
> >  #include "acl.h"
> > @@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
> >  			switch (rule->config->defs[n].type) {
> >  			case RTE_ACL_FIELD_TYPE_BITMASK:
> >  				wild = (size -
> > -					_mm_popcnt_u32(fld->mask_range.u8)) /
> > +					__builtin_popcountl(fld->mask_range.u8)) /
> >  					size;
> >  				break;
> > 
> > diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
> > index d813600..e5f391b 100644
> > --- a/lib/librte_acl/acl_vect.h
> > +++ b/lib/librte_acl/acl_vect.h
> > @@ -34,6 +34,10 @@
> >  #ifndef _RTE_ACL_VECT_H_
> >  #define _RTE_ACL_VECT_H_
> > 
> > +#ifdef __SSE4_1__
> > +#include <smmintrin.h>
> > +#endif
> > +
> >  /**
> >   * @file
> >   *
> > @@ -44,12 +48,12 @@
> >  extern "C" {
> >  #endif
> > 
> > +
> >  #define	MM_ADD16(a, b)		_mm_add_epi16(a, b)
> >  #define	MM_ADD32(a, b)		_mm_add_epi32(a, b)
> >  #define	MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
> >  #define	MM_AND(a, b)		_mm_and_si128(a, b)
> >  #define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
> > -#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
> >  #define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
> >  #define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
> >  #define	MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
> > @@ -59,7 +63,6 @@ extern "C" {
> >  #define	MM_CVT32(a)		_mm_cvtsi128_si32(a)
> >  #define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
> >  #define	MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
> > -#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
> >  #define	MM_LOAD(a)		_mm_load_si128(a)
> >  #define	MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
> >  #define	MM_LOADU(a)		_mm_loadu_si128(a)
> > @@ -82,7 +85,6 @@ extern "C" {
> >  #define	MM_SRL32(a, b)		_mm_srli_epi32(a, b)
> >  #define	MM_STORE(a, b)		_mm_store_si128(a, b)
> >  #define	MM_STOREU(a, b)		_mm_storeu_si128(a, b)
> > -#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
> >  #define	MM_XOR(a, b)		_mm_xor_si128(a, b)
> > 
> >  #define	MM_SET16(a, b, c, d, e, f, g, h)	\
> > @@ -93,6 +95,100 @@ extern "C" {
> >  	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,	\
> >  		c8, c9, cA, cB, cC, cD, cE, cF)
> > 
> > +
> > +#ifndef __SSE4_1__
> > +static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
> > +{
> > +	unsigned char tmpd[16], tmps[16], tmpm[16];
> > +	int i;
> > +
> > +	MM_STOREU((xmm_t *)&tmpd, dst);
> > +	MM_STOREU((xmm_t *)&tmps, src);
> > +	MM_STOREU((xmm_t *)&tmpm, mask);
> > +
> 
> Generic comment:
> You don't need to use char temp[16] or other stuff each time you need to swap between __m128i and char[16];
> Specially for that purpose inside rte_common_vect.h we have:
> typedef union rte_xmm {
>         xmm_t    m;
>         uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
>         uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
>         uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
>         uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
>         double   pd[XMM_SIZE / sizeof(double)];
> } rte_xmm_t;
> 
> 
> > +	for (i = 0; i < 16; i++)
> > +		if (mask[i] & 0x8)
> 
> Should be tmpm, not mask.
> Wonder how did it compile without warnings?
> 
> > +			dst[i] = src[i];
> > +
> 
> 
> Reading Intel Architecture Manual:
> MASK <-- SRC3
> IF (MASK[7] = 1) THEN DEST[7:0]  SRC2[7:0];
> ELSE DEST[7:0] <-- SRC1[7:0];
> IF (MASK[15] = 1) THEN DEST[15:8]  SRC2[15:8];
> ELSE DEST[15:8] <-- SRC1[15:8];
> ...
> So it should be 'tmpm[i] & 0x80'
> 
> > +	dst = MM_LOADU((xmm_t *)&tmpd);
> > +
> > +	return dst;
> > +}
> 
> Taking into the account all of the above, I think it should be something like that:
> 
> static inline xmm_t
> pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
> {
>         rte_xmm_t d, m, s;
>         int32_t i;
> 
>         d.m = dst;
>         s.m = src;
>         m.m = mask;
> 
>         for (i = 0; i != RTE_DIM(m.u8); i++)
>                 if ((m.u8[i] & INT8_MIN) != 0)
>                         d.u8[i] = s.u8[i];
> 
>         return (d.m);
> }
> 
> > +
> > +#define MM_BLENDV8(a, b, c)	pblendvb(a, b, c)
> > +
> > +
> > +static inline int ptestz(xmm_t a, xmm_t b)
> > +{
> > +	unsigned long long tmpa[2], tmpb[2];
> > +
> > +	MM_STOREU((xmm_t *)&tmpa, a);
> > +	MM_STOREU((xmm_t *)&tmpb, b);
> > +
> > +	if (tmpa[0] & tmpb[0])
> > +		return 1;
> > +	if (tmpa[1] & tmpb[1])
> > +		return 1;
> > +
> > +	return 0;
> > +}
> 
> Again from IA manual:
> (V)PTEST (128-bit version)
> IF (SRC[127:0] BITWISE AND DEST[127:0] = 0)
> THEN ZF <-- 1;
> ELSE ZF <-- 0;
> IF (SRC[127:0] BITWISE AND NOT DEST[127:0] = 0)
> THEN CF <-- 1;
> ELSE CF <-- 0;
> 
> So _mm_testz_si128 (__m128i s1, __m128i s2) returns 1 if the bitwise AND operation on s1 and s2 results in all zeros, else returns 0.
> Yours one doing opposite stuff.
> Should be:
> 
> static inline int
> ptestz(xmm_t a, xmm_t b)
> {
>         rte_xmm_t x, y;
> 
>         x.m = a;
>         y.m = b;
> 
>         return ((x.u64[0] & y.u64[0]) == 0 && (x.u64[1] & y.u64[1]) == 0);
> }
> 
> > +
> > +#define	MM_TESTZ(a, b)		ptestz(a, b)
> > +
> > +static inline xmm_t pinsrd(xmm_t dst, int32_t val, char off)
> > +{
> > +	unsigned long long tmpa[2];
> > +	unsigned long long mask;
> > +	int32_t tmp;
> > +
> > +	MM_STOREU((xmm_t *)&tmpa, dst);
> > +
> > +	/*
> > +	 * Inserting a dword is a bit odd as it can cross a word boundary
> > +	 */
> > +
> > +	if (off > 32) {
> > +		/*
> > +		 * If the offset is more than 32, then part of the
> > +		 * inserted word will appear in the upper half of the xmm
> > +		 * register.  Grab the part of the value that crosses the 64 bit
> > +		 * boundary.
> > +		 */
> > +		tmp = val >> (off - 32);
> > +
> > +		/*
> > +		 * Mask off the least significant bits of the upper longword
> > +		 */
> > +		mask = ~((1 << (off - 32)) - 1);
> > +		tmpa[1] &= mask;
> > +
> > +		/*
> > +		 * and insert the new value
> > +		 */
> > +		tmpa[1] |= tmp;
> > +	}
> > +	if (off < 64) {
> > +		/*
> > +		 * If the offset is less than 64 bits, we also need to mask and
> > +		 * assign the lower longword
> > +		 */
> > +		mask = (1 << off) - 1;
> > +		tmpa[0] &= mask;
> > +		tmpa[0] |= (val << off);
> > +	}
> > +
> > +	dst = MM_LOADU((xmm_t *)&tmpa);
> > +	return dst;
> > +}
> 
> Again from IA manual:
> PINSRD: SEL <-- COUNT[1:0];
> MASK <-- (0FFFFFFFFH << (SEL * 32));
> TEMP <-- (((SRC << (SEL *32)) AND MASK) ;
> DEST <-- ((DEST AND NOT MASK) OR TEMP);
> 
> So basically, all we need is to treat dst as array of 4 integers.
> Off - is just an index [0-3] where to insert val:
> 
> Static inline xmm_t
> pinsrd(xmm_t dst, int32_t val, uint32_t off)
> {
>         rte_xmm_t x;
>         int32_t idx;
> 
>         x.m = dst;
>         idx = off % RTE_DIM(x.u32);
>         x.u32[idx] = val;
>         return x.m;
> }
> 
> > +
> > +#define	MM_INSERT32(a, c, b)	pinsrd(a, c, b)
> > +
> > +#else
> > +#define	MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
> > +#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
> > +#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
> > +#endif
> > +
> >  #ifdef RTE_ARCH_X86_64
> > 
> >  #define	MM_CVT64(a)		_mm_cvtsi128_si64(a)
> > --
> > 1.8.3.1
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-05 18:20   ` Neil Horman
@ 2014-08-06 10:52     ` Ananyev, Konstantin
  2014-08-06 12:12       ` Neil Horman
  2014-08-06 11:39     ` Ananyev, Konstantin
  1 sibling, 1 reply; 14+ messages in thread
From: Ananyev, Konstantin @ 2014-08-06 10:52 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev


> > For ACL there is a simple UT, that could be run as:
> > ./x86_64-native-linuxapp-gcc/app/test ...
> > RTE>>acl_autotest
> > It takes just few seconds to run.
> 
> It doesn't seem to work properly, at least not on any of my systems.  With a
> system allocation 100 pages to hugepage memory:
> 
> [root@hmsreliant app]# ./test -c 0x3 -n 2
> EAL: Detected lcore 0 as core 0 on socket 0
> EAL: Detected lcore 1 as core 1 on socket 0
> EAL: Detected lcore 2 as core 2 on socket 0
> EAL: Detected lcore 3 as core 3 on socket 0
> EAL: Detected lcore 4 as core 0 on socket 0
> EAL: Detected lcore 5 as core 1 on socket 0
> EAL: Detected lcore 6 as core 2 on socket 0
> EAL: Detected lcore 7 as core 3 on socket 0
> EAL: Support maximum 64 logical core(s) by configuration.
> EAL: Detected 8 lcore(s)
> EAL:   cannot open VFIO container, error 2 (No such file or directory)
> EAL: VFIO support could not be initialized
> EAL: Setting up memory...
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
> EAL: Ask a virtual area of 0x800000 bytes
> EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
> EAL: Ask a virtual area of 0xa00000 bytes
> EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
> EAL: Ask a virtual area of 0x1800000 bytes
> EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
> EAL: Ask a virtual area of 0x1200000 bytes
> EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
> EAL: Ask a virtual area of 0x2600000 bytes
> EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
> EAL: Ask a virtual area of 0xc00000 bytes
> EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
> EAL: Ask a virtual area of 0x2200000 bytes
> EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
> EAL: Requesting 100 pages of size 2MB from socket 0
> EAL: TSC frequency is ~3392297 KHz
> EAL: Master core 0 is ready (tid=73cf880)
> EAL: Core 1 is ready (tid=f71fe700)
> APP: HPET is not enabled, using TSC as default timer
> RTE>>acl_autotest
> ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed
> 
> This hangs forever (well, at least 30 minutes, but thats sufficiently
> close to forever for me to wait).
> 

Ok that's unusual.
Never seen before.
I suppose that happens with unmodified dpdk 1.7?
I do run acl_autotest with at least 256M of huge-pages as ACL is quite memory consuming  and, as I remember, requires at least 2X32M areas of contiguous hugepages.
But in that case (not enough hugepages) it should just fail with something like:
ACL: allocation of 25953152 bytes on socket -1 for ACL_acl_ctx failed
Any other details, so I can try to reproduce it:
arch you build/run it on?
amount of free memory in the system?
Wonder what pstack says?

Thanks
Konstantin




^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-05 18:20   ` Neil Horman
  2014-08-06 10:52     ` Ananyev, Konstantin
@ 2014-08-06 11:39     ` Ananyev, Konstantin
  2014-08-06 12:18       ` Neil Horman
  1 sibling, 1 reply; 14+ messages in thread
From: Ananyev, Konstantin @ 2014-08-06 11:39 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev



> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Tuesday, August 05, 2014 7:21 PM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> 
> On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> > Hi Neil,
> >
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Monday, August 04, 2014 4:36 PM
> > > To: dev@dpdk.org
> > > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson, Bruce
> > > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > >
> > > The ACL library makes extensive use of some SSE4.2 instructions, which means the
> > > default build can't compile this library.  Work around the problem by testing
> > > the __SSE42__ definition in the acl_vects.h file and defining the macros there
> > > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > > only the definitions that are currently used in the ACL library.
> > >
> >
> > My comments about actual implementations of c-level equivalents below.
> > None of them look correct to me.
> > Of course it could be fixed.
> > Though I am not sure that is a right way to proceed:
> > At first I really doubt that these equivalents will provide similar performance.
> > As you probably note - we do have a scalar version of  rte_acl_classify():  rte_acl_classify_scalar().
> > So I think it might be faster than vector one with 'emulated' instrincts.
> > Unfortunately it is all mixed right now into one file and 'scalar' version could use few sse4 instrincts through resolve_priority().
> > Another thing - we consider to add another version of rte_acl_classify() that will use avx2 instrincts.
> > If we go the way you suggest - I am afraid will soon have to provide scalar equivalents for several AVX2 instrincts too.
> > So in summary that way (providing our own scalar equivalents of SIMD instrincts) seems to me slow, hard to maintain and error
> prone.
> >
> > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > rte_acl_classify_scalar() - could be build and used on all systems.
> > rte_acl_classify_sse() - could be build and used only on systems with sse4.2 and upper, return ENOTSUP on lower arch.
> > In future: rte_acl_classify_avx2 - could be build and used only on systems with avx2 and upper, return ENOTSUP on lower arch.
> >
> > I am looking at rte_acl right now anyway.
> > So will try to come up with something workable.
> >
> So, this is exactly the opposite of what Bruce and I just spent several days and
> a huge email thread that you clearly are aware of discussing run time versus
> compile time selection of paths.  At this point I'm done ping ponging between
> your opposing viewpoints.  If you want to implement something that does run time
> checking, I'm fine with it, but I'm not going back and forth until you two come
> to an agreement on this.

Right now, I am not talking about 'run time vs compile time selection'.
Whatever way we choose, I think the implementation need to be:
1) correct
2) allow easily add(/modify) code path optimised for particular architecture.
Without need to modify/re-test what you call  'least common denominator' code path.
And visa-versa, if someone find a way to optimise common code path - no need to
touch/retest architecture specific ones.

> 
> > > Only compile tested so far, but I wanted to post it for early review so that
> > > others could aid in unit testing.
> >
> > Could you please stop submitting untested patches.
> What part of "Compile tested only" was hard to understand? I posted it early
> specficially because I expected comments like you've provided above.  After I
> posted my first idea, you naked it, and after I started fixing it up to make
> run-time selections, Bruce came down in favor of common compile time
> functionality.  Given our previous conversation, I figured you would be opposed
> to this approach, so whats the point in testing this if you're just going to
> kill the idea?
> 
> > It is common (and good) practice to do at least some minimal testing before submitting your code changes.
> Never intended it to be accepted just tested/reviewed.  Hence my comment
> "compile tested only".  I admit I was unaware of the unit test below
> > At the end, it will save your own and other people time.
> > For ACL there is a simple UT, that could be run as:
> > ./x86_64-native-linuxapp-gcc/app/test ...
> > RTE>>acl_autotest
> > It takes just few seconds to run.
> 
> It doesn't seem to work properly, at least not on any of my systems.  With a
> system allocation 100 pages to hugepage memory:
> 
> [root@hmsreliant app]# ./test -c 0x3 -n 2
> EAL: Detected lcore 0 as core 0 on socket 0
> EAL: Detected lcore 1 as core 1 on socket 0
> EAL: Detected lcore 2 as core 2 on socket 0
> EAL: Detected lcore 3 as core 3 on socket 0
> EAL: Detected lcore 4 as core 0 on socket 0
> EAL: Detected lcore 5 as core 1 on socket 0
> EAL: Detected lcore 6 as core 2 on socket 0
> EAL: Detected lcore 7 as core 3 on socket 0
> EAL: Support maximum 64 logical core(s) by configuration.
> EAL: Detected 8 lcore(s)
> EAL:   cannot open VFIO container, error 2 (No such file or directory)
> EAL: VFIO support could not be initialized
> EAL: Setting up memory...
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
> EAL: Ask a virtual area of 0x800000 bytes
> EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
> EAL: Ask a virtual area of 0xa00000 bytes
> EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
> EAL: Ask a virtual area of 0x400000 bytes
> EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
> EAL: Ask a virtual area of 0x1800000 bytes
> EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
> EAL: Ask a virtual area of 0x1200000 bytes
> EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
> EAL: Ask a virtual area of 0x2600000 bytes
> EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
> EAL: Ask a virtual area of 0xc00000 bytes
> EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
> EAL: Ask a virtual area of 0x2200000 bytes
> EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
> EAL: Ask a virtual area of 0x200000 bytes
> EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
> EAL: Ask a virtual area of 0x600000 bytes
> EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
> EAL: Requesting 100 pages of size 2MB from socket 0
> EAL: TSC frequency is ~3392297 KHz
> EAL: Master core 0 is ready (tid=73cf880)
> EAL: Core 1 is ready (tid=f71fe700)
> APP: HPET is not enabled, using TSC as default timer
> RTE>>acl_autotest
> ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed
> 
> This hangs forever (well, at least 30 minutes, but thats sufficiently
> close to forever for me to wait).
> 
> > Also if you plan further development for ACL, we probably can provide you with extra test-cases that we use for functional testing.
> That would be good please.
> 
> Thanks
> Neil
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 10:52     ` Ananyev, Konstantin
@ 2014-08-06 12:12       ` Neil Horman
  2014-08-06 12:23         ` Ananyev, Konstantin
  0 siblings, 1 reply; 14+ messages in thread
From: Neil Horman @ 2014-08-06 12:12 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Wed, Aug 06, 2014 at 10:52:09AM +0000, Ananyev, Konstantin wrote:
> 
> > > For ACL there is a simple UT, that could be run as:
> > > ./x86_64-native-linuxapp-gcc/app/test ...
> > > RTE>>acl_autotest
> > > It takes just few seconds to run.
> > 
> > It doesn't seem to work properly, at least not on any of my systems.  With a
> > system allocation 100 pages to hugepage memory:
> > 
> > [root@hmsreliant app]# ./test -c 0x3 -n 2
> > EAL: Detected lcore 0 as core 0 on socket 0
> > EAL: Detected lcore 1 as core 1 on socket 0
> > EAL: Detected lcore 2 as core 2 on socket 0
> > EAL: Detected lcore 3 as core 3 on socket 0
> > EAL: Detected lcore 4 as core 0 on socket 0
> > EAL: Detected lcore 5 as core 1 on socket 0
> > EAL: Detected lcore 6 as core 2 on socket 0
> > EAL: Detected lcore 7 as core 3 on socket 0
> > EAL: Support maximum 64 logical core(s) by configuration.
> > EAL: Detected 8 lcore(s)
> > EAL:   cannot open VFIO container, error 2 (No such file or directory)
> > EAL: VFIO support could not be initialized
> > EAL: Setting up memory...
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x400000 bytes
> > EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x600000 bytes
> > EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
> > EAL: Ask a virtual area of 0x600000 bytes
> > EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
> > EAL: Ask a virtual area of 0x800000 bytes
> > EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
> > EAL: Ask a virtual area of 0x400000 bytes
> > EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
> > EAL: Ask a virtual area of 0xa00000 bytes
> > EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
> > EAL: Ask a virtual area of 0x400000 bytes
> > EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
> > EAL: Ask a virtual area of 0x400000 bytes
> > EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
> > EAL: Ask a virtual area of 0x1800000 bytes
> > EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
> > EAL: Ask a virtual area of 0x1200000 bytes
> > EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
> > EAL: Ask a virtual area of 0x2600000 bytes
> > EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
> > EAL: Ask a virtual area of 0xc00000 bytes
> > EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
> > EAL: Ask a virtual area of 0x2200000 bytes
> > EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x200000 bytes
> > EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
> > EAL: Ask a virtual area of 0x600000 bytes
> > EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
> > EAL: Requesting 100 pages of size 2MB from socket 0
> > EAL: TSC frequency is ~3392297 KHz
> > EAL: Master core 0 is ready (tid=73cf880)
> > EAL: Core 1 is ready (tid=f71fe700)
> > APP: HPET is not enabled, using TSC as default timer
> > RTE>>acl_autotest
> > ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed
> > 
> > This hangs forever (well, at least 30 minutes, but thats sufficiently
> > close to forever for me to wait).
> > 
> 
> Ok that's unusual.
> Never seen before.
> I suppose that happens with unmodified dpdk 1.7?
If I build unmodified dpdk 1.7 (which I can only build for the native platform),
it works, however it doesn't work when built with modifications for the default
platform, but see below.

> I do run acl_autotest with at least 256M of huge-pages as ACL is quite memory consuming  and, as I remember, requires at least 2X32M areas of contiguous hugepages.
> But in that case (not enough hugepages) it should just fail with something like:
> ACL: allocation of 25953152 bytes on socket -1 for ACL_acl_ctx failed
> Any other details, so I can try to reproduce it:
> arch you build/run it on?
x86_64 default build (has to be to test the new code).

> amount of free memory in the system?
Its an 8G workstation, should have most of that free (haven't checked yet).

> Wonder what pstack says?
It says that we're stuck in the eal library, before we've touched the ACL
library it seems, so I'm not sure whats going on here:
#0  0x0000003d11ef53c3 in epoll_wait () from /lib64/libc.so.6
#1  0x00000000004d315a in eal_intr_thread_main ()
#2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
#3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
Thread 2 (Thread 0x7fee73ffe700 (LWP 14529)):
#0  0x0000003d1260e87d in read () from /lib64/libpthread.so.0
#1  0x00000000004ced0a in eal_thread_loop ()
#2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
#3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
Thread 1 (Thread 0x7fee84100880 (LWP 14527)):
#0  0x000000000041adde in acl_match_check_x2.constprop ()
#1  0x000000000041b1af in search_sse_2 ()
#2  0x00000000004baab9 in rte_acl_classify ()
#3  0x000000000047d12d in test_acl ()
#4  0x000000000041c535 in cmd_autotest_parsed ()
#5  0x00000000004d8503 in cmdline_parse ()
#6  0x00000000004d7600 in cmdline_valid_buffer ()
#7  0x00000000004da43a in rdline_char_in ()
#8  0x00000000004d7809 in cmdline_interact ()
#9  0x000000000041bccb in main ()

> 
> Thanks
> Konstantin
> 
> 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 11:39     ` Ananyev, Konstantin
@ 2014-08-06 12:18       ` Neil Horman
  2014-08-06 12:26         ` Ananyev, Konstantin
  2014-08-06 16:59         ` Richardson, Bruce
  0 siblings, 2 replies; 14+ messages in thread
From: Neil Horman @ 2014-08-06 12:18 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Wed, Aug 06, 2014 at 11:39:22AM +0000, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Tuesday, August 05, 2014 7:21 PM
> > To: Ananyev, Konstantin
> > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > 
> > On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> > > Hi Neil,
> > >
> > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > Sent: Monday, August 04, 2014 4:36 PM
> > > > To: dev@dpdk.org
> > > > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson, Bruce
> > > > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > > >
> > > > The ACL library makes extensive use of some SSE4.2 instructions, which means the
> > > > default build can't compile this library.  Work around the problem by testing
> > > > the __SSE42__ definition in the acl_vects.h file and defining the macros there
> > > > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > > > only the definitions that are currently used in the ACL library.
> > > >
> > >
> > > My comments about actual implementations of c-level equivalents below.
> > > None of them look correct to me.
> > > Of course it could be fixed.
> > > Though I am not sure that is a right way to proceed:
> > > At first I really doubt that these equivalents will provide similar performance.
> > > As you probably note - we do have a scalar version of  rte_acl_classify():  rte_acl_classify_scalar().
> > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > Unfortunately it is all mixed right now into one file and 'scalar' version could use few sse4 instrincts through resolve_priority().
> > > Another thing - we consider to add another version of rte_acl_classify() that will use avx2 instrincts.
> > > If we go the way you suggest - I am afraid will soon have to provide scalar equivalents for several AVX2 instrincts too.
> > > So in summary that way (providing our own scalar equivalents of SIMD instrincts) seems to me slow, hard to maintain and error
> > prone.
> > >
> > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2 and upper, return ENOTSUP on lower arch.
> > > In future: rte_acl_classify_avx2 - could be build and used only on systems with avx2 and upper, return ENOTSUP on lower arch.
> > >
> > > I am looking at rte_acl right now anyway.
> > > So will try to come up with something workable.
> > >
> > So, this is exactly the opposite of what Bruce and I just spent several days and
> > a huge email thread that you clearly are aware of discussing run time versus
> > compile time selection of paths.  At this point I'm done ping ponging between
> > your opposing viewpoints.  If you want to implement something that does run time
> > checking, I'm fine with it, but I'm not going back and forth until you two come
> > to an agreement on this.
> 
> Right now, I am not talking about 'run time vs compile time selection'.
But you are talking about exactly that, allbeit implicitly.  To implement what
you recommend above (that being multiple functional paths that return a not
supported error code at run time), we need to make run time tests for what the
cpu supports.  While I'm actually ok with doing that (I think it makes alot of
sense), Bruce and I just spent several days and dozens of emails debating that,
so you can understand why I don't want to write yet another version of this
patch that requires doing the exact thing we just argued about, especially if it
means he's going to pipe back up and say no, driving me back to a common single
implementation that compiles and runs for all platforms.  I'm not going to keep
re-writing this boucing back and forth between your opposing viewpoints.  We
need to agree on a direction before I make another pass at this.

> Whatever way we choose, I think the implementation need to be:
> 1) correct
Obviously.

> 2) allow easily add(/modify) code path optimised for particular architecture.
> Without need to modify/re-test what you call  'least common denominator' code path.
> And visa-versa, if someone find a way to optimise common code path - no need to
> touch/retest architecture specific ones.
So I'm fine with this, but it is anathema to what Bruce advocated for when I did
this latest iteration.  Bruce advocated for a single common path that compiled
in all cases.  Bruce, do you want to comment here?  I'd really like to get this
settled before I go try this again. 

Neil

> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 12:12       ` Neil Horman
@ 2014-08-06 12:23         ` Ananyev, Konstantin
  2014-08-06 13:35           ` Neil Horman
  0 siblings, 1 reply; 14+ messages in thread
From: Ananyev, Konstantin @ 2014-08-06 12:23 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev



> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Wednesday, August 06, 2014 1:12 PM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> 
> On Wed, Aug 06, 2014 at 10:52:09AM +0000, Ananyev, Konstantin wrote:
> >
> > > > For ACL there is a simple UT, that could be run as:
> > > > ./x86_64-native-linuxapp-gcc/app/test ...
> > > > RTE>>acl_autotest
> > > > It takes just few seconds to run.
> > >
> > > It doesn't seem to work properly, at least not on any of my systems.  With a
> > > system allocation 100 pages to hugepage memory:
> > >
> > > [root@hmsreliant app]# ./test -c 0x3 -n 2
> > > EAL: Detected lcore 0 as core 0 on socket 0
> > > EAL: Detected lcore 1 as core 1 on socket 0
> > > EAL: Detected lcore 2 as core 2 on socket 0
> > > EAL: Detected lcore 3 as core 3 on socket 0
> > > EAL: Detected lcore 4 as core 0 on socket 0
> > > EAL: Detected lcore 5 as core 1 on socket 0
> > > EAL: Detected lcore 6 as core 2 on socket 0
> > > EAL: Detected lcore 7 as core 3 on socket 0
> > > EAL: Support maximum 64 logical core(s) by configuration.
> > > EAL: Detected 8 lcore(s)
> > > EAL:   cannot open VFIO container, error 2 (No such file or directory)
> > > EAL: VFIO support could not be initialized
> > > EAL: Setting up memory...
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x400000 bytes
> > > EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x600000 bytes
> > > EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
> > > EAL: Ask a virtual area of 0x600000 bytes
> > > EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
> > > EAL: Ask a virtual area of 0x800000 bytes
> > > EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
> > > EAL: Ask a virtual area of 0x400000 bytes
> > > EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
> > > EAL: Ask a virtual area of 0xa00000 bytes
> > > EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
> > > EAL: Ask a virtual area of 0x400000 bytes
> > > EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
> > > EAL: Ask a virtual area of 0x400000 bytes
> > > EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
> > > EAL: Ask a virtual area of 0x1800000 bytes
> > > EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
> > > EAL: Ask a virtual area of 0x1200000 bytes
> > > EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
> > > EAL: Ask a virtual area of 0x2600000 bytes
> > > EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
> > > EAL: Ask a virtual area of 0xc00000 bytes
> > > EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
> > > EAL: Ask a virtual area of 0x2200000 bytes
> > > EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x200000 bytes
> > > EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
> > > EAL: Ask a virtual area of 0x600000 bytes
> > > EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
> > > EAL: Requesting 100 pages of size 2MB from socket 0
> > > EAL: TSC frequency is ~3392297 KHz
> > > EAL: Master core 0 is ready (tid=73cf880)
> > > EAL: Core 1 is ready (tid=f71fe700)
> > > APP: HPET is not enabled, using TSC as default timer
> > > RTE>>acl_autotest
> > > ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed
> > >
> > > This hangs forever (well, at least 30 minutes, but thats sufficiently
> > > close to forever for me to wait).
> > >
> >
> > Ok that's unusual.
> > Never seen before.
> > I suppose that happens with unmodified dpdk 1.7?
> If I build unmodified dpdk 1.7 (which I can only build for the native platform),
> it works, however it doesn't work when built with modifications for the default
> platform, but see below.
> 
> > I do run acl_autotest with at least 256M of huge-pages as ACL is quite memory consuming  and, as I remember, requires at least
> 2X32M areas of contiguous hugepages.
> > But in that case (not enough hugepages) it should just fail with something like:
> > ACL: allocation of 25953152 bytes on socket -1 for ACL_acl_ctx failed
> > Any other details, so I can try to reproduce it:
> > arch you build/run it on?
> x86_64 default build (has to be to test the new code).
> 
> > amount of free memory in the system?
> Its an 8G workstation, should have most of that free (haven't checked yet).

That's should be enough.
I run it on 8GB workstation with ~3GB reported as 'free' by linux.

> 
> > Wonder what pstack says?
> It says that we're stuck in the eal library, before we've touched the ACL
> library it seems, so I'm not sure whats going on here:
> #0  0x0000003d11ef53c3 in epoll_wait () from /lib64/libc.so.6
> #1  0x00000000004d315a in eal_intr_thread_main ()
> #2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
> #3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
> Thread 2 (Thread 0x7fee73ffe700 (LWP 14529)):
> #0  0x0000003d1260e87d in read () from /lib64/libpthread.so.0
> #1  0x00000000004ced0a in eal_thread_loop ()
> #2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
> #3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
> Thread 1 (Thread 0x7fee84100880 (LWP 14527)):
> #0  0x000000000041adde in acl_match_check_x2.constprop ()
> #1  0x000000000041b1af in search_sse_2 ()
> #2  0x00000000004baab9 in rte_acl_classify ()
> #3  0x000000000047d12d in test_acl ()
> #4  0x000000000041c535 in cmd_autotest_parsed ()
> #5  0x00000000004d8503 in cmdline_parse ()
> #6  0x00000000004d7600 in cmdline_valid_buffer ()
> #7  0x00000000004da43a in rdline_char_in ()
> #8  0x00000000004d7809 in cmdline_interact ()
> #9  0x000000000041bccb in main ()

Ok, seems that we are in the indefinite loop.
Probably here:
acl_match_check_x2 
{
  ...
  while (!MM_TESTZ(temp, temp)) {..}
}
 
Did I get it right, that it works ok with x86_64-default-linuxapp-gcc binary, but hangs with modified x86_64-default-linuxapp-gcc?
If yes, then I wonder what modifications you made?

Konstantin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 12:18       ` Neil Horman
@ 2014-08-06 12:26         ` Ananyev, Konstantin
  2014-08-06 16:59         ` Richardson, Bruce
  1 sibling, 0 replies; 14+ messages in thread
From: Ananyev, Konstantin @ 2014-08-06 12:26 UTC (permalink / raw)
  To: Neil Horman; +Cc: dev



> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Wednesday, August 06, 2014 1:19 PM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> 
> On Wed, Aug 06, 2014 at 11:39:22AM +0000, Ananyev, Konstantin wrote:
> >
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Tuesday, August 05, 2014 7:21 PM
> > > To: Ananyev, Konstantin
> > > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > >
> > > On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> > > > Hi Neil,
> > > >
> > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > Sent: Monday, August 04, 2014 4:36 PM
> > > > > To: dev@dpdk.org
> > > > > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson, Bruce
> > > > > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > > > >
> > > > > The ACL library makes extensive use of some SSE4.2 instructions, which means the
> > > > > default build can't compile this library.  Work around the problem by testing
> > > > > the __SSE42__ definition in the acl_vects.h file and defining the macros there
> > > > > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > > > > only the definitions that are currently used in the ACL library.
> > > > >
> > > >
> > > > My comments about actual implementations of c-level equivalents below.
> > > > None of them look correct to me.
> > > > Of course it could be fixed.
> > > > Though I am not sure that is a right way to proceed:
> > > > At first I really doubt that these equivalents will provide similar performance.
> > > > As you probably note - we do have a scalar version of  rte_acl_classify():  rte_acl_classify_scalar().
> > > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > > Unfortunately it is all mixed right now into one file and 'scalar' version could use few sse4 instrincts through resolve_priority().
> > > > Another thing - we consider to add another version of rte_acl_classify() that will use avx2 instrincts.
> > > > If we go the way you suggest - I am afraid will soon have to provide scalar equivalents for several AVX2 instrincts too.
> > > > So in summary that way (providing our own scalar equivalents of SIMD instrincts) seems to me slow, hard to maintain and error
> > > prone.
> > > >
> > > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2 and upper, return ENOTSUP on lower arch.
> > > > In future: rte_acl_classify_avx2 - could be build and used only on systems with avx2 and upper, return ENOTSUP on lower arch.
> > > >
> > > > I am looking at rte_acl right now anyway.
> > > > So will try to come up with something workable.
> > > >
> > > So, this is exactly the opposite of what Bruce and I just spent several days and
> > > a huge email thread that you clearly are aware of discussing run time versus
> > > compile time selection of paths.  At this point I'm done ping ponging between
> > > your opposing viewpoints.  If you want to implement something that does run time
> > > checking, I'm fine with it, but I'm not going back and forth until you two come
> > > to an agreement on this.
> >
> > Right now, I am not talking about 'run time vs compile time selection'.
> But you are talking about exactly that, allbeit implicitly.  To implement what
> you recommend above (that being multiple functional paths that return a not
> supported error code at run time), we need to make run time tests for what the
> cpu supports.  While I'm actually ok with doing that (I think it makes alot of
> sense), Bruce and I just spent several days and dozens of emails debating that,
> so you can understand why I don't want to write yet another version of this
> patch that requires doing the exact thing we just argued about, especially if it
> means he's going to pipe back up and say no, driving me back to a common single
> implementation that compiles and runs for all platforms.  I'm not going to keep
> re-writing this boucing back and forth between your opposing viewpoints.  We
> need to agree on a direction before I make another pass at this.
> 
> > Whatever way we choose, I think the implementation need to be:
> > 1) correct
> Obviously.
> 
> > 2) allow easily add(/modify) code path optimised for particular architecture.
> > Without need to modify/re-test what you call  'least common denominator' code path.
> > And visa-versa, if someone find a way to optimise common code path - no need to
> > touch/retest architecture specific ones.
> So I'm fine with this, but it is anathema to what Bruce advocated for when I did
> this latest iteration.  Bruce advocated for a single common path that compiled
> in all cases.  Bruce, do you want to comment here?  I'd really like to get this
> settled before I go try this again.
> 
> Neil
> 

Ok, let me try to prepare a patch with what I suggested.
Hopefully it will make everyone reasonably happy.
Konstantin

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 12:23         ` Ananyev, Konstantin
@ 2014-08-06 13:35           ` Neil Horman
  0 siblings, 0 replies; 14+ messages in thread
From: Neil Horman @ 2014-08-06 13:35 UTC (permalink / raw)
  To: Ananyev, Konstantin; +Cc: dev

On Wed, Aug 06, 2014 at 12:23:29PM +0000, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Wednesday, August 06, 2014 1:12 PM
> > To: Ananyev, Konstantin
> > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
> > 
> > On Wed, Aug 06, 2014 at 10:52:09AM +0000, Ananyev, Konstantin wrote:
> > >
> > > > > For ACL there is a simple UT, that could be run as:
> > > > > ./x86_64-native-linuxapp-gcc/app/test ...
> > > > > RTE>>acl_autotest
> > > > > It takes just few seconds to run.
> > > >
> > > > It doesn't seem to work properly, at least not on any of my systems.  With a
> > > > system allocation 100 pages to hugepage memory:
> > > >
> > > > [root@hmsreliant app]# ./test -c 0x3 -n 2
> > > > EAL: Detected lcore 0 as core 0 on socket 0
> > > > EAL: Detected lcore 1 as core 1 on socket 0
> > > > EAL: Detected lcore 2 as core 2 on socket 0
> > > > EAL: Detected lcore 3 as core 3 on socket 0
> > > > EAL: Detected lcore 4 as core 0 on socket 0
> > > > EAL: Detected lcore 5 as core 1 on socket 0
> > > > EAL: Detected lcore 6 as core 2 on socket 0
> > > > EAL: Detected lcore 7 as core 3 on socket 0
> > > > EAL: Support maximum 64 logical core(s) by configuration.
> > > > EAL: Detected 8 lcore(s)
> > > > EAL:   cannot open VFIO container, error 2 (No such file or directory)
> > > > EAL: VFIO support could not be initialized
> > > > EAL: Setting up memory...
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7fef07000000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7fef06c00000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x400000 bytes
> > > > EAL: Virtual area found at 0x7fef06600000 (size = 0x400000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7fef06200000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7fef05e00000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x600000 bytes
> > > > EAL: Virtual area found at 0x7fef05600000 (size = 0x600000)
> > > > EAL: Ask a virtual area of 0x600000 bytes
> > > > EAL: Virtual area found at 0x7fef04e00000 (size = 0x600000)
> > > > EAL: Ask a virtual area of 0x800000 bytes
> > > > EAL: Virtual area found at 0x7fef04400000 (size = 0x800000)
> > > > EAL: Ask a virtual area of 0x400000 bytes
> > > > EAL: Virtual area found at 0x7fef03e00000 (size = 0x400000)
> > > > EAL: Ask a virtual area of 0xa00000 bytes
> > > > EAL: Virtual area found at 0x7fef03200000 (size = 0xa00000)
> > > > EAL: Ask a virtual area of 0x400000 bytes
> > > > EAL: Virtual area found at 0x7fef02c00000 (size = 0x400000)
> > > > EAL: Ask a virtual area of 0x400000 bytes
> > > > EAL: Virtual area found at 0x7fef02600000 (size = 0x400000)
> > > > EAL: Ask a virtual area of 0x1800000 bytes
> > > > EAL: Virtual area found at 0x7fef00c00000 (size = 0x1800000)
> > > > EAL: Ask a virtual area of 0x1200000 bytes
> > > > EAL: Virtual area found at 0x7feeff800000 (size = 0x1200000)
> > > > EAL: Ask a virtual area of 0x2600000 bytes
> > > > EAL: Virtual area found at 0x7feefd000000 (size = 0x2600000)
> > > > EAL: Ask a virtual area of 0xc00000 bytes
> > > > EAL: Virtual area found at 0x7feefc200000 (size = 0xc00000)
> > > > EAL: Ask a virtual area of 0x2200000 bytes
> > > > EAL: Virtual area found at 0x7feef9e00000 (size = 0x2200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef9a00000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef9600000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef9200000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef8e00000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef8a00000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef8600000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x200000 bytes
> > > > EAL: Virtual area found at 0x7feef8200000 (size = 0x200000)
> > > > EAL: Ask a virtual area of 0x600000 bytes
> > > > EAL: Virtual area found at 0x7feef7a00000 (size = 0x600000)
> > > > EAL: Requesting 100 pages of size 2MB from socket 0
> > > > EAL: TSC frequency is ~3392297 KHz
> > > > EAL: Master core 0 is ready (tid=73cf880)
> > > > EAL: Core 1 is ready (tid=f71fe700)
> > > > APP: HPET is not enabled, using TSC as default timer
> > > > RTE>>acl_autotest
> > > > ACL: allocation of 25166720 bytes on socket 9 for ACL_acl_ctx failed
> > > >
> > > > This hangs forever (well, at least 30 minutes, but thats sufficiently
> > > > close to forever for me to wait).
> > > >
> > >
> > > Ok that's unusual.
> > > Never seen before.
> > > I suppose that happens with unmodified dpdk 1.7?
> > If I build unmodified dpdk 1.7 (which I can only build for the native platform),
> > it works, however it doesn't work when built with modifications for the default
> > platform, but see below.
> > 
> > > I do run acl_autotest with at least 256M of huge-pages as ACL is quite memory consuming  and, as I remember, requires at least
> > 2X32M areas of contiguous hugepages.
> > > But in that case (not enough hugepages) it should just fail with something like:
> > > ACL: allocation of 25953152 bytes on socket -1 for ACL_acl_ctx failed
> > > Any other details, so I can try to reproduce it:
> > > arch you build/run it on?
> > x86_64 default build (has to be to test the new code).
> > 
> > > amount of free memory in the system?
> > Its an 8G workstation, should have most of that free (haven't checked yet).
> 
> That's should be enough.
> I run it on 8GB workstation with ~3GB reported as 'free' by linux.
> 
> > 
> > > Wonder what pstack says?
> > It says that we're stuck in the eal library, before we've touched the ACL
> > library it seems, so I'm not sure whats going on here:
> > #0  0x0000003d11ef53c3 in epoll_wait () from /lib64/libc.so.6
> > #1  0x00000000004d315a in eal_intr_thread_main ()
> > #2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
> > #3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
> > Thread 2 (Thread 0x7fee73ffe700 (LWP 14529)):
> > #0  0x0000003d1260e87d in read () from /lib64/libpthread.so.0
> > #1  0x00000000004ced0a in eal_thread_loop ()
> > #2  0x0000003d12607f33 in start_thread () from /lib64/libpthread.so.0
> > #3  0x0000003d11ef4ded in clone () from /lib64/libc.so.6
> > Thread 1 (Thread 0x7fee84100880 (LWP 14527)):
> > #0  0x000000000041adde in acl_match_check_x2.constprop ()
> > #1  0x000000000041b1af in search_sse_2 ()
> > #2  0x00000000004baab9 in rte_acl_classify ()
> > #3  0x000000000047d12d in test_acl ()
> > #4  0x000000000041c535 in cmd_autotest_parsed ()
> > #5  0x00000000004d8503 in cmdline_parse ()
> > #6  0x00000000004d7600 in cmdline_valid_buffer ()
> > #7  0x00000000004da43a in rdline_char_in ()
> > #8  0x00000000004d7809 in cmdline_interact ()
> > #9  0x000000000041bccb in main ()
> 
> Ok, seems that we are in the indefinite loop.
> Probably here:
> acl_match_check_x2 
> {
>   ...
>   while (!MM_TESTZ(temp, temp)) {..}
> }
>  
Possibly, but I don't see how that lines up with the stack trace above.  I'll
add some printfs later this morning to see whats going on.

> Did I get it right, that it works ok with x86_64-default-linuxapp-gcc binary, but hangs with modified x86_64-default-linuxapp-gcc?
> If yes, then I wonder what modifications you made?
> 
Impossible to tell, because x86_64-default-linuxapp-gcc can't build the acl
library without my changes, thats the point of the patch :).

> Konstantin
> 
> 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 12:18       ` Neil Horman
  2014-08-06 12:26         ` Ananyev, Konstantin
@ 2014-08-06 16:59         ` Richardson, Bruce
  2014-08-06 17:27           ` Neil Horman
  1 sibling, 1 reply; 14+ messages in thread
From: Richardson, Bruce @ 2014-08-06 16:59 UTC (permalink / raw)
  To: Neil Horman, Ananyev, Konstantin; +Cc: dev

> -----Original Message-----
> From: Neil Horman [mailto:nhorman@tuxdriver.com]
> Sent: Wednesday, August 06, 2014 5:19 AM
> To: Ananyev, Konstantin
> Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing
> instructions with C code
> 
> On Wed, Aug 06, 2014 at 11:39:22AM +0000, Ananyev, Konstantin wrote:
> >
> >
> > > -----Original Message-----
> > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > Sent: Tuesday, August 05, 2014 7:21 PM
> > > To: Ananyev, Konstantin
> > > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing
> instructions with C code
> > >
> > > On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> > > > Hi Neil,
> > > >
> > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > Sent: Monday, August 04, 2014 4:36 PM
> > > > > To: dev@dpdk.org
> > > > > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson,
> Bruce
> > > > > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing
> instructions with C code
> > > > >
> > > > > The ACL library makes extensive use of some SSE4.2 instructions, which
> means the
> > > > > default build can't compile this library.  Work around the problem by
> testing
> > > > > the __SSE42__ definition in the acl_vects.h file and defining the macros
> there
> > > > > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > > > > only the definitions that are currently used in the ACL library.
> > > > >
> > > >
> > > > My comments about actual implementations of c-level equivalents below.
> > > > None of them look correct to me.
> > > > Of course it could be fixed.
> > > > Though I am not sure that is a right way to proceed:
> > > > At first I really doubt that these equivalents will provide similar
> performance.
> > > > As you probably note - we do have a scalar version of  rte_acl_classify():
> rte_acl_classify_scalar().
> > > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > > Unfortunately it is all mixed right now into one file and 'scalar' version
> could use few sse4 instrincts through resolve_priority().
> > > > Another thing - we consider to add another version of rte_acl_classify()
> that will use avx2 instrincts.
> > > > If we go the way you suggest - I am afraid will soon have to provide scalar
> equivalents for several AVX2 instrincts too.
> > > > So in summary that way (providing our own scalar equivalents of SIMD
> instrincts) seems to me slow, hard to maintain and error
> > > prone.
> > > >
> > > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2
> and upper, return ENOTSUP on lower arch.
> > > > In future: rte_acl_classify_avx2 - could be build and used only on systems
> with avx2 and upper, return ENOTSUP on lower arch.
> > > >
> > > > I am looking at rte_acl right now anyway.
> > > > So will try to come up with something workable.
> > > >
> > > So, this is exactly the opposite of what Bruce and I just spent several days
> and
> > > a huge email thread that you clearly are aware of discussing run time versus
> > > compile time selection of paths.  At this point I'm done ping ponging
> between
> > > your opposing viewpoints.  If you want to implement something that does
> run time
> > > checking, I'm fine with it, but I'm not going back and forth until you two
> come
> > > to an agreement on this.
> >
> > Right now, I am not talking about 'run time vs compile time selection'.
> But you are talking about exactly that, allbeit implicitly.  To implement what
> you recommend above (that being multiple functional paths that return a not
> supported error code at run time), we need to make run time tests for what the
> cpu supports.  While I'm actually ok with doing that (I think it makes alot of
> sense), Bruce and I just spent several days and dozens of emails debating that,
> so you can understand why I don't want to write yet another version of this
> patch that requires doing the exact thing we just argued about, especially if it
> means he's going to pipe back up and say no, driving me back to a common
> single
> implementation that compiles and runs for all platforms.  I'm not going to keep
> re-writing this boucing back and forth between your opposing viewpoints.  We
> need to agree on a direction before I make another pass at this.
> 
> > Whatever way we choose, I think the implementation need to be:
> > 1) correct
> Obviously.
> 
> > 2) allow easily add(/modify) code path optimised for particular architecture.
> > Without need to modify/re-test what you call  'least common denominator'
> code path.
> > And visa-versa, if someone find a way to optimise common code path - no
> need to
> > touch/retest architecture specific ones.
> So I'm fine with this, but it is anathema to what Bruce advocated for when I did
> this latest iteration.  Bruce advocated for a single common path that compiled
> in all cases.  Bruce, do you want to comment here?  I'd really like to get this
> settled before I go try this again.

In our previous discussion I was primarily concerned with the ixgbe driver, which already had a number of scalar code paths as well as the vector one, so I was very keen there not to see more code paths created. However, while I hate seeing more code paths created that need to be maintained, I am ok with having them created if the benefit is big enough. Up till now code path selection would have been done at compile-time, but you've convinced me that if we have the code paths there, selecting them at runtime makes more sense for a packaged build. 
For ACL specifically, I generally defer to Konstantin as the expert in this area. If we need separate code paths for scalar, SSE and AVX, and each gives considerable performance improvement over the other, then I'm ok with that.

/Bruce

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 16:59         ` Richardson, Bruce
@ 2014-08-06 17:27           ` Neil Horman
  2014-08-12 23:19             ` Thomas Monjalon
  0 siblings, 1 reply; 14+ messages in thread
From: Neil Horman @ 2014-08-06 17:27 UTC (permalink / raw)
  To: Richardson, Bruce; +Cc: dev

On Wed, Aug 06, 2014 at 04:59:59PM +0000, Richardson, Bruce wrote:
> > -----Original Message-----
> > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > Sent: Wednesday, August 06, 2014 5:19 AM
> > To: Ananyev, Konstantin
> > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing
> > instructions with C code
> > 
> > On Wed, Aug 06, 2014 at 11:39:22AM +0000, Ananyev, Konstantin wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > Sent: Tuesday, August 05, 2014 7:21 PM
> > > > To: Ananyev, Konstantin
> > > > Cc: dev@dpdk.org; Thomas Monjalon; Richardson, Bruce
> > > > Subject: Re: [PATCH] acl: If build does not support sse4.2, emulate missing
> > instructions with C code
> > > >
> > > > On Tue, Aug 05, 2014 at 03:26:27PM +0000, Ananyev, Konstantin wrote:
> > > > > Hi Neil,
> > > > >
> > > > > > From: Neil Horman [mailto:nhorman@tuxdriver.com]
> > > > > > Sent: Monday, August 04, 2014 4:36 PM
> > > > > > To: dev@dpdk.org
> > > > > > Cc: Neil Horman; Thomas Monjalon; Ananyev, Konstantin; Richardson,
> > Bruce
> > > > > > Subject: [PATCH] acl: If build does not support sse4.2, emulate missing
> > instructions with C code
> > > > > >
> > > > > > The ACL library makes extensive use of some SSE4.2 instructions, which
> > means the
> > > > > > default build can't compile this library.  Work around the problem by
> > testing
> > > > > > the __SSE42__ definition in the acl_vects.h file and defining the macros
> > there
> > > > > > as intrinsics or c-level equivalants.  Note this is a minimal patch, adjusting
> > > > > > only the definitions that are currently used in the ACL library.
> > > > > >
> > > > >
> > > > > My comments about actual implementations of c-level equivalents below.
> > > > > None of them look correct to me.
> > > > > Of course it could be fixed.
> > > > > Though I am not sure that is a right way to proceed:
> > > > > At first I really doubt that these equivalents will provide similar
> > performance.
> > > > > As you probably note - we do have a scalar version of  rte_acl_classify():
> > rte_acl_classify_scalar().
> > > > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > > > Unfortunately it is all mixed right now into one file and 'scalar' version
> > could use few sse4 instrincts through resolve_priority().
> > > > > Another thing - we consider to add another version of rte_acl_classify()
> > that will use avx2 instrincts.
> > > > > If we go the way you suggest - I am afraid will soon have to provide scalar
> > equivalents for several AVX2 instrincts too.
> > > > > So in summary that way (providing our own scalar equivalents of SIMD
> > instrincts) seems to me slow, hard to maintain and error
> > > > prone.
> > > > >
> > > > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2
> > and upper, return ENOTSUP on lower arch.
> > > > > In future: rte_acl_classify_avx2 - could be build and used only on systems
> > with avx2 and upper, return ENOTSUP on lower arch.
> > > > >
> > > > > I am looking at rte_acl right now anyway.
> > > > > So will try to come up with something workable.
> > > > >
> > > > So, this is exactly the opposite of what Bruce and I just spent several days
> > and
> > > > a huge email thread that you clearly are aware of discussing run time versus
> > > > compile time selection of paths.  At this point I'm done ping ponging
> > between
> > > > your opposing viewpoints.  If you want to implement something that does
> > run time
> > > > checking, I'm fine with it, but I'm not going back and forth until you two
> > come
> > > > to an agreement on this.
> > >
> > > Right now, I am not talking about 'run time vs compile time selection'.
> > But you are talking about exactly that, allbeit implicitly.  To implement what
> > you recommend above (that being multiple functional paths that return a not
> > supported error code at run time), we need to make run time tests for what the
> > cpu supports.  While I'm actually ok with doing that (I think it makes alot of
> > sense), Bruce and I just spent several days and dozens of emails debating that,
> > so you can understand why I don't want to write yet another version of this
> > patch that requires doing the exact thing we just argued about, especially if it
> > means he's going to pipe back up and say no, driving me back to a common
> > single
> > implementation that compiles and runs for all platforms.  I'm not going to keep
> > re-writing this boucing back and forth between your opposing viewpoints.  We
> > need to agree on a direction before I make another pass at this.
> > 
> > > Whatever way we choose, I think the implementation need to be:
> > > 1) correct
> > Obviously.
> > 
> > > 2) allow easily add(/modify) code path optimised for particular architecture.
> > > Without need to modify/re-test what you call  'least common denominator'
> > code path.
> > > And visa-versa, if someone find a way to optimise common code path - no
> > need to
> > > touch/retest architecture specific ones.
> > So I'm fine with this, but it is anathema to what Bruce advocated for when I did
> > this latest iteration.  Bruce advocated for a single common path that compiled
> > in all cases.  Bruce, do you want to comment here?  I'd really like to get this
> > settled before I go try this again.
> 
> In our previous discussion I was primarily concerned with the ixgbe driver, which already had a number of scalar code paths as well as the vector one, so I was very keen there not to see more code paths created. However, while I hate seeing more code paths created that need to be maintained, I am ok with having them created if the benefit is big enough. Up till now code path selection would have been done at compile-time, but you've convinced me that if we have the code paths there, selecting them at runtime makes more sense for a packaged build. 
> For ACL specifically, I generally defer to Konstantin as the expert in this area. If we need separate code paths for scalar, SSE and AVX, and each gives considerable performance improvement over the other, then I'm ok with that.
> 
> /Bruce
> 

I'm still not sure how you thought I was creating new code paths in the ixgbe
driver using run time selection vs. compile time selection, but regardless, if
run time path selection is the consensus, thats good, I can do that.

Neil
 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-06 17:27           ` Neil Horman
@ 2014-08-12 23:19             ` Thomas Monjalon
  2014-08-13 12:33               ` Neil Horman
  0 siblings, 1 reply; 14+ messages in thread
From: Thomas Monjalon @ 2014-08-12 23:19 UTC (permalink / raw)
  To: Neil Horman, Richardson, Bruce, Ananyev, Konstantin; +Cc: dev

Hi all,

2014-08-06 13:27, Neil Horman:
> Richardson, Bruce wrote:
> > Neil Horman
> > > Ananyev, Konstantin wrote:
> > > > Neil Horman
> > > > > Ananyev, Konstantin wrote:
> > > > > > As you probably note - we do have a scalar version of  rte_acl_classify():
> > > > > > rte_acl_classify_scalar().
> > > > > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > > > > Unfortunately it is all mixed right now into one file and 'scalar' version
> > > > > > could use few sse4 instrincts through resolve_priority().
> > > > > > Another thing - we consider to add another version of rte_acl_classify()
> > > > > > that will use avx2 instrincts.
> > > > > > If we go the way you suggest - I am afraid will soon have to provide scalar
> > > > > > equivalents for several AVX2 instrincts too.
> > > > > > So in summary that way (providing our own scalar equivalents of SIMD
> > > > > > instrincts) seems to me slow, hard to maintain and error
> > > > > > prone.
> > > > > >
> > > > > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > > > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > > > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2
> > > > > > and upper, return ENOTSUP on lower arch.
> > > > > > In future: rte_acl_classify_avx2 - could be build and used only on systems
> > > > > > with avx2 and upper, return ENOTSUP on lower arch.
> > > > > >
> > > > > > I am looking at rte_acl right now anyway.
> > > > > > So will try to come up with something workable.
> > > > > 
> > > > > So, this is exactly the opposite of what Bruce and I just spent several days
> > > > > and a huge email thread that you clearly are aware of discussing run time versus
> > > > > compile time selection of paths.  At this point I'm done ping ponging between
> > > > > your opposing viewpoints.  If you want to implement something that does
> > > > > run time checking, I'm fine with it, but I'm not going back and forth until you
> > > > > two come to an agreement on this.
> > > >
> > > > Right now, I am not talking about 'run time vs compile time selection'.
> > > 
> > > But you are talking about exactly that, allbeit implicitly.  To implement what
> > > you recommend above (that being multiple functional paths that return a not
> > > supported error code at run time), we need to make run time tests for what the
> > > cpu supports.  While I'm actually ok with doing that (I think it makes alot of
> > > sense), Bruce and I just spent several days and dozens of emails debating that,
> > > so you can understand why I don't want to write yet another version of this
> > > patch that requires doing the exact thing we just argued about, especially if it
> > > means he's going to pipe back up and say no, driving me back to a common single
> > > implementation that compiles and runs for all platforms.  I'm not going to keep
> > > re-writing this boucing back and forth between your opposing viewpoints.  We
> > > need to agree on a direction before I make another pass at this.
> > > 
> > > > 2) allow easily add(/modify) code path optimised for particular architecture.
> > > > Without need to modify/re-test what you call  'least common denominator'
> > > > code path.
> > > > And visa-versa, if someone find a way to optimise common code path - no
> > > > need to touch/retest architecture specific ones.
> > > 
> > > So I'm fine with this, but it is anathema to what Bruce advocated for when I did
> > > this latest iteration.  Bruce advocated for a single common path that compiled
> > > in all cases.  Bruce, do you want to comment here?  I'd really like to get this
> > > settled before I go try this again.
> > 
> > In our previous discussion I was primarily concerned with the ixgbe driver,
> > which already had a number of scalar code paths as well as the vector one,
> > so I was very keen there not to see more code paths created.
> > However, while I hate seeing more code paths created that need to be maintained,
> > I am ok with having them created if the benefit is big enough.
> > Up till now code path selection would have been done at compile-time, but you've
> > convinced me that if we have the code paths there, selecting them at runtime makes
> > more sense for a packaged build. 
> > For ACL specifically, I generally defer to Konstantin as the expert in this area.
> > If we need separate code paths for scalar, SSE and AVX, and each gives considerable
> > performance improvement over the other, then I'm ok with that.
> 
> I'm still not sure how you thought I was creating new code paths in the ixgbe
> driver using run time selection vs. compile time selection, but regardless, if
> run time path selection is the consensus, thats good, I can do that.

Thanks everyone, it seems we have a consensus.
I think it's important to summarize it here:

1) If benefit is big, we allow creation of different code paths,
which means different implementations of a same feature in order to have
best performance with recent CPU.

2) The choice of the code path can be done at run-time, so it is possible
to package a binary which works on default architecture and use latest
instructions if available.

The current situation requires to build DPDK for the right architecture
(native one) in order to have the best performances.
This situation will be improved in some critical areas but it isn't planned
to fork code paths systematically.

Thanks everyone for these efforts
-- 
Thomas

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
  2014-08-12 23:19             ` Thomas Monjalon
@ 2014-08-13 12:33               ` Neil Horman
  0 siblings, 0 replies; 14+ messages in thread
From: Neil Horman @ 2014-08-13 12:33 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev

On Wed, Aug 13, 2014 at 01:19:09AM +0200, Thomas Monjalon wrote:
> Hi all,
> 
> 2014-08-06 13:27, Neil Horman:
> > Richardson, Bruce wrote:
> > > Neil Horman
> > > > Ananyev, Konstantin wrote:
> > > > > Neil Horman
> > > > > > Ananyev, Konstantin wrote:
> > > > > > > As you probably note - we do have a scalar version of  rte_acl_classify():
> > > > > > > rte_acl_classify_scalar().
> > > > > > > So I think it might be faster than vector one with 'emulated' instrincts.
> > > > > > > Unfortunately it is all mixed right now into one file and 'scalar' version
> > > > > > > could use few sse4 instrincts through resolve_priority().
> > > > > > > Another thing - we consider to add another version of rte_acl_classify()
> > > > > > > that will use avx2 instrincts.
> > > > > > > If we go the way you suggest - I am afraid will soon have to provide scalar
> > > > > > > equivalents for several AVX2 instrincts too.
> > > > > > > So in summary that way (providing our own scalar equivalents of SIMD
> > > > > > > instrincts) seems to me slow, hard to maintain and error
> > > > > > > prone.
> > > > > > >
> > > > > > > What porbably can be done instead: rework acl_run.c a bit, so it provide:
> > > > > > > rte_acl_classify_scalar() - could be build and used on all systems.
> > > > > > > rte_acl_classify_sse() - could be build and used only on systems with sse4.2
> > > > > > > and upper, return ENOTSUP on lower arch.
> > > > > > > In future: rte_acl_classify_avx2 - could be build and used only on systems
> > > > > > > with avx2 and upper, return ENOTSUP on lower arch.
> > > > > > >
> > > > > > > I am looking at rte_acl right now anyway.
> > > > > > > So will try to come up with something workable.
> > > > > > 
> > > > > > So, this is exactly the opposite of what Bruce and I just spent several days
> > > > > > and a huge email thread that you clearly are aware of discussing run time versus
> > > > > > compile time selection of paths.  At this point I'm done ping ponging between
> > > > > > your opposing viewpoints.  If you want to implement something that does
> > > > > > run time checking, I'm fine with it, but I'm not going back and forth until you
> > > > > > two come to an agreement on this.
> > > > >
> > > > > Right now, I am not talking about 'run time vs compile time selection'.
> > > > 
> > > > But you are talking about exactly that, allbeit implicitly.  To implement what
> > > > you recommend above (that being multiple functional paths that return a not
> > > > supported error code at run time), we need to make run time tests for what the
> > > > cpu supports.  While I'm actually ok with doing that (I think it makes alot of
> > > > sense), Bruce and I just spent several days and dozens of emails debating that,
> > > > so you can understand why I don't want to write yet another version of this
> > > > patch that requires doing the exact thing we just argued about, especially if it
> > > > means he's going to pipe back up and say no, driving me back to a common single
> > > > implementation that compiles and runs for all platforms.  I'm not going to keep
> > > > re-writing this boucing back and forth between your opposing viewpoints.  We
> > > > need to agree on a direction before I make another pass at this.
> > > > 
> > > > > 2) allow easily add(/modify) code path optimised for particular architecture.
> > > > > Without need to modify/re-test what you call  'least common denominator'
> > > > > code path.
> > > > > And visa-versa, if someone find a way to optimise common code path - no
> > > > > need to touch/retest architecture specific ones.
> > > > 
> > > > So I'm fine with this, but it is anathema to what Bruce advocated for when I did
> > > > this latest iteration.  Bruce advocated for a single common path that compiled
> > > > in all cases.  Bruce, do you want to comment here?  I'd really like to get this
> > > > settled before I go try this again.
> > > 
> > > In our previous discussion I was primarily concerned with the ixgbe driver,
> > > which already had a number of scalar code paths as well as the vector one,
> > > so I was very keen there not to see more code paths created.
> > > However, while I hate seeing more code paths created that need to be maintained,
> > > I am ok with having them created if the benefit is big enough.
> > > Up till now code path selection would have been done at compile-time, but you've
> > > convinced me that if we have the code paths there, selecting them at runtime makes
> > > more sense for a packaged build. 
> > > For ACL specifically, I generally defer to Konstantin as the expert in this area.
> > > If we need separate code paths for scalar, SSE and AVX, and each gives considerable
> > > performance improvement over the other, then I'm ok with that.
> > 
> > I'm still not sure how you thought I was creating new code paths in the ixgbe
> > driver using run time selection vs. compile time selection, but regardless, if
> > run time path selection is the consensus, thats good, I can do that.
> 
> Thanks everyone, it seems we have a consensus.
> I think it's important to summarize it here:
> 
> 1) If benefit is big, we allow creation of different code paths,
> which means different implementations of a same feature in order to have
> best performance with recent CPU.
> 
Agreed.

> 2) The choice of the code path can be done at run-time, so it is possible
> to package a binary which works on default architecture and use latest
> instructions if available.
> 
Agreed.

> The current situation requires to build DPDK for the right architecture
> (native one) in order to have the best performances.
> This situation will be improved in some critical areas but it isn't planned
> to fork code paths systematically.
> 
Right, higest performance case requires a native build, but a high compatibility
use case (i.e. default build), still needs to be able to use faster paths when
the cpu supports them.  Konstatin's patch does all of this, we just need to fix
the use of the exported function pointer.

Neil

> Thanks everyone for these efforts
> -- 
> Thomas
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-08-13 12:30 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-08-04 15:35 [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code Neil Horman
2014-08-05 15:26 ` Ananyev, Konstantin
2014-08-05 18:20   ` Neil Horman
2014-08-06 10:52     ` Ananyev, Konstantin
2014-08-06 12:12       ` Neil Horman
2014-08-06 12:23         ` Ananyev, Konstantin
2014-08-06 13:35           ` Neil Horman
2014-08-06 11:39     ` Ananyev, Konstantin
2014-08-06 12:18       ` Neil Horman
2014-08-06 12:26         ` Ananyev, Konstantin
2014-08-06 16:59         ` Richardson, Bruce
2014-08-06 17:27           ` Neil Horman
2014-08-12 23:19             ` Thomas Monjalon
2014-08-13 12:33               ` Neil Horman

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).