DPDK patches and discussions
 help / color / mirror / Atom feed
From: Neil Horman <nhorman@tuxdriver.com>
To: dev@dpdk.org
Subject: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
Date: Mon,  4 Aug 2014 11:35:58 -0400	[thread overview]
Message-ID: <1407166558-9532-1-git-send-email-nhorman@tuxdriver.com> (raw)

The ACL library makes extensive use of some SSE4.2 instructions, which means the
default build can't compile this library.  Work around the problem by testing
the __SSE4_1__ definition in the acl_vect.h file and defining the macros there
as intrinsics or C-level equivalents.  Note this is a minimal patch, adjusting
only the definitions that are currently used in the ACL library.

Only compile tested so far, but I wanted to post it for early review so that
others could aid in unit testing.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Thomas Monjalon <thomas.monjalon@6wind.com>
CC: "Konstantin Ananyev" <konstantin.ananyev@intel.com>
CC: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_acl/acl_bld.c  |   3 +-
 lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 873447b..de974a4 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -31,7 +31,6 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <nmmintrin.h>
 #include <rte_acl.h>
 #include "tb_mem.h"
 #include "acl.h"
@@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
 			switch (rule->config->defs[n].type) {
 			case RTE_ACL_FIELD_TYPE_BITMASK:
 				wild = (size -
-					_mm_popcnt_u32(fld->mask_range.u8)) /
+					__builtin_popcountl(fld->mask_range.u8)) /
 					size;
 				break;
 
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..e5f391b 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -34,6 +34,10 @@
 #ifndef _RTE_ACL_VECT_H_
 #define _RTE_ACL_VECT_H_
 
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
 /**
  * @file
  *
@@ -44,12 +48,12 @@
 extern "C" {
 #endif
 
+
 #define	MM_ADD16(a, b)		_mm_add_epi16(a, b)
 #define	MM_ADD32(a, b)		_mm_add_epi32(a, b)
 #define	MM_ALIGNR8(a, b, c)	_mm_alignr_epi8(a, b, c)
 #define	MM_AND(a, b)		_mm_and_si128(a, b)
 #define MM_ANDNOT(a, b)		_mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
 #define MM_CMPEQ16(a, b)	_mm_cmpeq_epi16(a, b)
 #define MM_CMPEQ32(a, b)	_mm_cmpeq_epi32(a, b)
 #define	MM_CMPEQ8(a, b)		_mm_cmpeq_epi8(a, b)
@@ -59,7 +63,6 @@ extern "C" {
 #define	MM_CVT32(a)		_mm_cvtsi128_si32(a)
 #define MM_CVTU32(a)		_mm_cvtsi32_si128(a)
 #define	MM_INSERT16(a, c, b)	_mm_insert_epi16(a, c, b)
-#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
 #define	MM_LOAD(a)		_mm_load_si128(a)
 #define	MM_LOADH_PI(a, b)	_mm_loadh_pi(a, b)
 #define	MM_LOADU(a)		_mm_loadu_si128(a)
@@ -82,7 +85,6 @@ extern "C" {
 #define	MM_SRL32(a, b)		_mm_srli_epi32(a, b)
 #define	MM_STORE(a, b)		_mm_store_si128(a, b)
 #define	MM_STOREU(a, b)		_mm_storeu_si128(a, b)
-#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
 #define	MM_XOR(a, b)		_mm_xor_si128(a, b)
 
 #define	MM_SET16(a, b, c, d, e, f, g, h)	\
@@ -93,6 +95,100 @@ extern "C" {
 	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,	\
 		c8, c9, cA, cB, cC, cD, cE, cF)
 
+
+#ifndef __SSE4_1__
+/**
+ * Emulate _mm_blendv_epi8 (PBLENDVB) for builds without SSE4.1.
+ *
+ * @param dst
+ *   Vector supplying the bytes that are kept where the mask is clear.
+ * @param src
+ *   Vector supplying the bytes that are taken where the mask is set.
+ * @param mask
+ *   Per-byte selector; only the most significant bit of each byte is used.
+ * @return
+ *   Vector whose byte i is src byte i when bit 7 of mask byte i is set,
+ *   and dst byte i otherwise.
+ */
+static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
+{
+	unsigned char tmpd[16], tmps[16], tmpm[16];
+	int i;
+
+	MM_STOREU((xmm_t *)&tmpd, dst);
+	MM_STOREU((xmm_t *)&tmps, src);
+	MM_STOREU((xmm_t *)&tmpm, mask);
+
+	/*
+	 * Blend in the byte copies, not the vector arguments: the result is
+	 * reloaded from tmpd, so that is the array that must be updated.
+	 */
+	for (i = 0; i < 16; i++) {
+		if (tmpm[i] & 0x80)
+			tmpd[i] = tmps[i];
+	}
+
+	return MM_LOADU((xmm_t *)&tmpd);
+}
+
+#define MM_BLENDV8(a, b, c)	pblendvb(a, b, c)
+
+
+/**
+ * Emulate _mm_testz_si128 (PTEST, ZF result) for builds without SSE4.1.
+ *
+ * @param a
+ *   First 128-bit operand.
+ * @param b
+ *   Second 128-bit operand.
+ * @return
+ *   1 when the bitwise AND of a and b is all zero, 0 when the operands
+ *   share at least one set bit.
+ */
+static inline int ptestz(xmm_t a, xmm_t b)
+{
+	unsigned long long tmpa[2], tmpb[2];
+
+	MM_STOREU((xmm_t *)&tmpa, a);
+	MM_STOREU((xmm_t *)&tmpb, b);
+
+	/* Any common bit in either 64-bit half means "not zero". */
+	if ((tmpa[0] & tmpb[0]) != 0 || (tmpa[1] & tmpb[1]) != 0)
+		return 0;
+
+	return 1;
+}
+
+#define	MM_TESTZ(a, b)		ptestz(a, b)
+
+/**
+ * Emulate _mm_insert_epi32 (PINSRD) for builds without SSE4.1.
+ *
+ * @param dst
+ *   Vector to copy; every lane other than the selected one is preserved.
+ * @param val
+ *   32-bit value to place in the selected lane.
+ * @param off
+ *   Lane index, not a bit offset.  Only the two low bits are used, as
+ *   with the imm8 operand of PINSRD, so the selected lane is 0..3.
+ * @return
+ *   Copy of dst with lane (off & 3) replaced by val.
+ */
+static inline xmm_t pinsrd(xmm_t dst, int32_t val, char off)
+{
+	int32_t lanes[4];
+
+	MM_STOREU((xmm_t *)&lanes, dst);
+
+	/*
+	 * A dword lane never straddles a 64-bit boundary, so a plain
+	 * array store is enough; no shifting or masking is needed.
+	 */
+	lanes[off & 3] = val;
+
+	return MM_LOADU((xmm_t *)&lanes);
+}
+
+#define	MM_INSERT32(a, c, b)	pinsrd(a, c, b)
+
+#else
+#define	MM_BLENDV8(a, b, c)	_mm_blendv_epi8(a, b, c)
+#define	MM_TESTZ(a, b)		_mm_testz_si128(a, b)
+#define	MM_INSERT32(a, c, b)	_mm_insert_epi32(a, c, b)
+#endif
+
 #ifdef RTE_ARCH_X86_64
 
 #define	MM_CVT64(a)		_mm_cvtsi128_si64(a)
-- 
1.8.3.1

             reply	other threads:[~2014-08-04 15:33 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-04 15:35 Neil Horman [this message]
2014-08-05 15:26 ` Ananyev, Konstantin
2014-08-05 18:20   ` Neil Horman
2014-08-06 10:52     ` Ananyev, Konstantin
2014-08-06 12:12       ` Neil Horman
2014-08-06 12:23         ` Ananyev, Konstantin
2014-08-06 13:35           ` Neil Horman
2014-08-06 11:39     ` Ananyev, Konstantin
2014-08-06 12:18       ` Neil Horman
2014-08-06 12:26         ` Ananyev, Konstantin
2014-08-06 16:59         ` Richardson, Bruce
2014-08-06 17:27           ` Neil Horman
2014-08-12 23:19             ` Thomas Monjalon
2014-08-13 12:33               ` Neil Horman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1407166558-9532-1-git-send-email-nhorman@tuxdriver.com \
    --to=nhorman@tuxdriver.com \
    --cc=dev@dpdk.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).