From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: <nhorman@tuxdriver.com>
Received: from smtp.tuxdriver.com (charlotte.tuxdriver.com [70.61.120.58])
	by dpdk.org (Postfix) with ESMTP id 1028C58DC
	for <dev@dpdk.org>; Mon, 4 Aug 2014 17:33:56 +0200 (CEST)
Received: from hmsreliant.think-freely.org
	([2001:470:8:a08:7aac:c0ff:fec2:933b] helo=localhost)
	by smtp.tuxdriver.com with esmtpsa (TLSv1:AES128-SHA:128) (Exim 4.63)
	(envelope-from <nhorman@tuxdriver.com>)
	id 1XEKIu-000570-Cr; Mon, 04 Aug 2014 11:36:11 -0400
From: Neil Horman <nhorman@tuxdriver.com>
To: dev@dpdk.org
Date: Mon, 4 Aug 2014 11:35:58 -0400
Message-Id: <1407166558-9532-1-git-send-email-nhorman@tuxdriver.com>
X-Mailer: git-send-email 1.8.3.1
X-Spam-Score: -2.9 (--)
X-Spam-Status: No
Subject: [dpdk-dev] [PATCH] acl: If build does not support sse4.2, emulate missing instructions with C code
X-BeenThere: dev@dpdk.org
X-Mailman-Version: 2.1.15
Precedence: list
List-Id: patches and discussions about DPDK
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Mon, 04 Aug 2014 15:33:56 -0000

The ACL library makes extensive use of SSE4.1/SSE4.2 instructions, which
means the default build can't compile this library.  Work around the
problem by testing the __SSE4_1__ definition in the acl_vect.h file and
defining the macros there either as intrinsics or as C-level equivalents.

Note this is a minimal patch, adjusting only the definitions that are
currently used in the ACL library.  Only compile tested so far, but I
wanted to post it for early review so that others could aid in unit
testing.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Thomas Monjalon
CC: "Konstantin Ananyev"
CC: Bruce Richardson
---
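For anyone who wants to help with testing: below is a quick standalone
harness (not part of the patch; check_emul.c and the emul_* names are
local to this sketch only) that compares the C fallbacks against the
real SSE4.1 intrinsics on a box that has them.  It duplicates the
fallback logic rather than pulling in acl_vect.h, so treat it as a
sanity check of the emulation logic, not of the library integration.

/*
 * check_emul.c: compare scalar re-implementations of pblendvb, ptest
 * and pinsrd against the SSE4.1 intrinsics.  Build on an SSE4.1
 * capable host with e.g.:
 *	gcc -O2 -msse4.1 -o check_emul check_emul.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <smmintrin.h>

static __m128i emul_blendv8(__m128i dst, __m128i src, __m128i mask)
{
	unsigned char d[16], s[16], m[16];
	int i;

	_mm_storeu_si128((__m128i *)d, dst);
	_mm_storeu_si128((__m128i *)s, src);
	_mm_storeu_si128((__m128i *)m, mask);

	/* pblendvb takes the src byte wherever the mask byte MSB is set */
	for (i = 0; i < 16; i++)
		if (m[i] & 0x80)
			d[i] = s[i];

	return _mm_loadu_si128((__m128i *)d);
}

static int emul_testz(__m128i a, __m128i b)
{
	unsigned long long ta[2], tb[2];

	_mm_storeu_si128((__m128i *)ta, a);
	_mm_storeu_si128((__m128i *)tb, b);

	/* ptest sets ZF (returns 1) only when a & b has no bits set */
	return ((ta[0] & tb[0]) | (ta[1] & tb[1])) == 0;
}

static __m128i emul_insert32(__m128i dst, int32_t val, int idx)
{
	int32_t t[4];

	_mm_storeu_si128((__m128i *)t, dst);
	t[idx & 0x3] = val;	/* pinsrd overwrites one 32 bit lane */
	return _mm_loadu_si128((__m128i *)t);
}

int main(void)
{
	unsigned char buf[3][16];
	__m128i a, b, m, r0, r1;
	int32_t v;
	int i, j, k;

	srand(1);
	for (i = 0; i < 100000; i++) {
		for (j = 0; j < 3; j++)
			for (k = 0; k < 16; k++)
				buf[j][k] = rand() & 0xff;

		a = _mm_loadu_si128((__m128i *)buf[0]);
		b = _mm_loadu_si128((__m128i *)buf[1]);
		m = _mm_loadu_si128((__m128i *)buf[2]);
		v = rand();

		r0 = _mm_blendv_epi8(a, b, m);
		r1 = emul_blendv8(a, b, m);
		if (memcmp(&r0, &r1, sizeof(r0)) != 0) {
			printf("blendv8 mismatch\n");
			return 1;
		}

		if (_mm_testz_si128(a, b) != emul_testz(a, b)) {
			printf("testz mismatch\n");
			return 1;
		}

		r0 = _mm_insert_epi32(a, v, 2);
		r1 = emul_insert32(a, v, 2);
		if (memcmp(&r0, &r1, sizeof(r0)) != 0) {
			printf("insert32 mismatch\n");
			return 1;
		}
	}
	printf("emulation checks passed\n");
	return 0;
}

Obviously not a substitute for running the ACL unit tests on a build
without SSE4.x, but it should catch the easy mistakes.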
 lib/librte_acl/acl_bld.c  |   3 +-
 lib/librte_acl/acl_vect.h | 102 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/lib/librte_acl/acl_bld.c b/lib/librte_acl/acl_bld.c
index 873447b..de974a4 100644
--- a/lib/librte_acl/acl_bld.c
+++ b/lib/librte_acl/acl_bld.c
@@ -31,7 +31,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <nmmintrin.h>
 #include <rte_acl.h>
 #include "tb_mem.h"
 #include "acl.h"
@@ -1481,7 +1480,7 @@ acl_calc_wildness(struct rte_acl_build_rule *head,
 		switch (rule->config->defs[n].type) {
 		case RTE_ACL_FIELD_TYPE_BITMASK:
 			wild = (size -
-				_mm_popcnt_u32(fld->mask_range.u8)) /
+				__builtin_popcountl(fld->mask_range.u8)) /
 				size;
 			break;
 
diff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h
index d813600..e5f391b 100644
--- a/lib/librte_acl/acl_vect.h
+++ b/lib/librte_acl/acl_vect.h
@@ -34,6 +34,10 @@
 #ifndef _RTE_ACL_VECT_H_
 #define _RTE_ACL_VECT_H_
 
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
 /**
  * @file
  *
@@ -44,12 +48,12 @@ extern "C" {
 #endif
 
+
 #define MM_ADD16(a, b) _mm_add_epi16(a, b)
 #define MM_ADD32(a, b) _mm_add_epi32(a, b)
 #define MM_ALIGNR8(a, b, c) _mm_alignr_epi8(a, b, c)
 #define MM_AND(a, b) _mm_and_si128(a, b)
 #define MM_ANDNOT(a, b) _mm_andnot_si128(a, b)
-#define MM_BLENDV8(a, b, c) _mm_blendv_epi8(a, b, c)
 #define MM_CMPEQ16(a, b) _mm_cmpeq_epi16(a, b)
 #define MM_CMPEQ32(a, b) _mm_cmpeq_epi32(a, b)
 #define MM_CMPEQ8(a, b) _mm_cmpeq_epi8(a, b)
@@ -59,7 +63,6 @@ extern "C" {
 #define MM_CVT32(a) _mm_cvtsi128_si32(a)
 #define MM_CVTU32(a) _mm_cvtsi32_si128(a)
 #define MM_INSERT16(a, c, b) _mm_insert_epi16(a, c, b)
-#define MM_INSERT32(a, c, b) _mm_insert_epi32(a, c, b)
 #define MM_LOAD(a) _mm_load_si128(a)
 #define MM_LOADH_PI(a, b) _mm_loadh_pi(a, b)
 #define MM_LOADU(a) _mm_loadu_si128(a)
@@ -82,7 +85,6 @@ extern "C" {
 #define MM_SRL32(a, b) _mm_srli_epi32(a, b)
 #define MM_STORE(a, b) _mm_store_si128(a, b)
 #define MM_STOREU(a, b) _mm_storeu_si128(a, b)
-#define MM_TESTZ(a, b) _mm_testz_si128(a, b)
 #define MM_XOR(a, b) _mm_xor_si128(a, b)
 
 #define MM_SET16(a, b, c, d, e, f, g, h) \
@@ -93,6 +95,100 @@ extern "C" {
 	_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7, \
 		c8, c9, cA, cB, cC, cD, cE, cF)
 
+
+#ifndef __SSE4_1__
+
+/*
+ * C fallbacks for the SSE4.1 instructions the ACL code relies on
+ * (pblendvb, ptest and pinsrd), used when the target does not
+ * support SSE4.1.
+ */
+static inline xmm_t pblendvb(xmm_t dst, xmm_t src, xmm_t mask)
+{
+	unsigned char tmpd[16], tmps[16], tmpm[16];
+	int i;
+
+	MM_STOREU((xmm_t *)&tmpd, dst);
+	MM_STOREU((xmm_t *)&tmps, src);
+	MM_STOREU((xmm_t *)&tmpm, mask);
+
+	/* take the src byte wherever the mask byte has its MSB set */
+	for (i = 0; i < 16; i++)
+		if (tmpm[i] & 0x80)
+			tmpd[i] = tmps[i];
+
+	dst = MM_LOADU((xmm_t *)&tmpd);
+
+	return dst;
+}
+
+#define MM_BLENDV8(a, b, c) pblendvb(a, b, c)
+
+
+static inline int ptestz(xmm_t a, xmm_t b)
+{
+	unsigned long long tmpa[2], tmpb[2];
+
+	MM_STOREU((xmm_t *)&tmpa, a);
+	MM_STOREU((xmm_t *)&tmpb, b);
+
+	/* ptest sets ZF (returns 1) only when a & b has no bits set */
+	if (tmpa[0] & tmpb[0])
+		return 0;
+	if (tmpa[1] & tmpb[1])
+		return 0;
+
+	return 1;
+}
+
+#define MM_TESTZ(a, b) ptestz(a, b)
+
+static inline xmm_t pinsrd(xmm_t dst, int32_t val, int idx)
+{
+	int32_t tmpa[4];
+
+	MM_STOREU((xmm_t *)&tmpa, dst);
+
+	/*
+	 * pinsrd replaces one aligned 32 bit lane; the immediate
+	 * selects which of the four dwords gets overwritten, so no
+	 * masking or shifting across word boundaries is needed.
+	 */
+	tmpa[idx & 0x3] = val;
+
+	dst = MM_LOADU((xmm_t *)&tmpa);
+	return dst;
+}
+
+#define MM_INSERT32(a, c, b) pinsrd(a, c, b)
+
+#else
+#define MM_BLENDV8(a, b, c) _mm_blendv_epi8(a, b, c)
+#define MM_TESTZ(a, b) _mm_testz_si128(a, b)
+#define MM_INSERT32(a, c, b) _mm_insert_epi32(a, c, b)
+#endif
+
 #ifdef RTE_ARCH_X86_64
 #define MM_CVT64(a) _mm_cvtsi128_si64(a)
-- 
1.8.3.1