DPDK patches and discussions
 help / color / mirror / Atom feed
* [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
@ 2017-04-27 14:06 Ashwin Sekhar T K
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:06 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 576d60a..283743e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-04-27 14:06 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-04  6:56 ` Ashwin Sekhar T K
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (3 more replies)
  2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
                   ` (2 subsequent siblings)
  3 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:56 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

added the following flags to detect the corresponding
capability at compile time
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

at run-time, the following flags can be used to detect these
capabilities
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v3:
* Moved the feature detection changes into separate commit
* Added the AES, SHA1, SHA2 capabilities also under the CRYPTO flag
  along with PMULL

 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c14..a813c91 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04 15:22     ` Jan Viktorin
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h

Tested compilation on arm64 with gcc

Tested compilation on x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v3:
* Moved changes for GCC_VERSION into a separate commit
* Moved GCC_VERSION definition to common header
* Removed the same from rte_lru.h

 lib/librte_eal/common/include/rte_common.h | 8 ++++++++
 lib/librte_table/rte_lru.h                 | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e..717b445 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,14 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef __GNUC__
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION (0)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062..3c5aca7 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-05-04 15:20   ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed merge conflict in MAINTAINERS

v3:
* Moved feature detection changes and GCC_VERSION definition
  changes to separate commit
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  29 ++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2..66d64c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..f28a19d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,7 +34,9 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +80,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c..d01cf4b 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v3 4/4] test: add tests for arm64 CRC neon versions
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-04  6:57   ` Ashwin Sekhar T K
  2017-05-04 15:20   ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-04  6:57 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed checkpatch errors/warnings

 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..9f2a17d 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
@ 2017-05-04 15:20   ` Jan Viktorin
  2017-05-04 22:10     ` Thomas Monjalon
  3 siblings, 1 reply; 33+ messages in thread
From: Jan Viktorin @ 2017-05-04 15:20 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, jerin.jacob,
	jianbo.liu, dev

On Wed,  3 May 2017 23:56:59 -0700
Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:

> armv8-a has optional CRYPTO extension which adds the
> AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
> enables code generation for the ARMv8-A architecture together
> with the optional CRYPTO extensions.
> 
> added the following flags to detect the corresponding
> capability at compile time
>  * RTE_MACHINE_CPUFLAG_AES
>  * RTE_MACHINE_CPUFLAG_PMULL
>  * RTE_MACHINE_CPUFLAG_SHA1
>  * RTE_MACHINE_CPUFLAG_SHA2
> 
> at run-time, the following flags can be used to detect these
> capabilities
>  * RTE_CPUFLAG_AES
>  * RTE_CPUFLAG_PMULL
>  * RTE_CPUFLAG_SHA1
>  * RTE_CPUFLAG_SHA2
> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>

Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header
  2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-04 15:22     ` Jan Viktorin
  0 siblings, 0 replies; 33+ messages in thread
From: Jan Viktorin @ 2017-05-04 15:22 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, jerin.jacob,
	jianbo.liu, dev

On Wed,  3 May 2017 23:57:00 -0700
Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:

> moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h

s/moved/Moved/

> to lib/librte_eal/common/include/rte_common.h

dot after the sentence

> 
> Tested compilation on arm64 with gcc
> 
> Tested compilation on x86 with gcc and clang

Tested compilation on:

* arm64 with gcc
* x86 with gcc and clang

> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>

Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-04 15:20   ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
@ 2017-05-04 22:10     ` Thomas Monjalon
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-05-04 22:10 UTC (permalink / raw)
  To: Jan Viktorin, Ashwin Sekhar T K, jerin.jacob, jianbo.liu; +Cc: dev

04/05/2017 17:20, Jan Viktorin:
> On Wed,  3 May 2017 23:56:59 -0700
> Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:
> 
> > armv8-a has optional CRYPTO extension which adds the
> > AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
> > enables code generation for the ARMv8-A architecture together
> > with the optional CRYPTO extensions.
> > 
> > added the following flags to detect the corresponding
> > capability at compile time
> >  * RTE_MACHINE_CPUFLAG_AES
> >  * RTE_MACHINE_CPUFLAG_PMULL
> >  * RTE_MACHINE_CPUFLAG_SHA1
> >  * RTE_MACHINE_CPUFLAG_SHA2
> > 
> > at run-time, the following flags can be used to detect these
> > capabilities
> >  * RTE_CPUFLAG_AES
> >  * RTE_CPUFLAG_PMULL
> >  * RTE_CPUFLAG_SHA1
> >  * RTE_CPUFLAG_SHA2
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> 
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>

Do you agree that this series, and others bringing NEON optimizations,
are not candidates for 17.05?
If you see an urgent fix in all these NEON patches, please shout now.

Thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v4 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-04-27 14:06 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-09  9:53 ` Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (2 more replies)
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  3 siblings, 3 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

added the following flags to detect the corresponding
capability at compile time
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

at run-time, the following flags can be used to detect these
capabilities
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
v3:
* Moved the feature detection changes into separate commit
* Added the AES, SHA1, SHA2 capabilities also under the CRYPTO flag
  along with PMULL

 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c14..a813c91 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v4 2/4] eal: move gcc version definition to common header
  2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
v3:
* Moved changes for GCC_VERSION into a separate commit
* Moved GCC_VERSION definition to common header
* Removed the same from rte_lru.h

v4:
* Edited the commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru.h                 | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e..ff4a12b 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062..5cc5966 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -120,7 +114,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -166,7 +160,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2017-05-12  5:51     ` Jianbo Liu
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed merge conflict in MAINTAINERS

v3:
* Moved feature detection changes and GCC_VERSION definition
  changes to separate commit
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense

v4:
* Rebased on top of latest commit

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2..66d64c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..b215cc9 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 9d1ee63..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
+#endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c..d01cf4b 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v4 4/4] test: add tests for arm64 CRC neon versions
  2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-09  9:53   ` Ashwin Sekhar T K
  2 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-09  9:53 UTC (permalink / raw)
  To: cristian.dumitrescu, thomas, jasvinder.singh, viktorin,
	jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed checkpatch errors/warnings

 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..9f2a17d 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12  5:51     ` Jianbo Liu
  2017-05-12  7:25       ` Sekhar, Ashwin
  0 siblings, 1 reply; 33+ messages in thread
From: Jianbo Liu @ 2017-05-12  5:51 UTC (permalink / raw)
  To: Ashwin Sekhar T K
  Cc: cristian.dumitrescu, thomas, jasvinder.singh, Jan Viktorin,
	Jerin Jacob, dev

On 9 May 2017 at 17:53, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Added CRC compute APIs for arm64 utilizing the pmull
> capability
>
> Added new file net_crc_neon.h to hold the arm64 pmull
> CRC implementation
>
> Verified the changes with crc_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
> v2:
> * Fixed merge conflict in MAINTAINERS
>
> v3:
> * Moved feature detection changes and GCC_VERSION definition
>   changes to separate commit
> * Replaced usage of assert() with RTE_ASSERT()
> * Made the comments in rte_vect.h more positive in sense
>
> v4:
> * Rebased on top of latest commit
>
>  MAINTAINERS                                       |   1 +
>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
>  lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c                      |  34 ++-
>  lib/librte_net/rte_net_crc.h                      |   2 +
>  5 files changed, 416 insertions(+), 6 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_neon.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index b6495d2..66d64c2 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
>  F: lib/librte_acl/acl_run_neon.*
>  F: lib/librte_lpm/rte_lpm_neon.h
>  F: lib/librte_hash/rte*_arm64.h
> +F: lib/librte_net/net_crc_neon.h
>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 4107c99..b215cc9 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -35,6 +35,7 @@
>
>  #include <stdint.h>
>  #include "generic/rte_vect.h"
> +#include "rte_debug.h"
>  #include "arm_neon.h"
>
>  #ifdef __cplusplus
> @@ -78,6 +79,33 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
>  }
>  #endif
>
> +#if (GCC_VERSION < 70000)
> +/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
> +static inline uint64x2_t
> +vreinterpretq_u64_p128(poly128_t x)
> +{
> +       return (uint64x2_t)x;
> +}
> +
> +/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
> +static inline poly64x2_t
> +vreinterpretq_p64_u64(uint64x2_t x)
> +{
> +       return (poly64x2_t)x;
> +}
> +
> +/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
> +static inline poly64_t
> +vgetq_lane_p64(poly64x2_t x, const int lane)
> +{
> +       RTE_ASSERT(lane >= 0 && lane <= 1);
> +
> +       poly64_t *p = (poly64_t *)&x;
> +
> +       return p[lane];
> +}
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
> new file mode 100644
> index 0000000..05120a7
> --- /dev/null
> +++ b/lib/librte_net/net_crc_neon.h
> @@ -0,0 +1,357 @@
> +/*
> + *   BSD LICENSE
> + *
> + *   Copyright (C) Cavium networks Ltd. 2017.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Cavium networks nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _NET_CRC_NEON_H_
> +#define _NET_CRC_NEON_H_
> +
> +#include <rte_branch_prediction.h>
> +#include <rte_net_crc.h>
> +#include <rte_vect.h>
> +#include <rte_cpuflags.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/** PMULL CRC computation context structure */
> +struct crc_pmull_ctx {
> +       uint64x2_t rk1_rk2;
> +       uint64x2_t rk5_rk6;
> +       uint64x2_t rk7_rk8;
> +};
> +
> +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> +
> +static inline uint8x16_t
> +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> +{
> +       switch (n) {
> +       case 0: return vextq_u8(v0, v1, 0);
> +       case 1: return vextq_u8(v0, v1, 1);
> +       case 2: return vextq_u8(v0, v1, 2);
> +       case 3: return vextq_u8(v0, v1, 3);
> +       case 4: return vextq_u8(v0, v1, 4);
> +       case 5: return vextq_u8(v0, v1, 5);
> +       case 6: return vextq_u8(v0, v1, 6);
> +       case 7: return vextq_u8(v0, v1, 7);
> +       case 8: return vextq_u8(v0, v1, 8);
> +       case 9: return vextq_u8(v0, v1, 9);
> +       case 10: return vextq_u8(v0, v1, 10);
> +       case 11: return vextq_u8(v0, v1, 11);
> +       case 12: return vextq_u8(v0, v1, 12);
> +       case 13: return vextq_u8(v0, v1, 13);
> +       case 14: return vextq_u8(v0, v1, 14);
> +       case 15: return vextq_u8(v0, v1, 15);
> +       }
> +       return v1;
> +}
> +
> +/**
> + * Shifts right 128 bit register by specified number of bytes
> + *
> + * @param reg 128 bit value
> + * @param num number of bytes to shift reg by (0-16)
> + *
> + * @return reg << (num * 8)
> + */
> +static inline uint64x2_t
> +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> +{
> +       /* Right Shift */
> +       return vreinterpretq_u64_u8(extract_vector(
> +                               vreinterpretq_u8_u64(reg),
> +                               vdupq_n_u8(0),
> +                               num));
> +}
> +
> +/**
> + * Shifts left 128 bit register by specified number of bytes
> + *
> + * @param reg 128 bit value
> + * @param num number of bytes to shift reg by (0-16)
> + *
> + * @return reg << (num * 8)
> + */
> +static inline uint64x2_t
> +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> +{
> +       /* Left Shift */
> +       return vreinterpretq_u64_u8(extract_vector(
> +                               vdupq_n_u8(0),
> +                               vreinterpretq_u8_u64(reg),
> +                               16 - num));
> +}
> +

Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
they are common functions?

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  5:51     ` Jianbo Liu
@ 2017-05-12  7:25       ` Sekhar, Ashwin
  2017-05-12  8:49         ` Jianbo Liu
  0 siblings, 1 reply; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-05-12  7:25 UTC (permalink / raw)
  To: jianbo.liu
  Cc: thomas, jasvinder.singh, cristian.dumitrescu, viktorin, Jacob,
	 Jerin, dev

On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
> On 9 May 2017 at 17:53, Ashwin Sekhar T K
> <ashwin.sekhar@caviumnetworks.com> wrote:
> > 
> > Added CRC compute APIs for arm64 utilizing the pmull
> > capability
> > 
> > Added new file net_crc_neon.h to hold the arm64 pmull
> > CRC implementation
> > 
> > Verified the changes with crc_autotest unit test case
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> > ---
> > v2:
> > * Fixed merge conflict in MAINTAINERS
> > 
> > v3:
> > * Moved feature detection changes and GCC_VERSION definition
> >   changes to separate commit
> > * Replaced usage of assert() with RTE_ASSERT()
> > * Made the comments in rte_vect.h more positive in sense
> > 
> > v4:
> > * Rebased on top of latest commit
> > 
> >  MAINTAINERS                                       |   1 +
> >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
> >  lib/librte_net/net_crc_neon.h                     | 357
> > ++++++++++++++++++++++
> >  lib/librte_net/rte_net_crc.c                      |  34 ++-
> >  lib/librte_net/rte_net_crc.h                      |   2 +
> >  5 files changed, 416 insertions(+), 6 deletions(-)
> >  create mode 100644 lib/librte_net/net_crc_neon.h
> > 
> > 
...
> > +
> > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> > +
> > +static inline uint8x16_t
> > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> > +{
> > +       switch (n) {
> > +       case 0: return vextq_u8(v0, v1, 0);
> > +       case 1: return vextq_u8(v0, v1, 1);
> > +       case 2: return vextq_u8(v0, v1, 2);
> > +       case 3: return vextq_u8(v0, v1, 3);
> > +       case 4: return vextq_u8(v0, v1, 4);
> > +       case 5: return vextq_u8(v0, v1, 5);
> > +       case 6: return vextq_u8(v0, v1, 6);
> > +       case 7: return vextq_u8(v0, v1, 7);
> > +       case 8: return vextq_u8(v0, v1, 8);
> > +       case 9: return vextq_u8(v0, v1, 9);
> > +       case 10: return vextq_u8(v0, v1, 10);
> > +       case 11: return vextq_u8(v0, v1, 11);
> > +       case 12: return vextq_u8(v0, v1, 12);
> > +       case 13: return vextq_u8(v0, v1, 13);
> > +       case 14: return vextq_u8(v0, v1, 14);
> > +       case 15: return vextq_u8(v0, v1, 15);
> > +       }
> > +       return v1;
> > +}
> > +
> > +/**
> > + * Shifts right 128 bit register by specified number of bytes
> > + *
> > + * @param reg 128 bit value
> > + * @param num number of bytes to shift reg by (0-16)
> > + *
> > + * @return reg << (num * 8)
> > + */
> > +static inline uint64x2_t
> > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> > +{
> > +       /* Right Shift */
> > +       return vreinterpretq_u64_u8(extract_vector(
> > +                               vreinterpretq_u8_u64(reg),
> > +                               vdupq_n_u8(0),
> > +                               num));
> > +}
> > +
> > +/**
> > + * Shifts left 128 bit register by specified number of bytes
> > + *
> > + * @param reg 128 bit value
> > + * @param num number of bytes to shift reg by (0-16)
> > + *
> > + * @return reg << (num * 8)
> > + */
> > +static inline uint64x2_t
> > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> > +{
> > +       /* Left Shift */
> > +       return vreinterpretq_u64_u8(extract_vector(
> > +                               vdupq_n_u8(0),
> > +                               vreinterpretq_u8_u64(reg),
> > +                               16 - num));
> > +}
> > +
> Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
> they are common functions?
These are not really common functions. I dont think it will have a
wider usage as its shifting by bytes and not by bits.

In x86 case also, xmm_shift_left is not made a common function.

Moreover, I have not tested the behaviour of these functions when the
shift amt is (< 0) or (> 16) as these cases will never arise in the CRC
code.

Thanks
Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  7:25       ` Sekhar, Ashwin
@ 2017-05-12  8:49         ` Jianbo Liu
  2017-05-12  8:56           ` Sekhar, Ashwin
  0 siblings, 1 reply; 33+ messages in thread
From: Jianbo Liu @ 2017-05-12  8:49 UTC (permalink / raw)
  To: Sekhar, Ashwin
  Cc: thomas, jasvinder.singh, cristian.dumitrescu, viktorin, Jacob,
	Jerin, dev

On 12 May 2017 at 15:25, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
>> On 9 May 2017 at 17:53, Ashwin Sekhar T K
>> <ashwin.sekhar@caviumnetworks.com> wrote:
>> >
>> > Added CRC compute APIs for arm64 utilizing the pmull
>> > capability
>> >
>> > Added new file net_crc_neon.h to hold the arm64 pmull
>> > CRC implementation
>> >
>> > Verified the changes with crc_autotest unit test case
>> >
>> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
>> > ---
>> > v2:
>> > * Fixed merge conflict in MAINTAINERS
>> >
>> > v3:
>> > * Moved feature detection changes and GCC_VERSION definition
>> >   changes to separate commit
>> > * Replaced usage of assert() with RTE_ASSERT()
>> > * Made the comments in rte_vect.h more positive in sense
>> >
>> > v4:
>> > * Rebased on top of latest commit
>> >
>> >  MAINTAINERS                                       |   1 +
>> >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
>> >  lib/librte_net/net_crc_neon.h                     | 357
>> > ++++++++++++++++++++++
>> >  lib/librte_net/rte_net_crc.c                      |  34 ++-
>> >  lib/librte_net/rte_net_crc.h                      |   2 +
>> >  5 files changed, 416 insertions(+), 6 deletions(-)
>> >  create mode 100644 lib/librte_net/net_crc_neon.h
>> >
>> >
> ...
>> > +
>> > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
>> > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
>> > +
>> > +static inline uint8x16_t
>> > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
>> > +{
>> > +       switch (n) {
>> > +       case 0: return vextq_u8(v0, v1, 0);
>> > +       case 1: return vextq_u8(v0, v1, 1);
>> > +       case 2: return vextq_u8(v0, v1, 2);
>> > +       case 3: return vextq_u8(v0, v1, 3);
>> > +       case 4: return vextq_u8(v0, v1, 4);
>> > +       case 5: return vextq_u8(v0, v1, 5);
>> > +       case 6: return vextq_u8(v0, v1, 6);
>> > +       case 7: return vextq_u8(v0, v1, 7);
>> > +       case 8: return vextq_u8(v0, v1, 8);
>> > +       case 9: return vextq_u8(v0, v1, 9);
>> > +       case 10: return vextq_u8(v0, v1, 10);
>> > +       case 11: return vextq_u8(v0, v1, 11);
>> > +       case 12: return vextq_u8(v0, v1, 12);
>> > +       case 13: return vextq_u8(v0, v1, 13);
>> > +       case 14: return vextq_u8(v0, v1, 14);
>> > +       case 15: return vextq_u8(v0, v1, 15);
>> > +       }
>> > +       return v1;
>> > +}
>> > +
>> > +/**
>> > + * Shifts right 128 bit register by specified number of bytes
>> > + *
>> > + * @param reg 128 bit value
>> > + * @param num number of bytes to shift reg by (0-16)
>> > + *
>> > + * @return reg << (num * 8)
>> > + */
>> > +static inline uint64x2_t
>> > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
>> > +{
>> > +       /* Right Shift */
>> > +       return vreinterpretq_u64_u8(extract_vector(
>> > +                               vreinterpretq_u8_u64(reg),
>> > +                               vdupq_n_u8(0),
>> > +                               num));
>> > +}
>> > +
>> > +/**
>> > + * Shifts left 128 bit register by specified number of bytes
>> > + *
>> > + * @param reg 128 bit value
>> > + * @param num number of bytes to shift reg by (0-16)
>> > + *
>> > + * @return reg << (num * 8)
>> > + */
>> > +static inline uint64x2_t
>> > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
>> > +{
>> > +       /* Left Shift */
>> > +       return vreinterpretq_u64_u8(extract_vector(
>> > +                               vdupq_n_u8(0),
>> > +                               vreinterpretq_u8_u64(reg),
>> > +                               16 - num));
>> > +}
>> > +
>> Can you move shift_bytes_right/shift_bytes_left to rte_vect.h because
>> they are common functions?
> These are not really common functions. I dont think it will have a
> wider usage as its shifting by bytes and not by bits.
>

I think these shifting may be used by other functions.
For example, to replace  _mm_srli_si128.

> In x86 case also, xmm_shift_left is not made a common function.
>

But its counterpart right shifting (_mm_srli_si128) is...

> Moreover, I have not tested the behaviour of these functions when the
> shift amt is (< 0) or (> 16) as these cases will never arise in the CRC
> code.
>

You can define thee functions according to current requirement.
And I don't think this parameter can be <0 or > 16.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12  8:49         ` Jianbo Liu
@ 2017-05-12  8:56           ` Sekhar, Ashwin
  0 siblings, 0 replies; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-05-12  8:56 UTC (permalink / raw)
  To: Sekhar, Ashwin, jianbo.liu
  Cc: Jacob,  Jerin, thomas, jasvinder.singh, cristian.dumitrescu,
	viktorin, dev

On Fri, 2017-05-12 at 16:49 +0800, Jianbo Liu wrote:
> On 12 May 2017 at 15:25, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>
> wrote:
> > 
> > On Fri, 2017-05-12 at 13:51 +0800, Jianbo Liu wrote:
> > > 
> > > On 9 May 2017 at 17:53, Ashwin Sekhar T K
> > > <ashwin.sekhar@caviumnetworks.com> wrote:
> > > > 
> > > > 
> > > > Added CRC compute APIs for arm64 utilizing the pmull
> > > > capability
> > > > 
> > > > Added new file net_crc_neon.h to hold the arm64 pmull
> > > > CRC implementation
> > > > 
> > > > Verified the changes with crc_autotest unit test case
> > > > 
> > > > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.
> > > > com>
> > > > ---
> > > > v2:
> > > > * Fixed merge conflict in MAINTAINERS
> > > > 
> > > > v3:
> > > > * Moved feature detection changes and GCC_VERSION definition
> > > >   changes to separate commit
> > > > * Replaced usage of assert() with RTE_ASSERT()
> > > > * Made the comments in rte_vect.h more positive in sense
> > > > 
> > > > v4:
> > > > * Rebased on top of latest commit
> > > > 
> > > >  MAINTAINERS                                       |   1 +
> > > >  lib/librte_eal/common/include/arch/arm/rte_vect.h |  28 ++
> > > >  lib/librte_net/net_crc_neon.h                     | 357
> > > > ++++++++++++++++++++++
> > > >  lib/librte_net/rte_net_crc.c                      |  34 ++-
> > > >  lib/librte_net/rte_net_crc.h                      |   2 +
> > > >  5 files changed, 416 insertions(+), 6 deletions(-)
> > > >  create mode 100644 lib/librte_net/net_crc_neon.h
> > > > 
> > > > 
> > ...
> > > 
> > > > 
> > > > +
> > > > +struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
> > > > +struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
> > > > +
> > > > +static inline uint8x16_t
> > > > +extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
> > > > +{
> > > > +       switch (n) {
> > > > +       case 0: return vextq_u8(v0, v1, 0);
> > > > +       case 1: return vextq_u8(v0, v1, 1);
> > > > +       case 2: return vextq_u8(v0, v1, 2);
> > > > +       case 3: return vextq_u8(v0, v1, 3);
> > > > +       case 4: return vextq_u8(v0, v1, 4);
> > > > +       case 5: return vextq_u8(v0, v1, 5);
> > > > +       case 6: return vextq_u8(v0, v1, 6);
> > > > +       case 7: return vextq_u8(v0, v1, 7);
> > > > +       case 8: return vextq_u8(v0, v1, 8);
> > > > +       case 9: return vextq_u8(v0, v1, 9);
> > > > +       case 10: return vextq_u8(v0, v1, 10);
> > > > +       case 11: return vextq_u8(v0, v1, 11);
> > > > +       case 12: return vextq_u8(v0, v1, 12);
> > > > +       case 13: return vextq_u8(v0, v1, 13);
> > > > +       case 14: return vextq_u8(v0, v1, 14);
> > > > +       case 15: return vextq_u8(v0, v1, 15);
> > > > +       }
> > > > +       return v1;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Shifts right 128 bit register by specified number of bytes
> > > > + *
> > > > + * @param reg 128 bit value
> > > > + * @param num number of bytes to shift reg by (0-16)
> > > > + *
> > > > + * @return reg << (num * 8)
> > > > + */
> > > > +static inline uint64x2_t
> > > > +shift_bytes_right(uint64x2_t reg, const unsigned int num)
> > > > +{
> > > > +       /* Right Shift */
> > > > +       return vreinterpretq_u64_u8(extract_vector(
> > > > +                               vreinterpretq_u8_u64(reg),
> > > > +                               vdupq_n_u8(0),
> > > > +                               num));
> > > > +}
> > > > +
> > > > +/**
> > > > + * Shifts left 128 bit register by specified number of bytes
> > > > + *
> > > > + * @param reg 128 bit value
> > > > + * @param num number of bytes to shift reg by (0-16)
> > > > + *
> > > > + * @return reg << (num * 8)
> > > > + */
> > > > +static inline uint64x2_t
> > > > +shift_bytes_left(uint64x2_t reg, const unsigned int num)
> > > > +{
> > > > +       /* Left Shift */
> > > > +       return vreinterpretq_u64_u8(extract_vector(
> > > > +                               vdupq_n_u8(0),
> > > > +                               vreinterpretq_u8_u64(reg),
> > > > +                               16 - num));
> > > > +}
> > > > +
> > > Can you move shift_bytes_right/shift_bytes_left to rte_vect.h
> > > because
> > > they are common functions?
> > These are not really common functions. I dont think it will have a
> > wider usage as its shifting by bytes and not by bits.
> > 
> I think these shifting may be used by other functions.
> For example, to replace  _mm_srli_si128.
> 
> > 
> > In x86 case also, xmm_shift_left is not made a common function.
> > 
> But its counterpart right shifting (_mm_srli_si128) is...
> 
> > 
> > Moreover, I have not tested the behaviour of these functions when
> > the
> > shift amt is (< 0) or (> 16) as these cases will never arise in the
> > CRC
> > code.
> > 
> You can define thee functions according to current requirement.
> And I don't think this parameter can be <0 or > 16.

Okay. In that case, I will move it to rte_vect.h.

Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs
  2017-04-27 14:06 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
@ 2017-05-12 10:15 ` Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (3 more replies)
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  3 siblings, 4 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

This patch series adds arm64 neon version of CRC compute APIs utilizing
the pmull capability (which is available as part of crypto extensions).

 * Patch 1 adds crypto capability in compilation of generic armv8a
   and thunderx targets.
 * Patch 2 moves GCC_VERSION defintion to a more common location as
   it will be used in the Patch 3.
 * Patch 3 adds the arm64 neon implementation of the CRC compute APIs.
 * Patch 4 adds the test case for testing arm64 neon implementation of the
   CRC compute APIs.

v5:
* Moved APIs shift_bytes_left, shift_bytes_right and extract_vector from
  net_crc_neon.h to rte_vect.h and renamed them to vshift_bytes_left,
  vshift_bytes_right and vextract respectively.

v4:
* Rebased on top of latest commit
* Edited the Patch 2 commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

v3:
* Moved feature detection changes and GCC_VERSION definition changes
  to separate commits.
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense
* Moved GCC_VERSION definition to common header and removed the same from
  rte_lru.h

v2:
* Fixed merge conflict in MAINTAINERS
* Fixed checkpatch errors/warnings

Ashwin Sekhar T K (4):
  mk: add crypto capability for generic armv8a and thunderx
  eal: move gcc version definition to common header
  net: add arm64 neon version of CRC compute APIs
  test: add tests for arm64 CRC neon versions

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_eal/common/include/rte_common.h        |   6 +
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 lib/librte_table/rte_lru.h                        |  10 +-
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   6 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 test/test/test_crc.c                              |   9 +
 12 files changed, 442 insertions(+), 16 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

Added the following flags to detect the corresponding
capability at compile time.
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

At run-time, the following flags can be used to detect the
capabilities.
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1f1..51966a5b6 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379b0..678410581 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c1470..a813c91f4 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2a6..01ac7e232 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-15  2:07     ` Jianbo Liu
  2017-07-03 20:51     ` Thomas Monjalon
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  3 siblings, 2 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru.h                 | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index e057f6e21..ff4a12bbe 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru.h b/lib/librte_table/rte_lru.h
index e87e062d0..5cc596613 100644
--- a/lib/librte_table/rte_lru.h
+++ b/lib/librte_table/rte_lru.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -120,7 +114,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -166,7 +160,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  2017-05-15  2:32     ` Jianbo Liu
  2017-07-03 21:06     ` Thomas Monjalon
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  3 siblings, 2 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability.

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation.

Added wrappers in rte_vect.h for those neon intrinsics
which are not supported in GCC version < 7.

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2b9..66d64c2c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c9988..55e228a77 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,93 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if defined(RTE_ARCH_ARM64)
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+#endif
+
+/*
+ * If (0 <= index <= 15), then call the ASIMD ext intruction on the
+ * 128 bit regs v0 and v1 with the appropriate index.
+ *
+ * Else returns a zero vector.
+ */
+static inline uint8x16_t
+vextract(uint8x16_t v0, uint8x16_t v1, const int index)
+{
+	switch (index) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return vdupq_n_u8(0);
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				shift));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - shift));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 000000000..2be579d6b
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,297 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = vshift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = vshift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = vshift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = vshift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = vshift_bytes_left(fold, 16 - rem);
+		b = vshift_bytes_right(fold, rem);
+		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 9d1ee63fa..be65f34bb 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
+#endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c6e..d01cf4b47 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v5 4/4] test: add tests for arm64 CRC neon versions
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-12 10:15   ` Ashwin Sekhar T K
  3 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-05-12 10:15 UTC (permalink / raw)
  To: jerin.jacob, thomas, viktorin, jianbo.liu, jasvinder.singh
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69a2..9f2a17d49 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-05-15  2:07     ` Jianbo Liu
  2017-07-03 20:51     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Jianbo Liu @ 2017-05-15  2:07 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: Jerin Jacob, thomas, Jan Viktorin, jasvinder.singh, dev

On 12 May 2017 at 18:15, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> to lib/librte_eal/common/include/rte_common.h.
>
> Tested compilation on:
>  * arm64 with gcc
>  * x86 with gcc and clang
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
>  lib/librte_eal/common/include/rte_common.h |  6 ++++++
>  lib/librte_table/rte_lru.h                 | 10 ++--------
>  2 files changed, 8 insertions(+), 8 deletions(-)
>

Acked-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-05-15  2:32     ` Jianbo Liu
  2017-07-03 21:06     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Jianbo Liu @ 2017-05-15  2:32 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: Jerin Jacob, thomas, Jan Viktorin, jasvinder.singh, dev

On 12 May 2017 at 18:15, Ashwin Sekhar T K
<ashwin.sekhar@caviumnetworks.com> wrote:
> Added CRC compute APIs for arm64 utilizing the pmull
> capability.
>
> Added new file net_crc_neon.h to hold the arm64 pmull
> CRC implementation.
>
> Added wrappers in rte_vect.h for those neon intrinsics
> which are not supported in GCC version < 7.
>
> Verified the changes with crc_autotest unit test case
>
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
>  MAINTAINERS                                       |   1 +
>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
>  lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c                      |  34 ++-
>  lib/librte_net/rte_net_crc.h                      |   2 +
>  5 files changed, 416 insertions(+), 6 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_neon.h
>

Acked-by: Jianbo Liu <jianbo.liu@linaro.org>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
  2017-05-15  2:07     ` Jianbo Liu
@ 2017-07-03 20:51     ` Thomas Monjalon
  2017-07-04  8:48       ` Sekhar, Ashwin
  1 sibling, 1 reply; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-03 20:51 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

12/05/2017 12:15, Ashwin Sekhar T K:
> Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> to lib/librte_eal/common/include/rte_common.h.
> 
> Tested compilation on:
>  * arm64 with gcc
>  * x86 with gcc and clang
> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
> --- a/lib/librte_eal/common/include/rte_common.h
> +++ b/lib/librte_eal/common/include/rte_common.h
> +/** Define GCC_VERSION **/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
> +		__GNUC_PATCHLEVEL__)
> +#endif
[...]
> --- a/lib/librte_table/rte_lru.h
> +++ b/lib/librte_table/rte_lru.h
> -#ifdef __INTEL_COMPILER
> -#define GCC_VERSION (0)
> -#else
> -#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
> -#endif

The ICC check is lost when moving in rte_common.h.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs
  2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-05-15  2:32     ` Jianbo Liu
@ 2017-07-03 21:06     ` Thomas Monjalon
  1 sibling, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-03 21:06 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

12/05/2017 12:15, Ashwin Sekhar T K:
> +#elif defined(ARM64_NEON_PMULL)
> +       case RTE_NET_CRC_NEON:
> +               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> +                       handlers = handlers_neon;
> +                       break;
> +               }
> +               //-fallthrough
> +#endif
>         case RTE_NET_CRC_SCALAR:
> +               //-fallthrough
>         default:

These fallthrough comments are not in the right coding style.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header
  2017-07-03 20:51     ` Thomas Monjalon
@ 2017-07-04  8:48       ` Sekhar, Ashwin
  0 siblings, 0 replies; 33+ messages in thread
From: Sekhar, Ashwin @ 2017-07-04  8:48 UTC (permalink / raw)
  To: thomas; +Cc: jasvinder.singh, Jacob,  Jerin, viktorin, dev, jianbo.liu

On Mon, 2017-07-03 at 22:51 +0200, Thomas Monjalon wrote:
> 12/05/2017 12:15, Ashwin Sekhar T K:
> > 
> > Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
> > to lib/librte_eal/common/include/rte_common.h.
> > 
> > Tested compilation on:
> >  * arm64 with gcc
> >  * x86 with gcc and clang
> > 
> > Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> > Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
> > ---
> > --- a/lib/librte_eal/common/include/rte_common.h
> > +++ b/lib/librte_eal/common/include/rte_common.h
> > +/** Define GCC_VERSION **/
> > +#ifdef RTE_TOOLCHAIN_GCC
> > +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	
> > \
> > +		__GNUC_PATCHLEVEL__)
> > +#endif
> [...]
> > 
> > --- a/lib/librte_table/rte_lru.h
> > +++ b/lib/librte_table/rte_lru.h
> > -#ifdef __INTEL_COMPILER
> > -#define GCC_VERSION (0)
> > -#else
> > -#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 +
> > __GNUC_PATCHLEVEL__)
> > -#endif
> The ICC check is lost when moving in rte_common.h.

All usage of GCC_VERSION is kept under #ifdef RTE_TOOLCHAIN_GCC. So the
ICC check is not required.

Ashwin

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs
  2017-04-27 14:06 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                   ` (2 preceding siblings ...)
  2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24 ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
                     ` (4 more replies)
  3 siblings, 5 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

This patch series adds arm64 neon version of CRC compute APIs utilizing
the pmull capability (which is available as part of crypto extensions).

 * Patch 1 adds crypto capability in compilation of generic armv8a
   and thunderx targets.
 * Patch 2 moves GCC_VERSION defintion to a more common location as
   it will be used in the Patch 3.
 * Patch 3 adds the arm64 neon implementation of the CRC compute APIs.
 * Patch 4 adds the test case for testing arm64 neon implementation of the
   CRC compute APIs.

v6:
* Corrected the fallthrough comment style.
* Rebased to DPDK tip.

v5:
* Moved APIs shift_bytes_left, shift_bytes_right and extract_vector from
  net_crc_neon.h to rte_vect.h and renamed them to vshift_bytes_left,
  vshift_bytes_right and vextract respectively.

v4:
* Rebased on top of latest commit
* Edited the Patch 2 commit message body according to comments
* Moved definition and usage of GCC_VERSION under RTE_TOOLCHAIN_GCC flag

v3:
* Moved feature detection changes and GCC_VERSION definition changes
  to separate commits.
* Replaced usage of assert() with RTE_ASSERT()
* Made the comments in rte_vect.h more positive in sense
* Moved GCC_VERSION definition to common header and removed the same from
  rte_lru.h

v2:
* Fixed merge conflict in MAINTAINERS
* Fixed checkpatch errors/warnings


Ashwin Sekhar T K (4):
  mk: add crypto capability for generic armv8a and thunderx
  eal: move gcc version definition to common header
  net: add arm64 neon version of CRC compute APIs
  test: add tests for arm64 CRC neon versions

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_eal/common/include/rte_common.h        |   6 +
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 lib/librte_table/rte_lru_x86.h                    |  10 +-
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   6 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 test/test/test_crc.c                              |   9 +
 12 files changed, 442 insertions(+), 16 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

armv8-a has optional CRYPTO extension which adds the
AES, PMULL, SHA1 and SHA2 capabilities. -march=armv8-a+crypto
enables code generation for the ARMv8-A architecture together
with the optional CRYPTO extensions.

Added the following flags to detect the corresponding
capability at compile time.
 * RTE_MACHINE_CPUFLAG_AES
 * RTE_MACHINE_CPUFLAG_PMULL
 * RTE_MACHINE_CPUFLAG_SHA1
 * RTE_MACHINE_CPUFLAG_SHA2

At run-time, the following flags can be used to detect the
capabilities.
 * RTE_CPUFLAG_AES
 * RTE_CPUFLAG_PMULL
 * RTE_CPUFLAG_SHA1
 * RTE_CPUFLAG_SHA2

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
---
 mk/machine/armv8a/rte.vars.mk            | 2 +-
 mk/machine/thunderx/rte.vars.mk          | 2 +-
 mk/rte.cpuflags.mk                       | 6 ++++++
 mk/toolchain/gcc/rte.toolchain-compat.mk | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1f1..51966a5b6 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379b0..678410581 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 4288c1470..a813c91f4 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -125,6 +125,12 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += AES
+CPUFLAGS += PMULL
+CPUFLAGS += SHA1
+CPUFLAGS += SHA2
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2a6..01ac7e232 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v6 2/4] eal: move gcc version definition to common header
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Moved the definition of GCC_VERSION from lib/librte_table/rte_lru.h
to lib/librte_eal/common/include/rte_common.h.

Tested compilation on:
 * arm64 with gcc
 * x86 with gcc and clang

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Reviewed-by: Jan Viktorin <viktorin@rehivetech.com>
Acked-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 lib/librte_eal/common/include/rte_common.h |  6 ++++++
 lib/librte_table/rte_lru_x86.h             | 10 ++--------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/include/rte_common.h b/lib/librte_eal/common/include/rte_common.h
index 99596de0a..1afc66e3f 100644
--- a/lib/librte_eal/common/include/rte_common.h
+++ b/lib/librte_eal/common/include/rte_common.h
@@ -66,6 +66,12 @@ extern "C" {
 #define RTE_STD_C11
 #endif
 
+/** Define GCC_VERSION **/
+#ifdef RTE_TOOLCHAIN_GCC
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +	\
+		__GNUC_PATCHLEVEL__)
+#endif
+
 #ifdef RTE_ARCH_STRICT_ALIGN
 typedef uint64_t unaligned_uint64_t __attribute__ ((aligned(1)));
 typedef uint32_t unaligned_uint32_t __attribute__ ((aligned(1)));
diff --git a/lib/librte_table/rte_lru_x86.h b/lib/librte_table/rte_lru_x86.h
index 041b538f1..ec9082343 100644
--- a/lib/librte_table/rte_lru_x86.h
+++ b/lib/librte_table/rte_lru_x86.h
@@ -40,12 +40,6 @@ extern "C" {
 
 #include <stdint.h>
 
-#ifdef __INTEL_COMPILER
-#define GCC_VERSION (0)
-#else
-#define GCC_VERSION (__GNUC__ * 10000+__GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
-#endif
-
 #ifndef RTE_TABLE_HASH_LRU_STRATEGY
 #ifdef __SSE4_2__
 #define RTE_TABLE_HASH_LRU_STRATEGY                        2
@@ -56,7 +50,7 @@ extern "C" {
 
 #if RTE_TABLE_HASH_LRU_STRATEGY == 2
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
@@ -100,7 +94,7 @@ do {									\
 
 #elif RTE_TABLE_HASH_LRU_STRATEGY == 3
 
-#if GCC_VERSION > 40306
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION > 40306)
 #include <x86intrin.h>
 #else
 #include <emmintrin.h>
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04 13:53     ` Thomas Monjalon
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-07-04 13:55   ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon
  4 siblings, 1 reply; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Added CRC compute APIs for arm64 utilizing the pmull
capability.

Added new file net_crc_neon.h to hold the arm64 pmull
CRC implementation.

Added wrappers in rte_vect.h for those neon intrinsics
which are not supported in GCC version < 7.

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
Acked-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  88 +++++++
 lib/librte_net/net_crc_neon.h                     | 297 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  34 ++-
 lib/librte_net/rte_net_crc.h                      |   2 +
 5 files changed, 416 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index c14cbb90f..33921f721 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c9988..55e228a77 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -35,6 +35,7 @@
 
 #include <stdint.h>
 #include "generic/rte_vect.h"
+#include "rte_debug.h"
 #include "arm_neon.h"
 
 #ifdef __cplusplus
@@ -78,6 +79,93 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if defined(RTE_ARCH_ARM64)
+#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70000)
+/* NEON intrinsic vreinterpretq_u64_p128() is supported since GCC version 7 */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/* NEON intrinsic vreinterpretq_p64_u64() is supported since GCC version 7 */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/* NEON intrinsic vgetq_lane_p64() is supported since GCC version 7 */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	RTE_ASSERT(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+#endif
+
+/*
+ * If (0 <= index <= 15), then call the ASIMD ext intruction on the
+ * 128 bit regs v0 and v1 with the appropriate index.
+ *
+ * Else returns a zero vector.
+ */
+static inline uint8x16_t
+vextract(uint8x16_t v0, uint8x16_t v1, const int index)
+{
+	switch (index) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return vdupq_n_u8(0);
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_right(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				shift));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * Value of shift parameter must be in range 0 - 16
+ */
+static inline uint64x2_t
+vshift_bytes_left(uint64x2_t reg, const unsigned int shift)
+{
+	return vreinterpretq_u64_u8(vextract(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - shift));
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 000000000..2be579d6b
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,297 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = vshift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = vshift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = vshift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = vshift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = vshift_bytes_left(fold, 16 - rem);
+		b = vshift_bytes_right(fold, rem);
+		mask = vshift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 0391c7209..331737abe 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
-#endif
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		/* fall-through */
+#endif
 	case RTE_NET_CRC_SCALAR:
+		/* fall-through */
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index d22286c6e..d01cf4b47 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -57,6 +57,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -68,6 +69,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [dpdk-dev] [PATCH v6 4/4] test: add tests for arm64 CRC neon versions
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (2 preceding siblings ...)
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04  9:24   ` Ashwin Sekhar T K
  2017-07-04 13:55   ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon
  4 siblings, 0 replies; 33+ messages in thread
From: Ashwin Sekhar T K @ 2017-07-04  9:24 UTC (permalink / raw)
  To: jerin.jacob, viktorin, jianbo.liu, jasvinder.singh, thomas
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69a2..9f2a17d49 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.12.2

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-07-04 13:53     ` Thomas Monjalon
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-04 13:53 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

04/07/2017 11:24, Ashwin Sekhar T K:
>  rte_net_crc_set_alg(enum rte_net_crc_alg alg)
>  {
>         switch (alg) {
> -       case RTE_NET_CRC_SSE42:
>  #ifdef X86_64_SSE42_PCLMULQDQ
> +       case RTE_NET_CRC_SSE42:
>                 handlers = handlers_sse42;
> -#else
> -               alg = RTE_NET_CRC_SCALAR;
> -#endif
>                 break;
> +#elif defined(ARM64_NEON_PMULL)
> +       case RTE_NET_CRC_NEON:
> +               if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> +                       handlers = handlers_neon;
> +                       break;
> +               }
> +               /* fall-through */
> +#endif
>         case RTE_NET_CRC_SCALAR:
> +               /* fall-through */
>         default:
>                 handlers = handlers_scalar;
>                 break;
> 

I'm moving the fall-through comment outside of #ifdef
to fix warning.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs
  2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
                     ` (3 preceding siblings ...)
  2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
@ 2017-07-04 13:55   ` Thomas Monjalon
  4 siblings, 0 replies; 33+ messages in thread
From: Thomas Monjalon @ 2017-07-04 13:55 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: dev, jerin.jacob, viktorin, jianbo.liu, jasvinder.singh

> Ashwin Sekhar T K (4):
>   mk: add crypto capability for generic armv8a and thunderx
>   eal: move gcc version definition to common header
>   net: add arm64 neon version of CRC compute APIs
>   test: add tests for arm64 CRC neon versions

Applied with minor changes, thanks

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2017-07-04 13:55 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-27 14:06 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:56 ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-04 15:22     ` Jan Viktorin
2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-04  6:57   ` [dpdk-dev] [PATCH v3 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-04 15:20   ` [dpdk-dev] [PATCH v3 1/4] mk: add crypto capability for generic armv8a and thunderx Jan Viktorin
2017-05-04 22:10     ` Thomas Monjalon
2017-05-09  9:53 ` [dpdk-dev] [PATCH v4 " Ashwin Sekhar T K
2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12  5:51     ` Jianbo Liu
2017-05-12  7:25       ` Sekhar, Ashwin
2017-05-12  8:49         ` Jianbo Liu
2017-05-12  8:56           ` Sekhar, Ashwin
2017-05-09  9:53   ` [dpdk-dev] [PATCH v4 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-05-12 10:15 ` [dpdk-dev] [PATCH v5 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-05-15  2:07     ` Jianbo Liu
2017-07-03 20:51     ` Thomas Monjalon
2017-07-04  8:48       ` Sekhar, Ashwin
2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-05-15  2:32     ` Jianbo Liu
2017-07-03 21:06     ` Thomas Monjalon
2017-05-12 10:15   ` [dpdk-dev] [PATCH v5 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04  9:24 ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 1/4] mk: add crypto capability for generic armv8a and thunderx Ashwin Sekhar T K
2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 2/4] eal: move gcc version definition to common header Ashwin Sekhar T K
2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 3/4] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-07-04 13:53     ` Thomas Monjalon
2017-07-04  9:24   ` [dpdk-dev] [PATCH v6 4/4] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-07-04 13:55   ` [dpdk-dev] [PATCH v6 0/4] add arm64 neon version of CRC compute APIs Thomas Monjalon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).