[dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs

DPDK patches and discussions
 help / color / mirror / Atom feed

* [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
@ 2017-04-27 14:10 Ashwin Sekhar T K
  2017-04-27 14:10 ` [dpdk-dev] [PATCH 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:10 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 576d60a..283743e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH 2/2] test: add tests for arm64 CRC neon versions
  2017-04-27 14:10 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-04-27 14:10 ` Ashwin Sekhar T K
  2017-04-28  9:34 ` [dpdk-dev] [PATCH v2 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-04-28  9:55 ` [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Jan Viktorin
  2 siblings, 0 replies; 8+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:10 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..ac0e359 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test_crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH v2 1/2] net: add arm64 neon version of CRC compute APIs
  2017-04-27 14:10 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-04-27 14:10 ` [dpdk-dev] [PATCH 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
@ 2017-04-28  9:34 ` Ashwin Sekhar T K
  2017-04-28  9:34   ` [dpdk-dev] [PATCH v2 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-04-28  9:55 ` [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Jan Viktorin
  2 siblings, 1 reply; 8+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-28  9:34 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed merge conflict in MAINTAINERS

 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b6495d2..66d64c2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -147,6 +147,7 @@ F: lib/librte_eal/common/include/arch/arm/*_64.h
 F: lib/librte_acl/acl_run_neon.*
 F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH v2 2/2] test: add tests for arm64 CRC neon versions
  2017-04-28  9:34 ` [dpdk-dev] [PATCH v2 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-04-28  9:34   ` Ashwin Sekhar T K
  0 siblings, 0 replies; 8+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-28  9:34 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
v2:
* Fixed checkpatch errors/warnings

 test/test/test_crc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/test/test/test_crc.c b/test/test/test_crc.c
index cd5af69..9f2a17d 100644
--- a/test/test/test_crc.c
+++ b/test/test/test_crc.c
@@ -178,6 +178,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC neon mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_NEON);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (arm64 neon pmull): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
  2017-04-27 14:10 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
  2017-04-27 14:10 ` [dpdk-dev] [PATCH 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
  2017-04-28  9:34 ` [dpdk-dev] [PATCH v2 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
@ 2017-04-28  9:55 ` Jan Viktorin
  2017-04-28 10:19   ` Sekhar, Ashwin
  2 siblings, 1 reply; 8+ messages in thread
From: Jan Viktorin @ 2017-04-28  9:55 UTC (permalink / raw)
  To: Ashwin Sekhar T K; +Cc: thomas, jasvinder.singh, jerin.jacob, jianbo.liu, dev

Hello Ashwin Sekhar,

some comments below...

On Thu, 27 Apr 2017 07:10:20 -0700
Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:

> * Added CRC compute APIs for arm64 utilizing the pmull capability
> * Added new file net_crc_neon.h to hold the arm64 pmull CRC
>   implementation
> * Added crypto capability in compilation of generic armv8 and
>   thunderx targets
> * pmull CRC version is used only after checking the pmull capability
>   at runtime
> * Verified the changes with crc_autotest unit test case
> 
> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
> ---
>  MAINTAINERS                                       |   1 +
>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
>  lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
>  lib/librte_net/rte_net_crc.c                      |  32 +-
>  lib/librte_net/rte_net_crc.h                      |   2 +
>  mk/machine/armv8a/rte.vars.mk                     |   2 +-
>  mk/machine/thunderx/rte.vars.mk                   |   2 +-
>  mk/rte.cpuflags.mk                                |   3 +
>  mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
>  9 files changed, 438 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_net/net_crc_neon.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 576d60a..283743e 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
>  F: lib/librte_hash/rte*_arm64.h
>  F: lib/librte_efd/rte*_arm64.h
>  F: lib/librte_table/rte*_arm64.h
> +F: lib/librte_net/net_crc_neon.h
>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> index 4107c99..9a3dfdf 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
> @@ -34,9 +34,18 @@
>  #define _RTE_VECT_ARM_H_
>  
>  #include <stdint.h>
> +#include <assert.h>
> +
>  #include "generic/rte_vect.h"
>  #include "arm_neon.h"
>  
> +#ifdef GCC_VERSION
> +#undef GCC_VERSION
> +#endif

Why are you doing this? What is wrong with GCC_VERSION?

> +
> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
> +			+ __GNUC_PATCHLEVEL__)
> +

If you have any specific requirements for testing GCC version then it
should be done in a more elegant way. However, I do not understand your
intention.

>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> @@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
>  }
>  #endif
>  
> +#if (GCC_VERSION < 70000)

Is this code is gcc-specific? In such case there should be check for
GCC compiler. We can also build e.g. by clang.

> +/*
> + * NEON intrinsic vreinterpretq_u64_p128() is not supported
> + * in GCC versions < 7
> + */

I'd be positive about those comments, like:

NEON intrinsic vreinterpretq_u64_p128() is supported since GCC 7.

> +static inline uint64x2_t
> +vreinterpretq_u64_p128(poly128_t x)
> +{
> +	return (uint64x2_t)x;
> +}
> +
> +/*
> + * NEON intrinsic vreinterpretq_p64_u64() is not supported
> + * in GCC versions < 7
> + */
> +static inline poly64x2_t
> +vreinterpretq_p64_u64(uint64x2_t x)
> +{
> +	return (poly64x2_t)x;
> +}
> +
> +/*
> + * NEON intrinsic vgetq_lane_p64() is not supported
> + * in GCC versions < 7
> + */
> +static inline poly64_t
> +vgetq_lane_p64(poly64x2_t x, const int lane)
> +{
> +	assert(lane >= 0 && lane <= 1);
> +
> +	poly64_t *p = (poly64_t *)&x;
> +
> +	return p[lane];
> +}
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h

[...]

>  # CPU_LDFLAGS =
>  # CPU_ASFLAGS =
>  
> -MACHINE_CFLAGS += -march=armv8-a+crc
> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto
> diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
> index ad5a379..6784105 100644
> --- a/mk/machine/thunderx/rte.vars.mk
> +++ b/mk/machine/thunderx/rte.vars.mk
> @@ -55,4 +55,4 @@
>  # CPU_LDFLAGS =
>  # CPU_ASFLAGS =
>  
> -MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index e634abc..6bbd742 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
>  CPUFLAGS += CRC32
>  endif
>  
> +ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
> +CPUFLAGS += PMULL
> +endif
>  
>  MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
>  
> diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
> index 280dde2..01ac7e2 100644
> --- a/mk/toolchain/gcc/rte.toolchain-compat.mk
> +++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
> @@ -60,6 +60,7 @@ else
>  #
>  	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
>  		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))

The line above is to be dropped, isn't it?

> +		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))

Please, split the "feature-detection" changes into a separate commit and
explain it. In the code, you test for GCC 7. Here you are ok with GCC
4.9. It's likely to be correct but it is not clear.

Also, please explain why is the "crypto" feature required.

Regards
Jan

>  	endif
>  	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
>  		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
  2017-04-28  9:55 ` [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Jan Viktorin
@ 2017-04-28 10:19   ` Sekhar, Ashwin
  2017-05-03 16:58     ` Jan Viktorin
  0 siblings, 1 reply; 8+ messages in thread
From: Sekhar, Ashwin @ 2017-04-28 10:19 UTC (permalink / raw)
  To: Jan Viktorin; +Cc: thomas, jasvinder.singh, Jacob,  Jerin, jianbo.liu, dev

Hi Jan,
Thanks for the comments. Please see my responses inline.

On Friday 28 April 2017 03:25 PM, Jan Viktorin wrote:
> Hello Ashwin Sekhar,
>
> some comments below...
>
> On Thu, 27 Apr 2017 07:10:20 -0700
> Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com> wrote:
>
>> * Added CRC compute APIs for arm64 utilizing the pmull capability
>> * Added new file net_crc_neon.h to hold the arm64 pmull CRC
>>   implementation
>> * Added crypto capability in compilation of generic armv8 and
>>   thunderx targets
>> * pmull CRC version is used only after checking the pmull capability
>>   at runtime
>> * Verified the changes with crc_autotest unit test case
>>
>> Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
>> ---
>>  MAINTAINERS                                       |   1 +
>>  lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
>>  lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
>>  lib/librte_net/rte_net_crc.c                      |  32 +-
>>  lib/librte_net/rte_net_crc.h                      |   2 +
>>  mk/machine/armv8a/rte.vars.mk                     |   2 +-
>>  mk/machine/thunderx/rte.vars.mk                   |   2 +-
>>  mk/rte.cpuflags.mk                                |   3 +
>>  mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
>>  9 files changed, 438 insertions(+), 7 deletions(-)
>>  create mode 100644 lib/librte_net/net_crc_neon.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 576d60a..283743e 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
>>  F: lib/librte_hash/rte*_arm64.h
>>  F: lib/librte_efd/rte*_arm64.h
>>  F: lib/librte_table/rte*_arm64.h
>> +F: lib/librte_net/net_crc_neon.h
>>  F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
>>  F: drivers/net/i40e/i40e_rxtx_vec_neon.c
>>  F: drivers/net/virtio/virtio_rxtx_simple_neon.c
>> diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
>> index 4107c99..9a3dfdf 100644
>> --- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
>> +++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
>> @@ -34,9 +34,18 @@
>>  #define _RTE_VECT_ARM_H_
>>
>>  #include <stdint.h>
>> +#include <assert.h>
>> +
>>  #include "generic/rte_vect.h"
>>  #include "arm_neon.h"
>>
>> +#ifdef GCC_VERSION
>> +#undef GCC_VERSION
>> +#endif
>
> Why are you doing this? What is wrong with GCC_VERSION?
>
This is just to avoid multiple definitions of GCC_VERSION. Not required 
really. Can be removed.

>> +
>> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
>> +			+ __GNUC_PATCHLEVEL__)
>> +
>
> If you have any specific requirements for testing GCC version then it
> should be done in a more elegant way. However, I do not understand your
> intention.
>
GCC version is checked so as to define wrappers for some neon intrinsics 
which are not available in GCC versions < 7.

Similar checks of GCC_VERSION done in ./lib/librte_table/rte_lru.h. 
Followed the same template here.
Also, this is the suggested approach by GCC. Please see below link.
https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html

Please advise on more elegant ways of gcc version detection.
>>  #ifdef __cplusplus
>>  extern "C" {
>>  #endif
>> @@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
>>  }
>>  #endif
>>
>> +#if (GCC_VERSION < 70000)
>
> Is this code is gcc-specific? In such case there should be check for
> GCC compiler. We can also build e.g. by clang.
>
Yes, the code is GCC specific. Currently there are only GCC targets  for 
arm and arm64. So no checks are done for other types of compilers.
>> +/*
>> + * NEON intrinsic vreinterpretq_u64_p128() is not supported
>> + * in GCC versions < 7
>> + */
>
> I'd be positive about those comments, like:
>
> NEON intrinsic vreinterpretq_u64_p128() is supported since GCC 7.
>
Thanks. Will make the comments positive.

>> +static inline uint64x2_t
>> +vreinterpretq_u64_p128(poly128_t x)
>> +{
>> +	return (uint64x2_t)x;
>> +}
>> +
>> +/*
>> + * NEON intrinsic vreinterpretq_p64_u64() is not supported
>> + * in GCC versions < 7
>> + */
>> +static inline poly64x2_t
>> +vreinterpretq_p64_u64(uint64x2_t x)
>> +{
>> +	return (poly64x2_t)x;
>> +}
>> +
>> +/*
>> + * NEON intrinsic vgetq_lane_p64() is not supported
>> + * in GCC versions < 7
>> + */
>> +static inline poly64_t
>> +vgetq_lane_p64(poly64x2_t x, const int lane)
>> +{
>> +	assert(lane >= 0 && lane <= 1);
>> +
>> +	poly64_t *p = (poly64_t *)&x;
>> +
>> +	return p[lane];
>> +}
>> +#endif
>> +
>>  #ifdef __cplusplus
>>  }
>>  #endif
>> diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
>
> [...]
>
>>  # CPU_LDFLAGS =
>>  # CPU_ASFLAGS =
>>
>> -MACHINE_CFLAGS += -march=armv8-a+crc
>> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto
>> diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
>> index ad5a379..6784105 100644
>> --- a/mk/machine/thunderx/rte.vars.mk
>> +++ b/mk/machine/thunderx/rte.vars.mk
>> @@ -55,4 +55,4 @@
>>  # CPU_LDFLAGS =
>>  # CPU_ASFLAGS =
>>
>> -MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
>> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
>> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
>> index e634abc..6bbd742 100644
>> --- a/mk/rte.cpuflags.mk
>> +++ b/mk/rte.cpuflags.mk
>> @@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
>>  CPUFLAGS += CRC32
>>  endif
>>
>> +ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
>> +CPUFLAGS += PMULL
>> +endif
>>
>>  MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
>>
>> diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
>> index 280dde2..01ac7e2 100644
>> --- a/mk/toolchain/gcc/rte.toolchain-compat.mk
>> +++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
>> @@ -60,6 +60,7 @@ else
>>  #
>>  	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
>>  		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
>
> The line above is to be dropped, isn't it?
>
No. It is not to be dropped. For targets like xgene1, crypto is not 
defined. Above line is required for the substitution to happen in such 
targets.
>> +		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
>
> Please, split the "feature-detection" changes into a separate commit and
> explain it. In the code, you test for GCC 7. Here you are ok with GCC
> 4.9. It's likely to be correct but it is not clear.
Sure. Will split the feature detection changes to separate commit.
>
> Also, please explain why is the "crypto" feature required.
crypto feature is required for using the vmull_p64 intrinsic. More 
specifically the PMULL instruction.
Will add this as part of the commit message.
>
> Regards
> Jan
>
>>  	endif
>>  	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
>>  		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
>
Thanks and Regards,
Ashwin


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
  2017-04-28 10:19   ` Sekhar, Ashwin
@ 2017-05-03 16:58     ` Jan Viktorin
  0 siblings, 0 replies; 8+ messages in thread
From: Jan Viktorin @ 2017-05-03 16:58 UTC (permalink / raw)
  To: Sekhar, Ashwin; +Cc: thomas, jasvinder.singh, Jacob,  Jerin, jianbo.liu, dev

On Fri, 28 Apr 2017 10:19:20 +0000
"Sekhar, Ashwin" <Ashwin.Sekhar@cavium.com> wrote:

> Hi Jan,
> Thanks for the comments. Please see my responses inline.
> 
> On Friday 28 April 2017 03:25 PM, Jan Viktorin wrote:
> > Hello Ashwin Sekhar,
> >
> > some comments below...
> >

[...]

> >>
> >>  #include <stdint.h>
> >> +#include <assert.h>
> >> +

I'd prefer RTE_ASSERT (rte_debug.h) instead of this one.

> >>  #include "generic/rte_vect.h"
> >>  #include "arm_neon.h"
> >>
> >> +#ifdef GCC_VERSION
> >> +#undef GCC_VERSION
> >> +#endif  
> >
> > Why are you doing this? What is wrong with GCC_VERSION?
> >  
> This is just to avoid multiple definitions of GCC_VERSION. Not required 
> really. Can be removed.
> 
> >> +
> >> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
> >> +			+ __GNUC_PATCHLEVEL__)
> >> +  
> >
> > If you have any specific requirements for testing GCC version then it
> > should be done in a more elegant way. However, I do not understand your
> > intention.
> >  
> GCC version is checked so as to define wrappers for some neon intrinsics 
> which are not available in GCC versions < 7.
> 
> Similar checks of GCC_VERSION done in ./lib/librte_table/rte_lru.h. 
> Followed the same template here.
> Also, this is the suggested approach by GCC. Please see below link.
> https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html

This is OK, I understand.

> 
> Please advise on more elegant ways of gcc version detection.

I don't say that it is wrong. Just, it is quite a low-level definition
that might be solved once for all to avoid any further GCC_VERSION
definitions.

At least, I would go this way:

#ifdef __GNUC__
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#else
#define GCC_VERSION 0
#endif

To be better, this should be defined in some more general file
(rte_common.h, rte_compiler.h)? I don't have any strong opinion about
this. The rte_lru.h can be refactorized in the same way.

> >>  #ifdef __cplusplus
> >>  extern "C" {
> >>  #endif
> >> @@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
> >>  }
> >>  #endif
> >>
> >> +#if (GCC_VERSION < 70000)  
> >

[...]

> >  
> >>  # CPU_LDFLAGS =
> >>  # CPU_ASFLAGS =
> >>
> >> -MACHINE_CFLAGS += -march=armv8-a+crc
> >> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto
> >> diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
> >> index ad5a379..6784105 100644
> >> --- a/mk/machine/thunderx/rte.vars.mk
> >> +++ b/mk/machine/thunderx/rte.vars.mk
> >> @@ -55,4 +55,4 @@
> >>  # CPU_LDFLAGS =
> >>  # CPU_ASFLAGS =
> >>
> >> -MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
> >> +MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
> >> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> >> index e634abc..6bbd742 100644
> >> --- a/mk/rte.cpuflags.mk
> >> +++ b/mk/rte.cpuflags.mk
> >> @@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
> >>  CPUFLAGS += CRC32
> >>  endif
> >>
> >> +ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
> >> +CPUFLAGS += PMULL
> >> +endif
> >>
> >>  MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
> >>
> >> diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
> >> index 280dde2..01ac7e2 100644
> >> --- a/mk/toolchain/gcc/rte.toolchain-compat.mk
> >> +++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
> >> @@ -60,6 +60,7 @@ else
> >>  #
> >>  	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
> >>  		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))  
> >
> > The line above is to be dropped, isn't it?
> >  
> No. It is not to be dropped. For targets like xgene1, crypto is not 
> defined. Above line is required for the substitution to happen in such 
> targets.

Yes, I understand...

> >> +		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))  
> >
> > Please, split the "feature-detection" changes into a separate commit and
> > explain it. In the code, you test for GCC 7. Here you are ok with GCC
> > 4.9. It's likely to be correct but it is not clear.  
> Sure. Will split the feature detection changes to separate commit.
> >
> > Also, please explain why is the "crypto" feature required.  
> crypto feature is required for using the vmull_p64 intrinsic. More 
> specifically the PMULL instruction.
> Will add this as part of the commit message.

OK.

Jan

> >
> > Regards
> > Jan
> >  
> >>  	endif
> >>  	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
> >>  		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))  
> >  
> Thanks and Regards,
> Ashwin
> 



-- 
   Jan Viktorin                  E-mail: Viktorin@RehiveTech.com
   System Architect              Web:    www.RehiveTech.com
   RehiveTech
   Brno, Czech Republic

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs
@ 2017-04-27 14:06 Ashwin Sekhar T K
  0 siblings, 0 replies; 8+ messages in thread
From: Ashwin Sekhar T K @ 2017-04-27 14:06 UTC (permalink / raw)
  To: thomas, jasvinder.singh, viktorin, jerin.jacob, jianbo.liu
  Cc: dev, Ashwin Sekhar T K

* Added CRC compute APIs for arm64 utilizing the pmull capability
* Added new file net_crc_neon.h to hold the arm64 pmull CRC
  implementation
* Added crypto capability in compilation of generic armv8 and
  thunderx targets
* pmull CRC version is used only after checking the pmull capability
  at runtime
* Verified the changes with crc_autotest unit test case

Signed-off-by: Ashwin Sekhar T K <ashwin.sekhar@caviumnetworks.com>
---
 MAINTAINERS                                       |   1 +
 lib/librte_eal/common/include/arch/arm/rte_vect.h |  45 +++
 lib/librte_net/net_crc_neon.h                     | 357 ++++++++++++++++++++++
 lib/librte_net/rte_net_crc.c                      |  32 +-
 lib/librte_net/rte_net_crc.h                      |   2 +
 mk/machine/armv8a/rte.vars.mk                     |   2 +-
 mk/machine/thunderx/rte.vars.mk                   |   2 +-
 mk/rte.cpuflags.mk                                |   3 +
 mk/toolchain/gcc/rte.toolchain-compat.mk          |   1 +
 9 files changed, 438 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_net/net_crc_neon.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 576d60a..283743e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -149,6 +149,7 @@ F: lib/librte_lpm/rte_lpm_neon.h
 F: lib/librte_hash/rte*_arm64.h
 F: lib/librte_efd/rte*_arm64.h
 F: lib/librte_table/rte*_arm64.h
+F: lib/librte_net/net_crc_neon.h
 F: drivers/net/ixgbe/ixgbe_rxtx_vec_neon.c
 F: drivers/net/i40e/i40e_rxtx_vec_neon.c
 F: drivers/net/virtio/virtio_rxtx_simple_neon.c
diff --git a/lib/librte_eal/common/include/arch/arm/rte_vect.h b/lib/librte_eal/common/include/arch/arm/rte_vect.h
index 4107c99..9a3dfdf 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_vect.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_vect.h
@@ -34,9 +34,18 @@
 #define _RTE_VECT_ARM_H_
 
 #include <stdint.h>
+#include <assert.h>
+
 #include "generic/rte_vect.h"
 #include "arm_neon.h"
 
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 \
+			+ __GNUC_PATCHLEVEL__)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -78,6 +87,42 @@ vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
 }
 #endif
 
+#if (GCC_VERSION < 70000)
+/*
+ * NEON intrinsic vreinterpretq_u64_p128() is not supported
+ * in GCC versions < 7
+ */
+static inline uint64x2_t
+vreinterpretq_u64_p128(poly128_t x)
+{
+	return (uint64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vreinterpretq_p64_u64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64x2_t
+vreinterpretq_p64_u64(uint64x2_t x)
+{
+	return (poly64x2_t)x;
+}
+
+/*
+ * NEON intrinsic vgetq_lane_p64() is not supported
+ * in GCC versions < 7
+ */
+static inline poly64_t
+vgetq_lane_p64(poly64x2_t x, const int lane)
+{
+	assert(lane >= 0 && lane <= 1);
+
+	poly64_t *p = (poly64_t *)&x;
+
+	return p[lane];
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_net/net_crc_neon.h b/lib/librte_net/net_crc_neon.h
new file mode 100644
index 0000000..05120a7
--- /dev/null
+++ b/lib/librte_net/net_crc_neon.h
@@ -0,0 +1,357 @@
+/*
+ *   BSD LICENSE
+ *
+ *   Copyright (C) Cavium networks Ltd. 2017.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _NET_CRC_NEON_H_
+#define _NET_CRC_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_net_crc.h>
+#include <rte_vect.h>
+#include <rte_cpuflags.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** PMULL CRC computation context structure */
+struct crc_pmull_ctx {
+	uint64x2_t rk1_rk2;
+	uint64x2_t rk5_rk6;
+	uint64x2_t rk7_rk8;
+};
+
+struct crc_pmull_ctx crc32_eth_pmull __rte_aligned(16);
+struct crc_pmull_ctx crc16_ccitt_pmull __rte_aligned(16);
+
+static inline uint8x16_t
+extract_vector(uint8x16_t v0, uint8x16_t v1, const int n)
+{
+	switch (n) {
+	case 0: return vextq_u8(v0, v1, 0);
+	case 1: return vextq_u8(v0, v1, 1);
+	case 2: return vextq_u8(v0, v1, 2);
+	case 3: return vextq_u8(v0, v1, 3);
+	case 4: return vextq_u8(v0, v1, 4);
+	case 5: return vextq_u8(v0, v1, 5);
+	case 6: return vextq_u8(v0, v1, 6);
+	case 7: return vextq_u8(v0, v1, 7);
+	case 8: return vextq_u8(v0, v1, 8);
+	case 9: return vextq_u8(v0, v1, 9);
+	case 10: return vextq_u8(v0, v1, 10);
+	case 11: return vextq_u8(v0, v1, 11);
+	case 12: return vextq_u8(v0, v1, 12);
+	case 13: return vextq_u8(v0, v1, 13);
+	case 14: return vextq_u8(v0, v1, 14);
+	case 15: return vextq_u8(v0, v1, 15);
+	}
+	return v1;
+}
+
+/**
+ * Shifts right 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_right(uint64x2_t reg, const unsigned int num)
+{
+	/* Right Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vreinterpretq_u8_u64(reg),
+				vdupq_n_u8(0),
+				num));
+}
+
+/**
+ * Shifts left 128 bit register by specified number of bytes
+ *
+ * @param reg 128 bit value
+ * @param num number of bytes to shift reg by (0-16)
+ *
+ * @return reg << (num * 8)
+ */
+static inline uint64x2_t
+shift_bytes_left(uint64x2_t reg, const unsigned int num)
+{
+	/* Left Shift */
+	return vreinterpretq_u64_u8(extract_vector(
+				vdupq_n_u8(0),
+				vreinterpretq_u8_u64(reg),
+				16 - num));
+}
+
+/**
+ * @brief Performs one folding round
+ *
+ * Logically function operates as follows:
+ *     DATA = READ_NEXT_16BYTES();
+ *     F1 = LSB8(FOLD)
+ *     F2 = MSB8(FOLD)
+ *     T1 = CLMUL(F1, RK1)
+ *     T2 = CLMUL(F2, RK2)
+ *     FOLD = XOR(T1, T2, DATA)
+ *
+ * @param data_block 16 byte data block
+ * @param precomp precomputed rk1 constanst
+ * @param fold running 16 byte folded data
+ *
+ * @return New 16 byte folded data
+ */
+static inline uint64x2_t
+crcr32_folding_round(uint64x2_t data_block, uint64x2_t precomp,
+	uint64x2_t fold)
+{
+	uint64x2_t tmp0 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 1),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+
+	uint64x2_t tmp1 = vreinterpretq_u64_p128(vmull_p64(
+			vgetq_lane_p64(vreinterpretq_p64_u64(fold), 0),
+			vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, veorq_u64(data_block, tmp0));
+}
+
+/**
+ * Performs reduction from 128 bits to 64 bits
+ *
+ * @param data128 128 bits data to be reduced
+ * @param precomp rk5 and rk6 precomputed constants
+ *
+ * @return data reduced to 64 bits
+ */
+static inline uint64x2_t
+crcr32_reduce_128_to_64(uint64x2_t data128,
+	uint64x2_t precomp)
+{
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	/* 64b fold */
+	tmp0 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(data128), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = shift_bytes_right(data128, 8);
+	tmp0 = veorq_u64(tmp0, tmp1);
+
+	/* 32b fold */
+	tmp2 = shift_bytes_left(tmp0, 4);
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp2), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+
+	return veorq_u64(tmp1, tmp0);
+}
+
+/**
+ * Performs Barret's reduction from 64 bits to 32 bits
+ *
+ * @param data64 64 bits data to be reduced
+ * @param precomp rk7 precomputed constant
+ *
+ * @return data reduced to 32 bits
+ */
+static inline uint32_t
+crcr32_reduce_64_to_32(uint64x2_t data64,
+	uint64x2_t precomp)
+{
+	static uint32_t mask1[4] __rte_aligned(16) = {
+		0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+	};
+	static uint32_t mask2[4] __rte_aligned(16) = {
+		0x00000000, 0xffffffff, 0xffffffff, 0xffffffff
+	};
+	uint64x2_t tmp0, tmp1, tmp2;
+
+	tmp0 = vandq_u64(data64, vld1q_u64((uint64_t *)mask2));
+
+	tmp1 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp0), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 0)));
+	tmp1 = veorq_u64(tmp1, tmp0);
+	tmp1 = vandq_u64(tmp1, vld1q_u64((uint64_t *)mask1));
+
+	tmp2 = vreinterpretq_u64_p128(vmull_p64(
+		vgetq_lane_p64(vreinterpretq_p64_u64(tmp1), 0),
+		vgetq_lane_p64(vreinterpretq_p64_u64(precomp), 1)));
+	tmp2 = veorq_u64(tmp2, tmp1);
+	tmp2 = veorq_u64(tmp2, tmp0);
+
+	return vgetq_lane_u32(vreinterpretq_u32_u64(tmp2), 2);
+}
+
+static inline uint32_t
+crc32_eth_calc_pmull(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_pmull_ctx *params)
+{
+	uint64x2_t temp, fold, k;
+	uint32_t n;
+
+	/* Get CRC init value */
+	temp = vreinterpretq_u64_u32(vsetq_lane_u32(crc, vmovq_n_u32(0), 0));
+
+	/**
+	 * Folding all data into single 16 byte data block
+	 * Assumes: fold holds first 16 bytes of data
+	 */
+	if (unlikely(data_len < 32)) {
+		if (unlikely(data_len == 16)) {
+			/* 16 bytes */
+			fold = vld1q_u64((const uint64_t *)data);
+			fold = veorq_u64(fold, temp);
+			goto reduction_128_64;
+		}
+
+		if (unlikely(data_len < 16)) {
+			/* 0 to 15 bytes */
+			uint8_t buffer[16] __rte_aligned(16);
+
+			memset(buffer, 0, sizeof(buffer));
+			memcpy(buffer, data, data_len);
+
+			fold = vld1q_u64((uint64_t *)buffer);
+			fold = veorq_u64(fold, temp);
+			if (unlikely(data_len < 4)) {
+				fold = shift_bytes_left(fold, 8 - data_len);
+				goto barret_reduction;
+			}
+			fold = shift_bytes_left(fold, 16 - data_len);
+			goto reduction_128_64;
+		}
+		/* 17 to 31 bytes */
+		fold = vld1q_u64((const uint64_t *)data);
+		fold = veorq_u64(fold, temp);
+		n = 16;
+		k = params->rk1_rk2;
+		goto partial_bytes;
+	}
+
+	/** At least 32 bytes in the buffer */
+	/** Apply CRC initial value */
+	fold = vld1q_u64((const uint64_t *)data);
+	fold = veorq_u64(fold, temp);
+
+	/** Main folding loop - the last 16 bytes is processed separately */
+	k = params->rk1_rk2;
+	for (n = 16; (n + 16) <= data_len; n += 16) {
+		temp = vld1q_u64((const uint64_t *)&data[n]);
+		fold = crcr32_folding_round(temp, k, fold);
+	}
+
+partial_bytes:
+	if (likely(n < data_len)) {
+		uint64x2_t last16, a, b, mask;
+		uint32_t rem = data_len & 15;
+
+		last16 = vld1q_u64((const uint64_t *)&data[data_len - 16]);
+		a = shift_bytes_left(fold, 16 - rem);
+		b = shift_bytes_right(fold, rem);
+		mask = shift_bytes_left(vdupq_n_u64(-1), 16 - rem);
+		b = vorrq_u64(b, vandq_u64(mask, last16));
+
+		/* k = rk1 & rk2 */
+		temp = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 1),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 0)));
+		fold = vreinterpretq_u64_p128(vmull_p64(
+				vgetq_lane_p64(vreinterpretq_p64_u64(a), 0),
+				vgetq_lane_p64(vreinterpretq_p64_u64(k), 1)));
+		fold = veorq_u64(fold, temp);
+		fold = veorq_u64(fold, b);
+	}
+
+	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
+reduction_128_64:
+	k = params->rk5_rk6;
+	fold = crcr32_reduce_128_to_64(fold, k);
+
+barret_reduction:
+	k = params->rk7_rk8;
+	n = crcr32_reduce_64_to_32(fold, k);
+
+	return n;
+}
+
+static inline void
+rte_net_crc_neon_init(void)
+{
+	/* Initialize CRC16 data */
+	uint64_t ccitt_k1_k2[2] = {0x189aeLLU, 0x8e10LLU};
+	uint64_t ccitt_k5_k6[2] = {0x189aeLLU, 0x114aaLLU};
+	uint64_t ccitt_k7_k8[2] = {0x11c581910LLU, 0x10811LLU};
+
+	/* Initialize CRC32 data */
+	uint64_t eth_k1_k2[2] = {0xccaa009eLLU, 0x1751997d0LLU};
+	uint64_t eth_k5_k6[2] = {0xccaa009eLLU, 0x163cd6124LLU};
+	uint64_t eth_k7_k8[2] = {0x1f7011640LLU, 0x1db710641LLU};
+
+	/** Save the params in context structure */
+	crc16_ccitt_pmull.rk1_rk2 = vld1q_u64(ccitt_k1_k2);
+	crc16_ccitt_pmull.rk5_rk6 = vld1q_u64(ccitt_k5_k6);
+	crc16_ccitt_pmull.rk7_rk8 = vld1q_u64(ccitt_k7_k8);
+
+	/** Save the params in context structure */
+	crc32_eth_pmull.rk1_rk2 = vld1q_u64(eth_k1_k2);
+	crc32_eth_pmull.rk5_rk6 = vld1q_u64(eth_k5_k6);
+	crc32_eth_pmull.rk7_rk8 = vld1q_u64(eth_k7_k8);
+}
+
+static inline uint32_t
+rte_crc16_ccitt_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return (uint16_t)~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffff,
+		&crc16_ccitt_pmull);
+}
+
+static inline uint32_t
+rte_crc32_eth_neon_handler(const uint8_t *data,
+	uint32_t data_len)
+{
+	return ~crc32_eth_calc_pmull(data,
+		data_len,
+		0xffffffffUL,
+		&crc32_eth_pmull);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _NET_CRC_NEON_H_ */
diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index e8326fe..be65f34 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -43,10 +43,16 @@
 	&& defined(RTE_MACHINE_CPUFLAG_SSE4_2)		\
 	&& defined(RTE_MACHINE_CPUFLAG_PCLMULQDQ)
 #define X86_64_SSE42_PCLMULQDQ     1
+#elif defined(RTE_ARCH_ARM64)
+#if defined(RTE_MACHINE_CPUFLAG_PMULL)
+#define ARM64_NEON_PMULL           1
+#endif
 #endif
 
 #ifdef X86_64_SSE42_PCLMULQDQ
 #include <net_crc_sse.h>
+#elif defined(ARM64_NEON_PMULL)
+#include <net_crc_neon.h>
 #endif
 
 /* crc tables */
@@ -74,6 +80,11 @@ static rte_net_crc_handler handlers_sse42[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_sse42_handler,
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_sse42_handler,
 };
+#elif defined(ARM64_NEON_PMULL)
+static rte_net_crc_handler handlers_neon[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_neon_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
+};
 #endif
 
 /**
@@ -162,14 +173,20 @@ void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
 	switch (alg) {
-	case RTE_NET_CRC_SSE42:
 #ifdef X86_64_SSE42_PCLMULQDQ
+	case RTE_NET_CRC_SSE42:
 		handlers = handlers_sse42;
-#else
-		alg = RTE_NET_CRC_SCALAR;
 		break;
+#elif defined(ARM64_NEON_PMULL)
+	case RTE_NET_CRC_NEON:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+			handlers = handlers_neon;
+			break;
+		}
+		//-fallthrough
 #endif
 	case RTE_NET_CRC_SCALAR:
+		//-fallthrough
 	default:
 		handlers = handlers_scalar;
 		break;
@@ -199,8 +216,13 @@ rte_net_crc_init(void)
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-		alg = RTE_NET_CRC_SSE42;
-		rte_net_crc_sse42_init();
+	alg = RTE_NET_CRC_SSE42;
+	rte_net_crc_sse42_init();
+#elif defined(ARM64_NEON_PMULL)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		alg = RTE_NET_CRC_NEON;
+		rte_net_crc_neon_init();
+	}
 #endif
 
 	rte_net_crc_set_alg(alg);
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 76fd129..1daed30 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -55,6 +55,7 @@ enum rte_net_crc_type {
 enum rte_net_crc_alg {
 	RTE_NET_CRC_SCALAR = 0,
 	RTE_NET_CRC_SSE42,
+	RTE_NET_CRC_NEON,
 };
 
 /**
@@ -66,6 +67,7 @@ enum rte_net_crc_alg {
  *   This parameter is used to select the CRC implementation version.
  *   - RTE_NET_CRC_SCALAR
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
+ *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
diff --git a/mk/machine/armv8a/rte.vars.mk b/mk/machine/armv8a/rte.vars.mk
index d5049e1..51966a5 100644
--- a/mk/machine/armv8a/rte.vars.mk
+++ b/mk/machine/armv8a/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto
diff --git a/mk/machine/thunderx/rte.vars.mk b/mk/machine/thunderx/rte.vars.mk
index ad5a379..6784105 100644
--- a/mk/machine/thunderx/rte.vars.mk
+++ b/mk/machine/thunderx/rte.vars.mk
@@ -55,4 +55,4 @@
 # CPU_LDFLAGS =
 # CPU_ASFLAGS =
 
-MACHINE_CFLAGS += -march=armv8-a+crc -mcpu=thunderx
+MACHINE_CFLAGS += -march=armv8-a+crc+crypto -mcpu=thunderx
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index e634abc..6bbd742 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -119,6 +119,9 @@ ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32
 endif
 
+ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRYPTO),)
+CPUFLAGS += PMULL
+endif
 
 MACHINE_CFLAGS += $(addprefix -DRTE_MACHINE_CPUFLAG_,$(CPUFLAGS))
 
diff --git a/mk/toolchain/gcc/rte.toolchain-compat.mk b/mk/toolchain/gcc/rte.toolchain-compat.mk
index 280dde2..01ac7e2 100644
--- a/mk/toolchain/gcc/rte.toolchain-compat.mk
+++ b/mk/toolchain/gcc/rte.toolchain-compat.mk
@@ -60,6 +60,7 @@ else
 #
 	ifeq ($(shell test $(GCC_VERSION) -le 49 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc,-march=armv8-a+crc -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
+		MACHINE_CFLAGS := $(patsubst -march=armv8-a+crc+crypto,-march=armv8-a+crc+crypto -D__ARM_FEATURE_CRC32=1,$(MACHINE_CFLAGS))
 	endif
 	ifeq ($(shell test $(GCC_VERSION) -le 47 && echo 1), 1)
 		MACHINE_CFLAGS := $(patsubst -march=core-avx-i,-march=corei7-avx,$(MACHINE_CFLAGS))
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-05-03 16:59 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-27 14:10 [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-04-27 14:10 ` [dpdk-dev] [PATCH 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-04-28  9:34 ` [dpdk-dev] [PATCH v2 1/2] net: add arm64 neon version of CRC compute APIs Ashwin Sekhar T K
2017-04-28  9:34   ` [dpdk-dev] [PATCH v2 2/2] test: add tests for arm64 CRC neon versions Ashwin Sekhar T K
2017-04-28  9:55 ` [dpdk-dev] [PATCH 1/2] net: add arm64 neon version of CRC compute APIs Jan Viktorin
2017-04-28 10:19   ` Sekhar, Ashwin
2017-05-03 16:58     ` Jan Viktorin
  -- strict thread matches above, loose matches on Subject: below --
2017-04-27 14:06 Ashwin Sekhar T K

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).