DPDK patches and discussions
 help / color / mirror / Atom feed
From: Daniel Gregory <daniel.gregory@bytedance.com>
To: Stanislaw Kardach <stanislaw.kardach@gmail.com>,
	Thomas Monjalon <thomas@monjalon.net>,
	Jasvinder Singh <jasvinder.singh@intel.com>
Cc: dev@dpdk.org, Punit Agrawal <punit.agrawal@bytedance.com>,
	Liang Ma <liangma@liangbit.com>,
	Pengcheng Wang <wangpengcheng.pp@bytedance.com>,
	Chunsong Feng <fengchunsong@bytedance.com>,
	Daniel Gregory <daniel.gregory@bytedance.com>
Subject: [PATCH v2 3/9] net: implement crc using riscv carryless multiply
Date: Fri, 12 Jul 2024 16:46:39 +0100	[thread overview]
Message-ID: <20240712154645.80622-4-daniel.gregory@bytedance.com> (raw)
In-Reply-To: <20240712154645.80622-1-daniel.gregory@bytedance.com>

Using carryless multiply instructions (clmul) from RISC-V's Zbc
extension, implement CRC-32 and CRC-16 calculations on buffers.

Based on the approach described in Intel's whitepaper on "Fast CRC
Computation for Generic Polynomials Using PCLMULQDQ Instruction", we
perform repeated folds-by-1 whilst the buffer is still big enough, then
perform Barrett's reductions on the rest.

Add a case to the crc_autotest suite that tests this implementation.

Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
---
 MAINTAINERS           |   1 +
 app/test/test_crc.c   |   9 ++
 lib/net/meson.build   |   4 +
 lib/net/net_crc.h     |  11 +++
 lib/net/net_crc_zbc.c | 191 ++++++++++++++++++++++++++++++++++++++++++
 lib/net/rte_net_crc.c |  40 +++++++++
 lib/net/rte_net_crc.h |   2 +
 7 files changed, 258 insertions(+)
 create mode 100644 lib/net/net_crc_zbc.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 81f13ebcf2..58fbc51e64 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -319,6 +319,7 @@ F: config/riscv/
 F: doc/guides/linux_gsg/cross_build_dpdk_for_riscv.rst
 F: lib/eal/riscv/
 F: lib/hash/rte_crc_riscv64.h
+F: lib/net/net_crc_zbc.c
 
 Intel x86
 M: Bruce Richardson <bruce.richardson@intel.com>
diff --git a/app/test/test_crc.c b/app/test/test_crc.c
index b85fca35fe..fa91557cf5 100644
--- a/app/test/test_crc.c
+++ b/app/test/test_crc.c
@@ -168,6 +168,15 @@ test_crc(void)
 		return ret;
 	}
 
+	/* set CRC riscv mode */
+	rte_net_crc_set_alg(RTE_NET_CRC_ZBC);
+
+	ret = test_crc_calc();
+	if (ret < 0) {
+		printf("test crc (riscv64 zbc clmul): failed (%d)\n", ret);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/lib/net/meson.build b/lib/net/meson.build
index 0b69138949..404d8dd3ae 100644
--- a/lib/net/meson.build
+++ b/lib/net/meson.build
@@ -125,4 +125,8 @@ elif (dpdk_conf.has('RTE_ARCH_ARM64') and
         cc.get_define('__ARM_FEATURE_CRYPTO', args: machine_args) != '')
     sources += files('net_crc_neon.c')
     cflags += ['-DCC_ARM64_NEON_PMULL_SUPPORT']
+elif (dpdk_conf.has('RTE_ARCH_RISCV') and
+        cc.get_define('RTE_RISCV_FEATURE_ZBC', args: machine_args) != '')
+    sources += files('net_crc_zbc.c')
+    cflags += ['-DCC_RISCV64_ZBC_CLMUL_SUPPORT']
 endif
diff --git a/lib/net/net_crc.h b/lib/net/net_crc.h
index 7a74d5406c..06ae113b47 100644
--- a/lib/net/net_crc.h
+++ b/lib/net/net_crc.h
@@ -42,4 +42,15 @@ rte_crc16_ccitt_neon_handler(const uint8_t *data, uint32_t data_len);
 uint32_t
 rte_crc32_eth_neon_handler(const uint8_t *data, uint32_t data_len);
 
+/* RISCV64 Zbc */
+void
+rte_net_crc_zbc_init(void);
+
+uint32_t
+rte_crc16_ccitt_zbc_handler(const uint8_t *data, uint32_t data_len);
+
+uint32_t
+rte_crc32_eth_zbc_handler(const uint8_t *data, uint32_t data_len);
+
+
 #endif /* _NET_CRC_H_ */
diff --git a/lib/net/net_crc_zbc.c b/lib/net/net_crc_zbc.c
new file mode 100644
index 0000000000..be416ba52f
--- /dev/null
+++ b/lib/net/net_crc_zbc.c
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) ByteDance 2024
+ */
+
+#include <riscv_bitmanip.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_net_crc.h>
+
+#include "net_crc.h"
+
+/* CLMUL CRC computation context structure */
+struct crc_clmul_ctx {
+	uint64_t Pr; /* polynomial P, reversed */
+	uint64_t mu; /* Barrett constant (quotient by P), reversed */
+	uint64_t k3; /* fold constant multiplied with the high word */
+	uint64_t k4; /* fold constant for the low word and the 128->96 bit fold */
+	uint64_t k5; /* fold constant for the 96->64 bit fold */
+};
+
+static struct crc_clmul_ctx crc32_eth_clmul;
+static struct crc_clmul_ctx crc16_ccitt_clmul;
+
+/* Barrett-reduce the low 'bits' (8, 16, 32 or 64) bits of data into crc */
+static inline uint32_t
+crc32_barrett_zbc(
+	const uint64_t data,
+	uint32_t crc,
+	uint32_t bits,
+	const struct crc_clmul_ctx *params)
+{
+	assert((bits == 64) || (bits == 32) || (bits == 16) || (bits == 8));
+
+	/* Fold in the running crc, then left-align the chunk in the 64-bit word */
+	uint64_t temp = (uint64_t)(data ^ crc) << (64 - bits);
+
+	/*
+	 * Multiply by mu, which is 2^96 / P. Division by 2^96 occurs by taking
+	 * the lower 64 bits of the result (remember we're inverted)
+	 */
+	temp = __riscv_clmul_64(temp, params->mu);
+	/* Multiply by P; clmulh keeps the upper 64 bits of the 128-bit product */
+	temp = __riscv_clmulh_64(temp, params->Pr);
+
+	/* Chunk narrower than the 32-bit crc: XOR back the crc bits it did not cover */
+	if (bits == 16 || bits == 8)
+		temp ^= crc >> bits;
+
+	return temp;
+}
+
+/* Fold data_len bytes into crc via Barrett reduction, in 8, 4, 2 then 1-byte chunks */
+static inline uint32_t
+crc32_repeated_barrett_zbc(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_clmul_ctx *params)
+{
+	while (data_len >= 8) { /* consume every whole 64-bit word first */
+		crc = crc32_barrett_zbc(*(const uint64_t *)data, crc, 64, params);
+		data += 8;
+		data_len -= 8;
+	}
+	if (data_len >= 4) { /* fewer than 8 bytes remain from here on */
+		crc = crc32_barrett_zbc(*(const uint32_t *)data, crc, 32, params);
+		data += 4;
+		data_len -= 4;
+	}
+	if (data_len >= 2) {
+		crc = crc32_barrett_zbc(*(const uint16_t *)data, crc, 16, params);
+		data += 2;
+		data_len -= 2;
+	}
+	if (data_len >= 1) /* single trailing byte */
+		crc = crc32_barrett_zbc(*(const uint8_t *)data, crc, 8, params);
+
+	return crc;
+}
+
+/* One fold-by-1 step: scale high by k3 and low by k4, then absorb two more words */
+static inline void
+crc32_reduce_zbc(const uint64_t *data, uint64_t *high, uint64_t *low,
+		 const struct crc_clmul_ctx *params)
+{
+	const uint64_t hi = *high, lo = *low;
+	/* Each 64x64 carry-less product is 128 bits: clmul = lower, clmulh = upper */
+	const uint64_t hi_upper = __riscv_clmulh_64(params->k3, hi);
+	const uint64_t hi_lower = __riscv_clmul_64(params->k3, hi);
+	const uint64_t lo_upper = __riscv_clmulh_64(params->k4, lo);
+	const uint64_t lo_lower = __riscv_clmul_64(params->k4, lo);
+
+	/* Lower halves feed the new high word, upper halves the new low word */
+	*high = hi_lower ^ lo_lower ^ data[0];
+	*low = hi_upper ^ lo_upper ^ data[1];
+}
+
+/* Compute the CRC of data: Barrett head, fold-by-1 over 16-byte blocks, Barrett tail */
+static inline uint32_t
+crc32_eth_calc_zbc(
+	const uint8_t *data,
+	uint32_t data_len,
+	uint32_t crc,
+	const struct crc_clmul_ctx *params)
+{
+	uint64_t high, low;
+	const uint32_t min_len = 16; /* minimum length we can do reduction-by-1 over */
+	/* Bytes before the next 8-byte boundary (0 when data is already aligned) */
+	uint32_t head_len = (8 - ((size_t)data & 7)) & 7;
+	if (head_len != 0 && head_len <= data_len) {
+		crc = crc32_repeated_barrett_zbc(data, head_len, crc, params);
+		data += head_len;
+		data_len -= head_len;
+	}
+
+	if (data_len < min_len)
+		return crc32_repeated_barrett_zbc(data, data_len, crc, params);
+
+	/* Fold buffer into two 8-byte words */
+	high = *((const uint64_t *)data) ^ crc;
+	low = *((const uint64_t *)(data + 8));
+	data += 16;
+	data_len -= 16;
+
+	for (; data_len >= 16; data_len -= 16, data += 16)
+		crc32_reduce_zbc((const uint64_t *)data, &high, &low, params);
+
+	/* Fold last 128 bits into 96 */
+	low = __riscv_clmul_64(params->k4, high) ^ low;
+	high = __riscv_clmulh_64(params->k4, high);
+	/* Upper 32 bits of high are now zero */
+	high = (low >> 32) | (high << 32);
+
+	/* Fold last 96 bits into 64 */
+	low = __riscv_clmul_64(low & 0xffffffff, params->k5);
+	low ^= high;
+
+	/*
+	 * Barrett reduction of remaining 64 bits, using high to store initial
+	 * value of low
+	 */
+	high = low;
+	low = __riscv_clmul_64(low, params->mu);
+	low &= 0xffffffff;
+	low = __riscv_clmul_64(low, params->Pr);
+	crc = (high ^ low) >> 32;
+
+	/* Combine crc with any excess */
+	crc = crc32_repeated_barrett_zbc(data, data_len, crc, params);
+
+	return crc;
+}
+
+/* Fill in the reversed-polynomial constants used by the CRC32 and CRC16 handlers */
+void
+rte_net_crc_zbc_init(void)
+{
+	/* Initialise CRC32 data */
+	crc32_eth_clmul.Pr = 0x1db710641LL; /* polynomial P reversed */
+	crc32_eth_clmul.mu = 0xb4e5b025f7011641LL; /* (2 ^ 64 / P) reversed */
+	crc32_eth_clmul.k3 = 0x1751997d0LL; /* (x^(128+32) mod P << 32) reversed << 1 */
+	crc32_eth_clmul.k4 = 0x0ccaa009eLL; /* (x^(128-32) mod P << 32) reversed << 1 */
+	crc32_eth_clmul.k5 = 0x163cd6124LL; /* (x^64 mod P << 32) reversed << 1 */
+
+	/* Initialise CRC16 data: same calculations as above, with polynomial << 16 */
+	crc16_ccitt_clmul.Pr = 0x10811LL;
+	crc16_ccitt_clmul.mu = 0x859b040b1c581911LL;
+	crc16_ccitt_clmul.k3 = 0x8e10LL;
+	crc16_ccitt_clmul.k4 = 0x189aeLL;
+	crc16_ccitt_clmul.k5 = 0x114aaLL;
+}
+
+uint32_t
+rte_crc16_ccitt_zbc_handler(const uint8_t *data, uint32_t data_len)
+{
+	/* The CRC occupies the lower 16 bits of the result: invert, then truncate */
+	uint32_t crc = crc32_eth_calc_zbc(data, data_len, 0xffff,
+		&crc16_ccitt_clmul);
+
+	return (uint16_t)~crc;
+}
+
+uint32_t
+rte_crc32_eth_zbc_handler(const uint8_t *data, uint32_t data_len)
+{
+	/* Seed with all ones and invert the final remainder */
+	const uint32_t seed = 0xffffffffUL;
+
+	return ~crc32_eth_calc_zbc(data, data_len, seed, &crc32_eth_clmul);
+}
diff --git a/lib/net/rte_net_crc.c b/lib/net/rte_net_crc.c
index 346c285c15..9f04a0cb57 100644
--- a/lib/net/rte_net_crc.c
+++ b/lib/net/rte_net_crc.c
@@ -67,6 +67,12 @@ static const rte_net_crc_handler handlers_neon[] = {
 	[RTE_NET_CRC32_ETH] = rte_crc32_eth_neon_handler,
 };
 #endif
+#ifdef CC_RISCV64_ZBC_CLMUL_SUPPORT
+static const rte_net_crc_handler handlers_zbc[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_zbc_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_zbc_handler,
+};
+#endif
 
 static uint16_t max_simd_bitwidth;
 
@@ -244,6 +250,31 @@ neon_pmull_init(void)
 #endif
 }
 
+/* ZBC/CLMUL handling */
+
+#define ZBC_CLMUL_CPU_SUPPORTED \
+	rte_cpu_get_flag_enabled(RTE_CPUFLAG_RISCV_EXT_ZBC)
+
+static const rte_net_crc_handler *
+zbc_clmul_get_handlers(void)
+{
+#ifdef CC_RISCV64_ZBC_CLMUL_SUPPORT
+	if (ZBC_CLMUL_CPU_SUPPORTED) /* built with Zbc and the CPU flag is enabled */
+		return handlers_zbc;
+#endif /* CC_RISCV64_ZBC_CLMUL_SUPPORT */
+	NET_LOG(INFO, "Requirements not met, can't use Zbc");
+	return NULL; /* no Zbc handlers available */
+}
+
+static void
+zbc_clmul_init(void)
+{
+#ifdef CC_RISCV64_ZBC_CLMUL_SUPPORT
+	if (ZBC_CLMUL_CPU_SUPPORTED) /* only set up the constants when Zbc is usable */
+		rte_net_crc_zbc_init();
+#endif /* CC_RISCV64_ZBC_CLMUL_SUPPORT */
+}
+
 /* Default handling */
 
 static uint32_t
@@ -260,6 +291,9 @@ rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
 	if (handlers != NULL)
 		return handlers[RTE_NET_CRC16_CCITT](data, data_len);
 	handlers = neon_pmull_get_handlers();
+	if (handlers != NULL)
+		return handlers[RTE_NET_CRC16_CCITT](data, data_len);
+	handlers = zbc_clmul_get_handlers();
 	if (handlers != NULL)
 		return handlers[RTE_NET_CRC16_CCITT](data, data_len);
 	handlers = handlers_scalar;
@@ -282,6 +316,9 @@ rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
 	handlers = neon_pmull_get_handlers();
 	if (handlers != NULL)
 		return handlers[RTE_NET_CRC32_ETH](data, data_len);
+	handlers = zbc_clmul_get_handlers();
+	if (handlers != NULL) /* NULL when Zbc is unavailable: fall through to scalar */
+		return handlers[RTE_NET_CRC32_ETH](data, data_len);
 	handlers = handlers_scalar;
 	return handlers[RTE_NET_CRC32_ETH](data, data_len);
 }
@@ -306,6 +342,9 @@ rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 		break; /* for x86, always break here */
 	case RTE_NET_CRC_NEON:
 		handlers = neon_pmull_get_handlers();
+		break;
+	case RTE_NET_CRC_ZBC:
+		handlers = zbc_clmul_get_handlers();
 		/* fall-through */
 	case RTE_NET_CRC_SCALAR:
 		/* fall-through */
@@ -338,4 +377,5 @@ RTE_INIT(rte_net_crc_init)
 	sse42_pclmulqdq_init();
 	avx512_vpclmulqdq_init();
 	neon_pmull_init();
+	zbc_clmul_init();
 }
diff --git a/lib/net/rte_net_crc.h b/lib/net/rte_net_crc.h
index 72d3e10ff6..12fa6a8a02 100644
--- a/lib/net/rte_net_crc.h
+++ b/lib/net/rte_net_crc.h
@@ -24,6 +24,7 @@ enum rte_net_crc_alg {
 	RTE_NET_CRC_SSE42,
 	RTE_NET_CRC_NEON,
 	RTE_NET_CRC_AVX512,
+	RTE_NET_CRC_ZBC,
 };
 
 /**
@@ -37,6 +38,7 @@ enum rte_net_crc_alg {
  *   - RTE_NET_CRC_SSE42 (Use 64-bit SSE4.2 intrinsic)
  *   - RTE_NET_CRC_NEON (Use ARM Neon intrinsic)
  *   - RTE_NET_CRC_AVX512 (Use 512-bit AVX intrinsic)
+ *   - RTE_NET_CRC_ZBC (Use RISC-V Zbc extension)
  */
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg);
-- 
2.39.2


  parent reply	other threads:[~2024-07-12 15:47 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-18 17:41 [PATCH 0/5] riscv: implement accelerated crc using zbc Daniel Gregory
2024-06-18 17:41 ` [PATCH 1/5] config/riscv: add flag for using Zbc extension Daniel Gregory
2024-06-18 20:03   ` Stephen Hemminger
2024-06-19  7:08     ` Morten Brørup
2024-06-19 14:49       ` Stephen Hemminger
2024-06-19 16:41       ` Daniel Gregory
2024-06-18 17:41 ` [PATCH 2/5] hash: implement crc using riscv carryless multiply Daniel Gregory
2024-06-18 17:41 ` [PATCH 3/5] net: " Daniel Gregory
2024-06-18 17:41 ` [PATCH 4/5] examples/l3fwd: use accelerated crc on riscv Daniel Gregory
2024-06-18 17:41 ` [PATCH 5/5] ipfrag: " Daniel Gregory
2024-07-12 15:46 ` [PATCH v2 0/9] riscv: implement accelerated crc using zbc Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 1/9] config/riscv: detect presence of Zbc extension Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 2/9] hash: implement crc using riscv carryless multiply Daniel Gregory
2024-07-12 15:46   ` Daniel Gregory [this message]
2024-07-12 15:46   ` [PATCH v2 4/9] config/riscv: add qemu crossbuild target Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 5/9] examples/l3fwd: use accelerated crc on riscv Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 6/9] ipfrag: " Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 7/9] examples/l3fwd-power: " Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 8/9] hash/cuckoo: " Daniel Gregory
2024-07-12 15:46   ` [PATCH v2 9/9] member: " Daniel Gregory
2024-07-12 17:19   ` [PATCH v2 0/9] riscv: implement accelerated crc using zbc David Marchand
2024-08-27 15:32   ` [PATCH v3 " Daniel Gregory
2024-08-27 15:32     ` [PATCH v3 1/9] config/riscv: detect presence of Zbc extension Daniel Gregory
2024-08-27 15:32     ` [PATCH v3 2/9] hash: implement CRC using riscv carryless multiply Daniel Gregory
2024-08-27 15:32     ` [PATCH v3 3/9] net: " Daniel Gregory
2024-08-27 15:32     ` [PATCH v3 4/9] config/riscv: add qemu crossbuild target Daniel Gregory
2024-08-27 15:36     ` [PATCH v3 5/9] examples/l3fwd: use accelerated CRC on riscv Daniel Gregory
2024-08-27 15:36       ` [PATCH v3 6/9] ipfrag: " Daniel Gregory
2024-08-27 15:36       ` [PATCH v3 7/9] examples/l3fwd-power: " Daniel Gregory
2024-08-27 15:36       ` [PATCH v3 8/9] hash/cuckoo: " Daniel Gregory
2024-08-27 15:36       ` [PATCH v3 9/9] member: " Daniel Gregory
2024-09-17 14:26     ` [PATCH v3 0/9] riscv: implement accelerated crc using zbc Daniel Gregory

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240712154645.80622-4-daniel.gregory@bytedance.com \
    --to=daniel.gregory@bytedance.com \
    --cc=dev@dpdk.org \
    --cc=fengchunsong@bytedance.com \
    --cc=jasvinder.singh@intel.com \
    --cc=liangma@liangbit.com \
    --cc=punit.agrawal@bytedance.com \
    --cc=stanislaw.kardach@gmail.com \
    --cc=thomas@monjalon.net \
    --cc=wangpengcheng.pp@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).