* [PATCH v3] net: optimize raw checksum computation
@ 2026-01-07 17:04 scott.k.mitch1
2026-01-07 17:56 ` Morten Brørup
0 siblings, 1 reply; 5+ messages in thread
From: scott.k.mitch1 @ 2026-01-07 17:04 UTC (permalink / raw)
To: dev; +Cc: mb, Scott Mitchell
From: Scott Mitchell <scott.k.mitch1@gmail.com>
Optimize __rte_raw_cksum() by processing data in larger unrolled loops
instead of iterating word-by-word. The new implementation processes
64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
32/16/8/4/2-byte chunks.
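For example, a 1500-byte buffer is consumed as 23 iterations of the 64-byte
loop (1472 bytes), after which the 16-, 8- and 4-byte tail branches pick up
the remaining 28 bytes; the 32-, 2- and 1-byte branches are skipped.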
Uses a uint32_t accumulator with explicit casts to prevent signed integer
overflow and leverages unaligned_uint16_t for safe unaligned access on
all platforms. Adds the __rte_no_ubsan_alignment attribute to suppress
false-positive alignment warnings from UndefinedBehaviorSanitizer.
Performance results from cksum_perf_autotest (TSC cycles/byte):
Block size    Before       After        Improvement
100           0.40-0.64    0.13-0.14    ~3-4x
1500          0.49-0.51    0.10-0.11    ~4-5x
9000          0.48-0.51    0.11-0.12    ~4x
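Both cksum_perf_autotest and the new cksum_fuzz_autotest (which cross-checks
the optimized code against the previous implementation) can be run from the
dpdk-test binary, e.g. DPDK_TEST=cksum_fuzz_autotest ./dpdk-test.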
Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
---
Changes in v3:
- Added __rte_no_ubsan_alignment macro to suppress false-positive UBSAN
alignment warnings when using unaligned_uint16_t
- Fixed false-positive GCC maybe-uninitialized warning in rte_ip6.h exposed
by the optimization (can be split into a separate patch once verified on CI)
Changes in v2:
- Fixed UndefinedBehaviorSanitizer errors by adding uint32_t casts to prevent
signed integer overflow in addition chains
- Restored uint32_t sum accumulator instead of uint64_t
- Added 9000, 9001, 65536 and 65537 byte lengths to test_cksum_perf.c
app/test/meson.build | 1 +
app/test/test_cksum_fuzz.c | 241 +++++++++++++++++++++++++++++++++++
app/test/test_cksum_perf.c | 2 +-
lib/eal/include/rte_common.h | 9 ++
lib/net/rte_cksum.h | 60 +++++++--
lib/net/rte_ip6.h | 2 +-
6 files changed, 303 insertions(+), 12 deletions(-)
create mode 100644 app/test/test_cksum_fuzz.c
diff --git a/app/test/meson.build b/app/test/meson.build
index efec42a6bf..c92325ad58 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -38,6 +38,7 @@ source_file_deps = {
'test_byteorder.c': [],
'test_cfgfile.c': ['cfgfile'],
'test_cksum.c': ['net'],
+ 'test_cksum_fuzz.c': ['net'],
'test_cksum_perf.c': ['net'],
'test_cmdline.c': [],
'test_cmdline_cirbuf.c': [],
diff --git a/app/test/test_cksum_fuzz.c b/app/test/test_cksum_fuzz.c
new file mode 100644
index 0000000000..cc3c3e71e1
--- /dev/null
+++ b/app/test/test_cksum_fuzz.c
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Apple Inc.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_hexdump.h>
+#include <rte_cksum.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+
+#include "test.h"
+
+/*
+ * Fuzz test for __rte_raw_cksum optimization.
+ * Compares the optimized implementation against the original reference
+ * implementation across random data of various lengths.
+ */
+
+#define DEFAULT_ITERATIONS 1000
+#define MAX_TEST_LEN 65536 /* 64K to match GRO frame sizes */
+
+/*
+ * Original (reference) implementation of __rte_raw_cksum from DPDK v23.11.
+ * This is retained here for comparison testing against the optimized version.
+ */
+static inline uint32_t
+__rte_raw_cksum_reference(const void *buf, size_t len, uint32_t sum)
+{
+ const void *end;
+
+ for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len, sizeof(uint16_t)));
+ buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
+ uint16_t v;
+
+ memcpy(&v, buf, sizeof(uint16_t));
+ sum += v;
+ }
+
+ /* if length is odd, keeping it byte order independent */
+ if (unlikely(len % 2)) {
+ uint16_t left = 0;
+
+ memcpy(&left, end, 1);
+ sum += left;
+ }
+
+ return sum;
+}
+
+static void
+init_random_buffer(uint8_t *buf, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ buf[i] = (uint8_t)rte_rand();
+}
+
+static inline uint32_t
+get_initial_sum(bool random_initial_sum)
+{
+ return random_initial_sum ? (rte_rand() & 0xFFFFFFFF) : 0;
+}
+
+/*
+ * Test a single buffer length with specific alignment and initial sum
+ */
+static int
+test_cksum_fuzz_length_aligned(size_t len, bool aligned, uint32_t initial_sum)
+{
+ uint8_t *data;
+ uint8_t *buf;
+ size_t alloc_size;
+ uint32_t sum_ref, sum_opt;
+
+ if (len == 0 && !aligned) {
+ /* Skip unaligned test for zero length - nothing to test */
+ return TEST_SUCCESS;
+ }
+
+ /* Allocate exact size for aligned, +1 for unaligned offset */
+ alloc_size = aligned ? len : len + 1;
+ if (alloc_size == 0)
+ alloc_size = 1; /* rte_malloc doesn't like 0 */
+
+ data = rte_malloc(NULL, alloc_size, 64);
+ if (data == NULL) {
+ printf("Failed to allocate %zu bytes\n", alloc_size);
+ return TEST_FAILED;
+ }
+
+ buf = aligned ? data : (data + 1);
+
+ init_random_buffer(buf, len);
+
+ sum_ref = __rte_raw_cksum_reference(buf, len, initial_sum);
+ sum_opt = __rte_raw_cksum(buf, len, initial_sum);
+
+ if (sum_ref != sum_opt) {
+ printf("MISMATCH at len=%zu aligned='%s' initial_sum=0x%08x ref=0x%08x opt=0x%08x\n",
+ len, aligned ? "aligned" : "unaligned",
+ initial_sum, sum_ref, sum_opt);
+ rte_hexdump(stdout, "failing buffer", buf, len);
+ rte_free(data);
+ return TEST_FAILED;
+ }
+
+ rte_free(data);
+ return TEST_SUCCESS;
+}
+
+/*
+ * Test a length with both alignments
+ */
+static int
+test_cksum_fuzz_length(size_t len, uint32_t initial_sum)
+{
+ int rc;
+
+ /* Test aligned */
+ rc = test_cksum_fuzz_length_aligned(len, true, initial_sum);
+ if (rc != TEST_SUCCESS)
+ return rc;
+
+ /* Test unaligned */
+ rc = test_cksum_fuzz_length_aligned(len, false, initial_sum);
+
+ return rc;
+}
+
+/*
+ * Test specific edge case lengths
+ */
+static int
+test_cksum_fuzz_edge_cases(void)
+{
+ /* Edge case lengths that might trigger bugs */
+ static const size_t edge_lengths[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 15, 16, 17,
+ 31, 32, 33,
+ 63, 64, 65,
+ 127, 128, 129,
+ 255, 256, 257,
+ 511, 512, 513,
+ 1023, 1024, 1025,
+ 1500, 1501, /* MTU boundaries */
+ 2047, 2048, 2049,
+ 4095, 4096, 4097,
+ 8191, 8192, 8193,
+ 16383, 16384, 16385,
+ 32767, 32768, 32769,
+ 65534, 65535, 65536 /* 64K GRO boundaries */
+ };
+ unsigned int i;
+ int rc;
+
+ printf("Testing edge case lengths...\n");
+
+ for (i = 0; i < RTE_DIM(edge_lengths); i++) {
+ /* Test with zero initial sum */
+ rc = test_cksum_fuzz_length(edge_lengths[i], 0);
+ if (rc != TEST_SUCCESS)
+ return rc;
+
+ /* Test with random initial sum */
+ rc = test_cksum_fuzz_length(edge_lengths[i], get_initial_sum(true));
+ if (rc != TEST_SUCCESS)
+ return rc;
+ }
+
+ return TEST_SUCCESS;
+}
+
+/*
+ * Test random lengths with optional random initial sums
+ */
+static int
+test_cksum_fuzz_random(unsigned int iterations, bool random_initial_sum)
+{
+ unsigned int i;
+ int rc;
+
+ printf("Testing random lengths (0-%d)%s...\n", MAX_TEST_LEN,
+ random_initial_sum ? " with random initial sums" : "");
+
+ for (i = 0; i < iterations; i++) {
+ size_t len = rte_rand() % (MAX_TEST_LEN + 1);
+
+ rc = test_cksum_fuzz_length(len, get_initial_sum(random_initial_sum));
+ if (rc != TEST_SUCCESS) {
+ printf("Failed at len=%zu\n", len);
+ return rc;
+ }
+ }
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_cksum_fuzz(void)
+{
+ int rc;
+ unsigned int iterations = DEFAULT_ITERATIONS;
+
+ printf("### __rte_raw_cksum optimization fuzz test ###\n");
+ printf("Iterations per test: %u\n\n", iterations);
+
+ /* Test edge cases */
+ rc = test_cksum_fuzz_edge_cases();
+ if (rc != TEST_SUCCESS) {
+ printf("Edge case test FAILED\n");
+ return rc;
+ }
+ printf("Edge case test PASSED\n\n");
+
+ /* Test random lengths with zero initial sum */
+ rc = test_cksum_fuzz_random(iterations, false);
+ if (rc != TEST_SUCCESS) {
+ printf("Random length test FAILED\n");
+ return rc;
+ }
+ printf("Random length test PASSED\n\n");
+
+ /* Test random lengths with random initial sums */
+ rc = test_cksum_fuzz_random(iterations, true);
+ if (rc != TEST_SUCCESS) {
+ printf("Random initial sum test FAILED\n");
+ return rc;
+ }
+ printf("Random initial sum test PASSED\n\n");
+
+ printf("All fuzz tests PASSED!\n");
+ return TEST_SUCCESS;
+}
+
+REGISTER_FAST_TEST(cksum_fuzz_autotest, true, true, test_cksum_fuzz);
diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
index 0b919cd59f..6b1d4589e0 100644
--- a/app/test/test_cksum_perf.c
+++ b/app/test/test_cksum_perf.c
@@ -15,7 +15,7 @@
#define NUM_BLOCKS 10
#define ITERATIONS 1000000
-static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
+static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501, 9000, 9001, 65536, 65537 };
static __rte_noinline uint16_t
do_rte_raw_cksum(const void *buf, size_t len)
diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 9e7d84f929..37a36a1b22 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -546,6 +546,15 @@ static void __attribute__((destructor(RTE_PRIO(prio)), used)) func(void)
#define __rte_no_asan
#endif
+/**
+ * Disable UndefinedBehaviorSanitizer alignment checks
+ */
+#if defined(RTE_TOOLCHAIN_GCC) || defined(RTE_TOOLCHAIN_CLANG)
+#define __rte_no_ubsan_alignment __attribute__((no_sanitize("alignment")))
+#else
+#define __rte_no_ubsan_alignment
+#endif
+
/*********** Macros for pointer arithmetic ********/
/**
diff --git a/lib/net/rte_cksum.h b/lib/net/rte_cksum.h
index a8e8927952..d6e313dea5 100644
--- a/lib/net/rte_cksum.h
+++ b/lib/net/rte_cksum.h
@@ -39,24 +39,64 @@ extern "C" {
* @return
* sum += Sum of all words in the buffer.
*/
+__rte_no_ubsan_alignment
static inline uint32_t
__rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
{
- const void *end;
+ /* Process in 64 byte blocks (32 x uint16_t). */
+ /* Always process as uint16_t chunks to preserve overflow/carry. */
+ const void *end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len, 64));
+ while (buf != end) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += (uint32_t)p16[0] + p16[1] + p16[2] + p16[3] +
+ p16[4] + p16[5] + p16[6] + p16[7] +
+ p16[8] + p16[9] + p16[10] + p16[11] +
+ p16[12] + p16[13] + p16[14] + p16[15] +
+ p16[16] + p16[17] + p16[18] + p16[19] +
+ p16[20] + p16[21] + p16[22] + p16[23] +
+ p16[24] + p16[25] + p16[26] + p16[27] +
+ p16[28] + p16[29] + p16[30] + p16[31];
+ buf = RTE_PTR_ADD(buf, 64);
+ }
- for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len, sizeof(uint16_t)));
- buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
- uint16_t v;
+ if (len & 32) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += (uint32_t)p16[0] + p16[1] + p16[2] + p16[3] +
+ p16[4] + p16[5] + p16[6] + p16[7] +
+ p16[8] + p16[9] + p16[10] + p16[11] +
+ p16[12] + p16[13] + p16[14] + p16[15];
+ buf = RTE_PTR_ADD(buf, 32);
+ }
- memcpy(&v, buf, sizeof(uint16_t));
- sum += v;
+ if (len & 16) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += (uint32_t)p16[0] + p16[1] + p16[2] + p16[3] +
+ p16[4] + p16[5] + p16[6] + p16[7];
+ buf = RTE_PTR_ADD(buf, 16);
}
- /* if length is odd, keeping it byte order independent */
- if (unlikely(len % 2)) {
- uint16_t left = 0;
+ if (len & 8) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += (uint32_t)p16[0] + p16[1] + p16[2] + p16[3];
+ buf = RTE_PTR_ADD(buf, 8);
+ }
- memcpy(&left, end, 1);
+ if (len & 4) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += (uint32_t)p16[0] + p16[1];
+ buf = RTE_PTR_ADD(buf, 4);
+ }
+
+ if (len & 2) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)buf;
+ sum += *p16;
+ buf = RTE_PTR_ADD(buf, 2);
+ }
+
+ /* If length is odd use memcpy for byte order independence */
+ if (len & 1) {
+ uint16_t left = 0;
+ memcpy(&left, buf, 1);
sum += left;
}
diff --git a/lib/net/rte_ip6.h b/lib/net/rte_ip6.h
index d1abf1f5d5..af65a39815 100644
--- a/lib/net/rte_ip6.h
+++ b/lib/net/rte_ip6.h
@@ -564,7 +564,7 @@ rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
struct {
rte_be32_t len; /* L4 length. */
rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero */
- } psd_hdr;
+ } psd_hdr = {0}; /* Zero initializer avoids false-positive maybe-uninitialized warning */
psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24);
if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
--
2.39.5 (Apple Git-154)
* RE: [PATCH v3] net: optimize raw checksum computation
2026-01-07 17:04 [PATCH v3] net: optimize raw checksum computation scott.k.mitch1
@ 2026-01-07 17:56 ` Morten Brørup
2026-01-07 22:06 ` Scott Mitchell
2026-01-07 22:28 ` Scott Mitchell
0 siblings, 2 replies; 5+ messages in thread
From: Morten Brørup @ 2026-01-07 17:56 UTC (permalink / raw)
To: scott.k.mitch1, dev
> From: scott.k.mitch1@gmail.com [mailto:scott.k.mitch1@gmail.com]
> Sent: Wednesday, 7 January 2026 18.04
>
> From: Scott Mitchell <scott.k.mitch1@gmail.com>
>
> Optimize __rte_raw_cksum() by processing data in larger unrolled loops
> instead of iterating word-by-word. The new implementation processes
> 64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
> 32/16/8/4/2-byte chunks.
Playing around with Godbolt:
https://godbolt.org/z/oYdP9xxfG
With the original code (built with -msse4.2), the compiler vectorizes the loop to process 16-byte chunks (instead of the 2-byte chunks the source code indicates).
When built with -mavx512f, it processes 32-byte chunks.
IMHO, the compiled output of the new code is too big; more than 12 kB of instructions consumes too much L1 instruction cache.
I suppose the compiler both vectorizes and unrolls the loop.
>
> Uses uint32_t accumulator with explicit casts to prevent signed integer
> overflow and leverages unaligned_uint16_t for safe unaligned access on
> all platforms. Adds __rte_no_ubsan_alignment attribute to suppress false
> positive alignment warnings from UndefinedBehaviorSanitizer.
>
> Performance results from cksum_perf_autotest (TSC cycles/byte):
> Block size    Before       After        Improvement
> 100           0.40-0.64    0.13-0.14    ~3-4x
> 1500          0.49-0.51    0.10-0.11    ~4-5x
> 9000          0.48-0.51    0.11-0.12    ~4x
On which machine do you achieve these perf numbers?
Can a measurable performance increase be achieved using significantly smaller compiled code than this patch?
> [...]
> diff --git a/lib/net/rte_ip6.h b/lib/net/rte_ip6.h
> index d1abf1f5d5..af65a39815 100644
> --- a/lib/net/rte_ip6.h
> +++ b/lib/net/rte_ip6.h
> @@ -564,7 +564,7 @@ rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t ol_flags)
> struct {
> rte_be32_t len; /* L4 length. */
> rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero */
> - } psd_hdr;
> + } psd_hdr = {0}; /* Zero initializer avoids false-positive maybe-uninitialized warning */
>
> psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24);
> if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
Maybe ipv6 can be fixed like this instead:
- if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
- psd_hdr.len = 0;
- else
- psd_hdr.len = ipv6_hdr->payload_len;
+ psd_hdr.len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) ?
+ 0 : ipv6_hdr->payload_len;
* Re: [PATCH v3] net: optimize raw checksum computation
2026-01-07 17:56 ` Morten Brørup
@ 2026-01-07 22:06 ` Scott Mitchell
2026-01-07 22:28 ` Scott Mitchell
1 sibling, 0 replies; 5+ messages in thread
From: Scott Mitchell @ 2026-01-07 22:06 UTC (permalink / raw)
To: Morten Brørup; +Cc: dev
On Wed, Jan 7, 2026 at 12:56 PM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: scott.k.mitch1@gmail.com [mailto:scott.k.mitch1@gmail.com]
> > Sent: Wednesday, 7 January 2026 18.04
> >
> > From: Scott Mitchell <scott.k.mitch1@gmail.com>
> >
> > Optimize __rte_raw_cksum() by processing data in larger unrolled loops
> > instead of iterating word-by-word. The new implementation processes
> > 64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
> > 32/16/8/4/2-byte chunks.
>
> Playing around with Godbolt:
> https://godbolt.org/z/oYdP9xxfG
>
> With the original code (built with -msse4.2), the compiler vectorizes the loop to process 16-byte chunks (instead of the 2-byte chunks the source code indicates).
> When built with -mavx512f, it processes 32-byte chunks.
>
> IMHO, the compiled output of the new code is too big; using more than 12 kB instructions consumes too much L1 Instruction Cache.
> I suppose the compiler both vectorizes and loop unrolls.
Good observation, and godbolt is very handy! Agreed, this patch isn't
desirable on x86-64 with gcc 15.2. I am using clang 18.1.8 (Red Hat),
where the original version doesn't vectorize while my patch does, and
the icache footprint isn't as bloated as with gcc (which explains the
perf difference).
I'm exploring an approach that will vectorize on both gcc and clang
and will submit an update soon.
* Re: [PATCH v3] net: optimize raw checksum computation
2026-01-07 17:56 ` Morten Brørup
2026-01-07 22:06 ` Scott Mitchell
@ 2026-01-07 22:28 ` Scott Mitchell
2026-01-08 0:09 ` Stephen Hemminger
1 sibling, 1 reply; 5+ messages in thread
From: Scott Mitchell @ 2026-01-07 22:28 UTC (permalink / raw)
To: Morten Brørup; +Cc: dev
On Wed, Jan 7, 2026 at 12:56 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> [...]
> Maybe ipv6 can be fixed like this instead:
> - if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
> - psd_hdr.len = 0;
> - else
> - psd_hdr.len = ipv6_hdr->payload_len;
> + psd_hdr.len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) ?
> + 0 : ipv6_hdr->payload_len;
>
(Sorry, I missed this in my last response.) I tried this and a few other
options (compound literal with each field explicitly initialized,
zero/empty initializer); the only change that removed the warning was
an explicit memset(0), but that also modified the generated assembly.
The safest option is to use memset (with some runtime cost); a fallback
is to add the zero initializer "just in case the compiler/target
architecture requires it" and add `#pragma GCC diagnostic ignored
"-Wmaybe-uninitialized"` for now to suppress the warning. I'll push the
second option in my next patch and we can discuss/adjust accordingly.
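To make the fallback concrete, it would look roughly like this (sketch
only; the pragma scope and exact placement may change in the actual patch):

    /* in rte_ipv6_phdr_cksum() */
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
    struct {
        rte_be32_t len;   /* L4 length. */
        rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero */
    } psd_hdr = {0}; /* zero init just in case the compiler/target needs it */

    /* ... existing field assignments and __rte_raw_cksum() calls ... */
    #pragma GCC diagnostic pop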
* Re: [PATCH v3] net: optimize raw checksum computation
2026-01-07 22:28 ` Scott Mitchell
@ 2026-01-08 0:09 ` Stephen Hemminger
0 siblings, 0 replies; 5+ messages in thread
From: Stephen Hemminger @ 2026-01-08 0:09 UTC (permalink / raw)
To: Scott Mitchell; +Cc: Morten Brørup, dev
On Wed, 7 Jan 2026 17:28:41 -0500
Scott Mitchell <scott.k.mitch1@gmail.com> wrote:
> (sorry missed this in my last response). I tried this and a few other
> options (compound literal with each field explicitly initialized,
> zero/empty initializer) the only code solution that removed the
> warning was an explicit memset(0), but this also modified the
> assembly. Safest option is to use memset (with some runtime cost) and
> a fallback is to add the zero initializer "just in case the
> compiler/target-architecture requires it" and add `#pragma GCC
> diagnostic ignored "-Wmaybe-uninitialized"` for now to suppress
> warning. I'll push the second option in my next patch and we can
> discuss/adjust accordingly.
No pragmas please. The compiler is usually right.