* [PATCH] net: optimize raw checksum computation
@ 2026-01-05 23:27 scott.k.mitch1
2026-01-06 10:59 ` Morten Brørup
0 siblings, 1 reply; 4+ messages in thread
From: scott.k.mitch1 @ 2026-01-05 23:27 UTC (permalink / raw)
To: dev; +Cc: Scott Mitchell
From: Scott Mitchell <scott.k.mitch1@gmail.com>
Optimize __rte_raw_cksum() by processing data in larger unrolled loops
instead of iterating word-by-word. The new implementation processes
64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
32/16/8/4/2-byte chunks.
Uses a uint64_t accumulator to reduce carry propagation overhead and
leverages unaligned_uint16_t for safe unaligned access on all platforms.
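For context (illustrative only, not part of this change): callers such as
rte_raw_cksum() fold the returned 32-bit raw sum into the final 16-bit
Internet checksum roughly like __rte_raw_cksum_reduce() does; fold_raw_sum
below is just an illustrative name:

/* Fold the 32-bit raw sum into a 16-bit ones'-complement checksum. */
static inline uint16_t
fold_raw_sum(uint32_t sum)
{
        sum = ((sum & 0xffff0000) >> 16) + (sum & 0xffff);
        sum = ((sum & 0xffff0000) >> 16) + (sum & 0xffff);
        return (uint16_t)sum;
}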
Performance results from cksum_perf_autotest (TSC cycles/byte):
Block size   Before       After        Improvement
100          0.40-0.64    0.13-0.14    ~3-4x
1500         0.49-0.51    0.10-0.11    ~4-5x
9000         0.48-0.51    0.11-0.12    ~4x
Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
---
app/test/meson.build | 1 +
app/test/test_cksum_fuzz.c | 241 +++++++++++++++++++++++++++++++++++++
app/test/test_cksum_perf.c | 2 +-
lib/net/rte_cksum.h | 63 ++++++++--
4 files changed, 295 insertions(+), 12 deletions(-)
create mode 100644 app/test/test_cksum_fuzz.c
diff --git a/app/test/meson.build b/app/test/meson.build
index efec42a6bf..c92325ad58 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -38,6 +38,7 @@ source_file_deps = {
'test_byteorder.c': [],
'test_cfgfile.c': ['cfgfile'],
'test_cksum.c': ['net'],
+ 'test_cksum_fuzz.c': ['net'],
'test_cksum_perf.c': ['net'],
'test_cmdline.c': [],
'test_cmdline_cirbuf.c': [],
diff --git a/app/test/test_cksum_fuzz.c b/app/test/test_cksum_fuzz.c
new file mode 100644
index 0000000000..cc3c3e71e1
--- /dev/null
+++ b/app/test/test_cksum_fuzz.c
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Apple Inc.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_hexdump.h>
+#include <rte_cksum.h>
+#include <rte_malloc.h>
+#include <rte_random.h>
+
+#include "test.h"
+
+/*
+ * Fuzz test for __rte_raw_cksum optimization.
+ * Compares the optimized implementation against the original reference
+ * implementation across random data of various lengths.
+ */
+
+#define DEFAULT_ITERATIONS 1000
+#define MAX_TEST_LEN 65536 /* 64K to match GRO frame sizes */
+
+/*
+ * Original (reference) implementation of __rte_raw_cksum from DPDK v23.11.
+ * This is retained here for comparison testing against the optimized version.
+ */
+static inline uint32_t
+__rte_raw_cksum_reference(const void *buf, size_t len, uint32_t sum)
+{
+ const void *end;
+
+ for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len, sizeof(uint16_t)));
+ buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
+ uint16_t v;
+
+ memcpy(&v, buf, sizeof(uint16_t));
+ sum += v;
+ }
+
+ /* if length is odd, keeping it byte order independent */
+ if (unlikely(len % 2)) {
+ uint16_t left = 0;
+
+ memcpy(&left, end, 1);
+ sum += left;
+ }
+
+ return sum;
+}
+
+static void
+init_random_buffer(uint8_t *buf, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ buf[i] = (uint8_t)rte_rand();
+}
+
+static inline uint32_t
+get_initial_sum(bool random_initial_sum)
+{
+ return random_initial_sum ? (rte_rand() & 0xFFFFFFFF) : 0;
+}
+
+/*
+ * Test a single buffer length with specific alignment and initial sum
+ */
+static int
+test_cksum_fuzz_length_aligned(size_t len, bool aligned, uint32_t initial_sum)
+{
+ uint8_t *data;
+ uint8_t *buf;
+ size_t alloc_size;
+ uint32_t sum_ref, sum_opt;
+
+ if (len == 0 && !aligned) {
+ /* Skip unaligned test for zero length - nothing to test */
+ return TEST_SUCCESS;
+ }
+
+ /* Allocate exact size for aligned, +1 for unaligned offset */
+ alloc_size = aligned ? len : len + 1;
+ if (alloc_size == 0)
+ alloc_size = 1; /* rte_malloc doesn't like 0 */
+
+ data = rte_malloc(NULL, alloc_size, 64);
+ if (data == NULL) {
+ printf("Failed to allocate %zu bytes\n", alloc_size);
+ return TEST_FAILED;
+ }
+
+ buf = aligned ? data : (data + 1);
+
+ init_random_buffer(buf, len);
+
+ sum_ref = __rte_raw_cksum_reference(buf, len, initial_sum);
+ sum_opt = __rte_raw_cksum(buf, len, initial_sum);
+
+ if (sum_ref != sum_opt) {
+ printf("MISMATCH at len=%zu aligned='%s' initial_sum=0x%08x ref=0x%08x opt=0x%08x\n",
+ len, aligned ? "aligned" : "unaligned",
+ initial_sum, sum_ref, sum_opt);
+ rte_hexdump(stdout, "failing buffer", buf, len);
+ rte_free(data);
+ return TEST_FAILED;
+ }
+
+ rte_free(data);
+ return TEST_SUCCESS;
+}
+
+/*
+ * Test a length with both alignments
+ */
+static int
+test_cksum_fuzz_length(size_t len, uint32_t initial_sum)
+{
+ int rc;
+
+ /* Test aligned */
+ rc = test_cksum_fuzz_length_aligned(len, true, initial_sum);
+ if (rc != TEST_SUCCESS)
+ return rc;
+
+ /* Test unaligned */
+ rc = test_cksum_fuzz_length_aligned(len, false, initial_sum);
+
+ return rc;
+}
+
+/*
+ * Test specific edge case lengths
+ */
+static int
+test_cksum_fuzz_edge_cases(void)
+{
+ /* Edge case lengths that might trigger bugs */
+ static const size_t edge_lengths[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 15, 16, 17,
+ 31, 32, 33,
+ 63, 64, 65,
+ 127, 128, 129,
+ 255, 256, 257,
+ 511, 512, 513,
+ 1023, 1024, 1025,
+ 1500, 1501, /* MTU boundaries */
+ 2047, 2048, 2049,
+ 4095, 4096, 4097,
+ 8191, 8192, 8193,
+ 16383, 16384, 16385,
+ 32767, 32768, 32769,
+ 65534, 65535, 65536 /* 64K GRO boundaries */
+ };
+ unsigned int i;
+ int rc;
+
+ printf("Testing edge case lengths...\n");
+
+ for (i = 0; i < RTE_DIM(edge_lengths); i++) {
+ /* Test with zero initial sum */
+ rc = test_cksum_fuzz_length(edge_lengths[i], 0);
+ if (rc != TEST_SUCCESS)
+ return rc;
+
+ /* Test with random initial sum */
+ rc = test_cksum_fuzz_length(edge_lengths[i], get_initial_sum(true));
+ if (rc != TEST_SUCCESS)
+ return rc;
+ }
+
+ return TEST_SUCCESS;
+}
+
+/*
+ * Test random lengths with optional random initial sums
+ */
+static int
+test_cksum_fuzz_random(unsigned int iterations, bool random_initial_sum)
+{
+ unsigned int i;
+ int rc;
+
+ printf("Testing random lengths (0-%d)%s...\n", MAX_TEST_LEN,
+ random_initial_sum ? " with random initial sums" : "");
+
+ for (i = 0; i < iterations; i++) {
+ size_t len = rte_rand() % (MAX_TEST_LEN + 1);
+
+ rc = test_cksum_fuzz_length(len, get_initial_sum(random_initial_sum));
+ if (rc != TEST_SUCCESS) {
+ printf("Failed at len=%zu\n", len);
+ return rc;
+ }
+ }
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_cksum_fuzz(void)
+{
+ int rc;
+ unsigned int iterations = DEFAULT_ITERATIONS;
+
+ printf("### __rte_raw_cksum optimization fuzz test ###\n");
+ printf("Iterations per test: %u\n\n", iterations);
+
+ /* Test edge cases */
+ rc = test_cksum_fuzz_edge_cases();
+ if (rc != TEST_SUCCESS) {
+ printf("Edge case test FAILED\n");
+ return rc;
+ }
+ printf("Edge case test PASSED\n\n");
+
+ /* Test random lengths with zero initial sum */
+ rc = test_cksum_fuzz_random(iterations, false);
+ if (rc != TEST_SUCCESS) {
+ printf("Random length test FAILED\n");
+ return rc;
+ }
+ printf("Random length test PASSED\n\n");
+
+ /* Test random lengths with random initial sums */
+ rc = test_cksum_fuzz_random(iterations, true);
+ if (rc != TEST_SUCCESS) {
+ printf("Random initial sum test FAILED\n");
+ return rc;
+ }
+ printf("Random initial sum test PASSED\n\n");
+
+ printf("All fuzz tests PASSED!\n");
+ return TEST_SUCCESS;
+}
+
+REGISTER_FAST_TEST(cksum_fuzz_autotest, true, true, test_cksum_fuzz);
diff --git a/app/test/test_cksum_perf.c b/app/test/test_cksum_perf.c
index 0b919cd59f..092332ceba 100644
--- a/app/test/test_cksum_perf.c
+++ b/app/test/test_cksum_perf.c
@@ -15,7 +15,7 @@
#define NUM_BLOCKS 10
#define ITERATIONS 1000000
-static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501 };
+static const size_t data_sizes[] = { 20, 21, 100, 101, 1500, 1501, 9000, 9001 };
static __rte_noinline uint16_t
do_rte_raw_cksum(const void *buf, size_t len)
diff --git a/lib/net/rte_cksum.h b/lib/net/rte_cksum.h
index a8e8927952..0322e69a74 100644
--- a/lib/net/rte_cksum.h
+++ b/lib/net/rte_cksum.h
@@ -42,25 +42,66 @@ extern "C" {
static inline uint32_t
__rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
{
- const void *end;
+ const void *ptr = buf;
+ const void *end = RTE_PTR_ADD(ptr, RTE_ALIGN_FLOOR(len, 64));
+ uint64_t sum64 = sum;
+
+ /* Process in 64 byte blocks (32 x uint16_t). */
+ /* Always process as uint16_t chunks to preserve overflow/carry. */
+ while (ptr != end) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += p16[0] + p16[1] + p16[2] + p16[3] +
+ p16[4] + p16[5] + p16[6] + p16[7] +
+ p16[8] + p16[9] + p16[10] + p16[11] +
+ p16[12] + p16[13] + p16[14] + p16[15] +
+ p16[16] + p16[17] + p16[18] + p16[19] +
+ p16[20] + p16[21] + p16[22] + p16[23] +
+ p16[24] + p16[25] + p16[26] + p16[27] +
+ p16[28] + p16[29] + p16[30] + p16[31];
+ ptr = RTE_PTR_ADD(ptr, 64);
+ }
- for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len, sizeof(uint16_t)));
- buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
- uint16_t v;
+ if ((len & 32) != 0) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += p16[0] + p16[1] + p16[2] + p16[3] +
+ p16[4] + p16[5] + p16[6] + p16[7] +
+ p16[8] + p16[9] + p16[10] + p16[11] +
+ p16[12] + p16[13] + p16[14] + p16[15];
+ ptr = RTE_PTR_ADD(ptr, 32);
+ }
- memcpy(&v, buf, sizeof(uint16_t));
- sum += v;
+ if ((len & 16) != 0) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += p16[0] + p16[1] + p16[2] + p16[3] + p16[4] + p16[5] + p16[6] + p16[7];
+ ptr = RTE_PTR_ADD(ptr, 16);
+ }
+
+ if ((len & 8) != 0) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += p16[0] + p16[1] + p16[2] + p16[3];
+ ptr = RTE_PTR_ADD(ptr, 8);
+ }
+
+ if ((len & 4) != 0) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += p16[0] + p16[1];
+ ptr = RTE_PTR_ADD(ptr, 4);
+ }
+
+ if ((len & 2) != 0) {
+ const unaligned_uint16_t *p16 = (const unaligned_uint16_t *)ptr;
+ sum64 += *p16;
+ ptr = RTE_PTR_ADD(ptr, 2);
}
/* if length is odd, keeping it byte order independent */
- if (unlikely(len % 2)) {
+ if (unlikely(len & 1)) {
uint16_t left = 0;
-
- memcpy(&left, end, 1);
- sum += left;
+ memcpy(&left, ptr, 1);
+ sum64 += left;
}
- return sum;
+ return (uint32_t)sum64;
}
/**
--
2.39.5 (Apple Git-154)
* RE: [PATCH] net: optimize raw checksum computation
2026-01-05 23:27 [PATCH] net: optimize raw checksum computation scott.k.mitch1
@ 2026-01-06 10:59 ` Morten Brørup
2026-01-06 18:16 ` Scott Mitchell
0 siblings, 1 reply; 4+ messages in thread
From: Morten Brørup @ 2026-01-06 10:59 UTC (permalink / raw)
To: Scott Mitchell, dev
> From: Scott Mitchell <scott.k.mitch1@gmail.com>
>
> Optimize __rte_raw_cksum() by processing data in larger unrolled loops
> instead of iterating word-by-word. The new implementation processes
> 64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
> 32/16/8/4/2-byte chunks.
Good idea processing in 64-byte blocks!
I wonder if there would be further gain by 64-byte aligning the 64-byte chunks, so the compiler can use vector instructions for summing the 32 2-byte words of each 64-byte chunk.
This would require a 3-step algorithm:
1. Process the first 0..63 bytes preceding the first 64-byte aligned address. (These bytes are unaligned; nothing new here.)
2. Process 64-byte chunks, if any. These are now 64-byte aligned, and you should ensure that the compiler knows it.
3. Process the last 32/16/8/4/2/1-byte chunks. These are now aligned, which eliminates the need for unaligned_uint16_t in this step. Specifically, the 32-byte chunk will be 64-byte aligned, allowing the compiler to use vector instructions. The 16-byte chunk will be 32-byte aligned. Etc.
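Something along these lines could be a starting point. Rough, untested sketch
(raw_cksum_aligned_sketch is just an illustrative name); it reuses the
existing __rte_raw_cksum() for the head and tail, assumes an even start
address so the 16-bit word phase is preserved, and uses the GCC/Clang
__builtin_assume_aligned() builtin to convey the alignment to the compiler:

#include <stdint.h>
#include <rte_common.h>
#include <rte_cksum.h>

static inline uint32_t
raw_cksum_aligned_sketch(const void *buf, size_t len, uint32_t sum)
{
        /* NB: assumes an even start address; an odd address shifts the
         * 16-bit word phase and would need extra handling. */
        const uint8_t *p = buf;
        size_t head;

        /* Step 1: unaligned head, up to the first 64-byte boundary. */
        head = RTE_PTR_DIFF(RTE_PTR_ALIGN_CEIL(p, 64), p);
        if (head > len)
                head = len;
        sum = __rte_raw_cksum(p, head, sum);
        p += head;
        len -= head;

        /* Step 2: 64-byte aligned blocks; tell the compiler about it. */
        while (len >= 64) {
                const uint16_t *w = __builtin_assume_aligned(p, 64);
                unsigned int i;

                for (i = 0; i < 32; i++)
                        sum += w[i];
                p += 64;
                len -= 64;
        }

        /* Step 3: aligned tail; reuse the existing code for simplicity. */
        return __rte_raw_cksum(p, len, sum);
}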
<random idea>
Step 1 may be performed in reverse order of step 3, i.e. process in chunks of 1/2/4/8/16/32 bytes (using the lowest bits of the address as the condition), which will cause the alignment to increase accordingly.
</random idea>
<feature creep>
Checking the alignment at runtime has a non-zero cost, so an alternative (simpler) code path might be beneficial for small lengths (when the alignment is unknown at runtime).
</feature creep>
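For that, a minimal dispatch along these lines might suffice (again just a
sketch; the 128-byte threshold is a placeholder to be measured, and
raw_cksum_aligned_sketch() refers to the illustrative function above):

static inline uint32_t
raw_cksum_dispatch(const void *buf, size_t len, uint32_t sum)
{
        /* Small buffers: skip the alignment handling entirely. */
        if (len < 128)
                return __rte_raw_cksum(buf, len, sum);
        return raw_cksum_aligned_sketch(buf, len, sum);
}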
>
> Uses uint64_t accumulator to reduce carry propagation overhead
You return (uint32_t)sum64 at the end, so why replace the existing 32-bit "sum" with a 64-bit "sum64" accumulator?
> and
> leverages unaligned_uint16_t for safe unaligned access on all
> platforms.
>
> Performance results from cksum_perf_autotest (TSC cycles/byte):
> Block size Before After Improvement
> 100 0.40-0.64 0.13-0.14 ~3-4x
> 1500 0.49-0.51 0.10-0.11 ~4-5x
> 9000 0.48-0.51 0.11-0.12 ~4x
>
> Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
* Re: [PATCH] net: optimize raw checksum computation
2026-01-06 10:59 ` Morten Brørup
@ 2026-01-06 18:16 ` Scott Mitchell
2026-01-06 19:00 ` Morten Brørup
0 siblings, 1 reply; 4+ messages in thread
From: Scott Mitchell @ 2026-01-06 18:16 UTC (permalink / raw)
To: Morten Brørup; +Cc: dev
On Tue, Jan 6, 2026 at 5:59 AM Morten Brørup <mb@smartsharesystems.com> wrote:
>
> > From: Scott Mitchell <scott.k.mitch1@gmail.com>
> >
> > Optimize __rte_raw_cksum() by processing data in larger unrolled loops
> > instead of iterating word-by-word. The new implementation processes
> > 64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
> > 32/16/8/4/2-byte chunks.
>
> Good idea processing in 64-byte blocks!
>
> I wonder if there would be further gain by 64-byte aligning the 64-byte chunks, so the compiler can use vector instructions for summing the 32 2-byte words of each 64-byte chunk.
> This would require a 3-step algorithm:
> 1. Process the first 0..63 bytes preceding the first 64-byte aligned address. (These bytes are unaligned; nothing new here.)
> 2. Process 64-byte chunks, if any. These are now 64-byte aligned, and you should ensure that the compiler knows it.
> 3. Process the last 32/16/8/4/2/1-byte chunks. These are now aligned, which eliminates the need for unaligned_uint16_t in this step. Specifically, the 32-byte chunk will be 64-byte aligned, allowing the compiler to use vector instructions. The 16-byte chunk will be 32-byte aligned. Etc.
>
> <random idea>
> Step 1 may be performed in reverse order of step 3, i.e. process in chunks of 1/2/4/8/16/32 bytes (using the lowest bits of the address as condition) - which will cause the alignment to increase accordingly.
> </random idea>
>
> <feature creep>
> Checking the alignment at runtime has a non-zero cost, so an alternative (simpler) code path might be beneficial for small lengths (when the alignment is unknown at runtime).
> </feature creep>
>
Good idea! I implemented your suggestion but did not observe a
measurable difference in cksum_perf_autotest. I suggest we proceed
with the approach in this patch as an incremental step, and I can post
a follow-up with your suggestion above to review/discuss. Note that
the checksum computation must process data in 16-bit units for
correctness, which requires special-case handling for odd lengths and
odd buffer addresses, so the complexity/code size is higher.
> >
> > Uses uint64_t accumulator to reduce carry propagation overhead
>
> You return (uint32_t)sum64 at the end, so why replace the existing 32-bit "sum" with a 64-bit "sum64" accumulator?
Good catch. It gives more headroom against overflow, but it is not
necessary, and I will revert.
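For completeness, a quick standalone check (illustrative only, not part of
the patch) showing why truncating at the end matches the existing 32-bit
wrap-around behaviour:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* 64-bit accumulation truncated to 32 bits equals wrapping 32-bit
 * accumulation, because addition is associative modulo 2^32. */
static void
check_sum_equivalence(const uint16_t *words, size_t n, uint32_t init)
{
        uint32_t sum32 = init;
        uint64_t sum64 = init;
        size_t i;

        for (i = 0; i < n; i++) {
                sum32 += words[i];      /* wraps modulo 2^32 */
                sum64 += words[i];      /* effectively never wraps */
        }
        assert(sum32 == (uint32_t)sum64);
}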
>
> > and
> > leverages unaligned_uint16_t for safe unaligned access on all
> > platforms.
> >
> > Performance results from cksum_perf_autotest (TSC cycles/byte):
> > Block size Before After Improvement
> > 100 0.40-0.64 0.13-0.14 ~3-4x
> > 1500 0.49-0.51 0.10-0.11 ~4-5x
> > 9000 0.48-0.51 0.11-0.12 ~4x
> >
> > Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
>
* RE: [PATCH] net: optimize raw checksum computation
2026-01-06 18:16 ` Scott Mitchell
@ 2026-01-06 19:00 ` Morten Brørup
0 siblings, 0 replies; 4+ messages in thread
From: Morten Brørup @ 2026-01-06 19:00 UTC (permalink / raw)
To: Scott Mitchell; +Cc: dev
> From: Scott Mitchell [mailto:scott.k.mitch1@gmail.com]
> Sent: Tuesday, 6 January 2026 19.16
>
> On Tue, Jan 6, 2026 at 5:59 AM Morten Brørup <mb@smartsharesystems.com>
> wrote:
> >
> > > From: Scott Mitchell <scott.k.mitch1@gmail.com>
> > >
> > > Optimize __rte_raw_cksum() by processing data in larger unrolled
> loops
> > > instead of iterating word-by-word. The new implementation processes
> > > 64-byte blocks (32 x uint16_t) in the hot path, followed by smaller
> > > 32/16/8/4/2-byte chunks.
> >
> > Good idea processing in 64-byte blocks!
> >
> > I wonder if there would be further gain by 64-byte aligning the 64-
> byte chunks, so the compiler can use vector instructions for summing
> the 32 2-byte words of each 64-byte chunk.
> > This would require a 3-step algorithm:
> > 1. Process the first 0..63 bytes preceding the first 64-byte aligned
> address. (These bytes are unaligned; nothing new here.)
> > 2. Process 64-byte chunks, if any. These are now 64-byte aligned, and
> you should ensure that the compiler knows it.
> > 3. Process the last 32/16/8/4/2/1-byte chunks. These are now aligned,
> which eliminates the need for unaligned_uint16_t in this step.
> Specifically, the 32-byte chunk will be 64-byte aligned, allowing the
> compiler to use vector instructions. The 16-byte chunk will be 32-byte
> aligned. Etc.
> >
> > <random idea>
> > Step 1 may be performed in reverse order of step 3, i.e. process in
> chunks of 1/2/4/8/16/32 bytes (using the lowest bits of the address as
> condition) - which will cause the alignment to increase accordingly.
> > </random idea>
> >
> > <feature creep>
> > Checking the alignment at runtime has a non-zero cost, so an
> alternative (simpler) code path might be beneficial for small lengths
> (when the alignment is unknown at runtime).
> > </feature creep>
> >
>
> Good idea! I implemented your suggestion but I didn't observe a
> measurable difference in cksum_perf_autotest. I suggest we proceed
> with the approach in this patch as an incremental step and I can post
> a followup with your suggestion above to review/discuss.
Strongly agree to proceed with this patch first.
It brings a big performance benefit, while remaining relatively simple.
Vector-optimized variants can then be experimented with later.
Thanks for trying it out.
> Note the
> checksum computation requires processing in 16 bit blocks for
> correctness which requires special case handling for odd
> length/buffer-address alignment so complexity/code is higher.
Good point. The vector-optimized variant might not be as simple as initially thought.
>
> > >
> > > Uses uint64_t accumulator to reduce carry propagation overhead
> >
> > You return (uint32_t)sum64 at the end, so why replace the existing
> 32-bit "sum" with a 64-bit "sum64" accumulator?
>
> Good catch. It gives more headroom to avoid overflow but not necessary
> and I will revert.
Thanks.
>
> >
> > > and
> > > leverages unaligned_uint16_t for safe unaligned access on all
> > > platforms.
> > >
> > > Performance results from cksum_perf_autotest (TSC cycles/byte):
> > > Block size Before After Improvement
> > > 100 0.40-0.64 0.13-0.14 ~3-4x
> > > 1500 0.49-0.51 0.10-0.11 ~4-5x
> > > 9000 0.48-0.51 0.11-0.12 ~4x
> > >
> > > Signed-off-by: Scott Mitchell <scott.k.mitch1@gmail.com>
> >